aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--net/netfilter/Kconfig15
-rw-r--r--net/netfilter/Makefile15
-rw-r--r--net/netfilter/core.c49
-rw-r--r--net/netfilter/ipset/ip_set_core.c12
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h30
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c34
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c20
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_twos.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c6
-rw-r--r--net/netfilter/nf_conncount.c11
-rw-r--r--net/netfilter/nf_conntrack_acct.c19
-rw-r--r--net/netfilter/nf_conntrack_bpf.c513
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c6
-rw-r--r--net/netfilter/nf_conntrack_core.c652
-rw-r--r--net/netfilter/nf_conntrack_ecache.c225
-rw-r--r--net/netfilter/nf_conntrack_expect.c6
-rw-r--r--net/netfilter/nf_conntrack_extend.c148
-rw-r--r--net/netfilter/nf_conntrack_ftp.c20
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c270
-rw-r--r--net/netfilter/nf_conntrack_helper.c98
-rw-r--r--net/netfilter/nf_conntrack_irc.c51
-rw-r--r--net/netfilter/nf_conntrack_labels.c20
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c5
-rw-r--r--net/netfilter/nf_conntrack_netlink.c257
-rw-r--r--net/netfilter/nf_conntrack_pptp.c60
-rw-r--r--net/netfilter/nf_conntrack_proto.c10
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c9
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c423
-rw-r--r--net/netfilter/nf_conntrack_sane.c68
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c16
-rw-r--r--net/netfilter/nf_conntrack_sip.c13
-rw-r--r--net/netfilter/nf_conntrack_standalone.c21
-rw-r--r--net/netfilter/nf_conntrack_timeout.c71
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c20
-rw-r--r--net/netfilter/nf_dup_netdev.c27
-rw-r--r--net/netfilter/nf_flow_table_core.c172
-rw-r--r--net/netfilter/nf_flow_table_inet.c43
-rw-r--r--net/netfilter/nf_flow_table_ip.c103
-rw-r--r--net/netfilter/nf_flow_table_offload.c64
-rw-r--r--net/netfilter/nf_flow_table_procfs.c80
-rw-r--r--net/netfilter/nf_log.c4
-rw-r--r--net/netfilter/nf_log_syslog.c144
-rw-r--r--net/netfilter/nf_nat_amanda.c14
-rw-r--r--net/netfilter/nf_nat_bpf.c79
-rw-r--r--net/netfilter/nf_nat_core.c49
-rw-r--r--net/netfilter/nf_nat_ftp.c17
-rw-r--r--net/netfilter/nf_nat_helper.c31
-rw-r--r--net/netfilter/nf_nat_irc.c16
-rw-r--r--net/netfilter/nf_nat_masquerade.c9
-rw-r--r--net/netfilter/nf_nat_sip.c14
-rw-r--r--net/netfilter/nf_queue.c36
-rw-r--r--net/netfilter/nf_synproxy_core.c27
-rw-r--r--net/netfilter/nf_tables_api.c853
-rw-r--r--net/netfilter/nf_tables_core.c134
-rw-r--r--net/netfilter/nf_tables_offload.c26
-rw-r--r--net/netfilter/nf_tables_trace.c46
-rw-r--r--net/netfilter/nfnetlink.c88
-rw-r--r--net/netfilter/nfnetlink_cthelper.c10
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c73
-rw-r--r--net/netfilter/nfnetlink_hook.c8
-rw-r--r--net/netfilter/nfnetlink_log.c14
-rw-r--r--net/netfilter/nfnetlink_osf.c4
-rw-r--r--net/netfilter/nfnetlink_queue.c57
-rw-r--r--net/netfilter/nft_bitwise.c172
-rw-r--r--net/netfilter/nft_byteorder.c14
-rw-r--r--net/netfilter/nft_cmp.c161
-rw-r--r--net/netfilter/nft_compat.c10
-rw-r--r--net/netfilter/nft_connlimit.c38
-rw-r--r--net/netfilter/nft_counter.c61
-rw-r--r--net/netfilter/nft_ct.c61
-rw-r--r--net/netfilter/nft_dup_netdev.c7
-rw-r--r--net/netfilter/nft_dynset.c3
-rw-r--r--net/netfilter/nft_exthdr.c141
-rw-r--r--net/netfilter/nft_fib.c46
-rw-r--r--net/netfilter/nft_fib_inet.c1
-rw-r--r--net/netfilter/nft_fib_netdev.c1
-rw-r--r--net/netfilter/nft_flow_offload.c52
-rw-r--r--net/netfilter/nft_fwd_netdev.c17
-rw-r--r--net/netfilter/nft_hash.c36
-rw-r--r--net/netfilter/nft_immediate.c46
-rw-r--r--net/netfilter/nft_last.c70
-rw-r--r--net/netfilter/nft_limit.c194
-rw-r--r--net/netfilter/nft_log.c1
-rw-r--r--net/netfilter/nft_lookup.c12
-rw-r--r--net/netfilter/nft_masq.c3
-rw-r--r--net/netfilter/nft_meta.c60
-rw-r--r--net/netfilter/nft_nat.c5
-rw-r--r--net/netfilter/nft_numgen.c68
-rw-r--r--net/netfilter/nft_objref.c2
-rw-r--r--net/netfilter/nft_osf.c47
-rw-r--r--net/netfilter/nft_payload.c100
-rw-r--r--net/netfilter/nft_queue.c29
-rw-r--r--net/netfilter/nft_quota.c53
-rw-r--r--net/netfilter/nft_range.c28
-rw-r--r--net/netfilter/nft_redir.c3
-rw-r--r--net/netfilter/nft_reject_inet.c1
-rw-r--r--net/netfilter/nft_reject_netdev.c2
-rw-r--r--net/netfilter/nft_rt.c1
-rw-r--r--net/netfilter/nft_set_bitmap.c4
-rw-r--r--net/netfilter/nft_set_hash.c2
-rw-r--r--net/netfilter/nft_set_pipapo.c56
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c4
-rw-r--r--net/netfilter/nft_set_rbtree.c6
-rw-r--r--net/netfilter/nft_socket.c104
-rw-r--r--net/netfilter/nft_synproxy.c5
-rw-r--r--net/netfilter/nft_tproxy.c15
-rw-r--r--net/netfilter/nft_tunnel.c32
-rw-r--r--net/netfilter/nft_xfrm.c36
-rw-r--r--net/netfilter/x_tables.c30
-rw-r--r--net/netfilter/xt_CT.c26
-rw-r--r--net/netfilter/xt_DSCP.c8
-rw-r--r--net/netfilter/xt_RATEEST.c2
-rw-r--r--net/netfilter/xt_TCPMSS.c4
-rw-r--r--net/netfilter/xt_TPROXY.c25
-rw-r--r--net/netfilter/xt_connlimit.c6
-rw-r--r--net/netfilter/xt_hashlimit.c18
-rw-r--r--net/netfilter/xt_recent.c4
-rw-r--r--net/netfilter/xt_socket.c4
-rw-r--r--net/netfilter/xt_statistic.c2
122 files changed, 4955 insertions, 2442 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 3646fc195e7d..4b8d04640ff3 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -144,7 +144,6 @@ config NF_CONNTRACK_ZONES
config NF_CONNTRACK_PROCFS
bool "Supply CT list in procfs (OBSOLETE)"
- default y
depends on PROC_FS
help
This option enables for the list of known conntrack entries
@@ -515,12 +514,6 @@ config NFT_FLOW_OFFLOAD
This option adds the "flow_offload" expression that you can use to
choose what flows are placed into the hardware.
-config NFT_COUNTER
- tristate "Netfilter nf_tables counter module"
- help
- This option adds the "counter" expression that you can use to
- include packet and byte counters in a rule.
-
config NFT_CONNLIMIT
tristate "Netfilter nf_tables connlimit module"
depends on NF_CONNTRACK
@@ -740,6 +733,14 @@ config NF_FLOW_TABLE
To compile it as a module, choose M here.
+config NF_FLOW_TABLE_PROCFS
+ bool "Supply flow table statistics in procfs"
+ depends on NF_FLOW_TABLE
+ depends on PROC_FS
+ help
+ This option enables for the flow table offload statistics
+ to be shown in procfs under net/netfilter/nf_flowtable.
+
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index aab20e575ecd..0f060d100880 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -14,6 +14,11 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+ifeq ($(CONFIG_NF_CONNTRACK),m)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o
+else ifeq ($(CONFIG_NF_CONNTRACK),y)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
+endif
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -55,6 +60,12 @@ obj-$(CONFIG_NF_NAT) += nf_nat.o
nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o
+ifeq ($(CONFIG_NF_NAT),m)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
+else ifeq ($(CONFIG_NF_NAT),y)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o
+endif
+
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -75,7 +86,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \
- nft_chain_route.o nf_tables_offload.o \
+ nft_counter.o nft_chain_route.o nf_tables_offload.o \
nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
nft_set_pipapo.o
@@ -100,7 +111,6 @@ obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o
obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
-obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
obj-$(CONFIG_NFT_REDIR) += nft_redir.o
@@ -124,6 +134,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \
nf_flow_table_offload.o
+nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 6dec9cd395f1..5a6705a0e4ec 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -58,7 +58,7 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
if (num == 0)
return NULL;
- e = kvzalloc(alloc, GFP_KERNEL);
+ e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
if (e)
e->num_hook_entries = num;
return e;
@@ -300,12 +300,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum))
return NULL;
return net->nf.hooks_ipv6 + hooknum;
-#if IS_ENABLED(CONFIG_DECNET)
- case NFPROTO_DECNET:
- if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum))
- return NULL;
- return net->nf.hooks_decnet + hooknum;
-#endif
default:
WARN_ON_ONCE(1);
return NULL;
@@ -428,14 +422,15 @@ static int __nf_register_net_hook(struct net *net, int pf,
p = nf_entry_dereference(*pp);
new_hooks = nf_hook_entries_grow(p, reg);
- if (!IS_ERR(new_hooks))
+ if (!IS_ERR(new_hooks)) {
+ hooks_validate(new_hooks);
rcu_assign_pointer(*pp, new_hooks);
+ }
mutex_unlock(&nf_hook_mutex);
if (IS_ERR(new_hooks))
return PTR_ERR(new_hooks);
- hooks_validate(new_hooks);
#ifdef CONFIG_NETFILTER_INGRESS
if (nf_ingress_hook(reg, pf))
net_inc_ingress_queue();
@@ -621,7 +616,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
case NF_ACCEPT:
break;
case NF_DROP:
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_NETFILTER_DROP);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
@@ -666,32 +662,29 @@ EXPORT_SYMBOL(nf_hook_slow_list);
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
-struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
+const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);
-struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_hook);
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-/* This does not belong here, but locally generated errors need it if connection
- tracking in use: without this, connection may not be in hash table, and hence
- manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
- __rcu __read_mostly;
-EXPORT_SYMBOL(ip_ct_attach);
-
-struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_hook);
+/* This does not belong here, but locally generated errors need it if connection
+ * tracking in use: without this, connection may not be in hash table, and hence
+ * manufactured ICMP or RST packets will not be associated with it.
+ */
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
- void (*attach)(struct sk_buff *, const struct sk_buff *);
+ const struct nf_ct_hook *ct_hook;
if (skb->_nfct) {
rcu_read_lock();
- attach = rcu_dereference(ip_ct_attach);
- if (attach)
- attach(new, skb);
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ct_hook->attach(new, skb);
rcu_read_unlock();
}
}
@@ -699,7 +692,7 @@ EXPORT_SYMBOL(nf_ct_attach);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
@@ -712,7 +705,7 @@ EXPORT_SYMBOL(nf_conntrack_destroy);
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
const struct sk_buff *skb)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
bool ret = false;
rcu_read_lock();
@@ -751,10 +744,6 @@ static int __net_init netfilter_net_init(struct net *net)
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
#endif
-#if IS_ENABLED(CONFIG_DECNET)
- __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
-#endif
-
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 16ae92054baa..e7ba5b6dd2b7 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -353,7 +353,7 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
if (unlikely(!c))
return;
- strlcpy(c->str, ext->comment, len + 1);
+ strscpy(c->str, ext->comment, len + 1);
set->ext_size += sizeof(*c) + strlen(c->str) + 1;
rcu_assign_pointer(comment->c, c);
}
@@ -1072,7 +1072,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
if (!set)
return -ENOMEM;
spin_lock_init(&set->lock);
- strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ strscpy(set->name, name, IPSET_MAXNAMELEN);
set->family = family;
set->revision = revision;
@@ -1719,11 +1719,13 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
skb2 = nlmsg_new(payload, GFP_KERNEL);
if (!skb2)
return -ENOMEM;
- rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ rep = nlmsg_put(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
errmsg = nlmsg_data(rep);
errmsg->error = ret;
- memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len,
+ /* Bounds checked by the skb layer. */);
+
cmdattr = (void *)&errmsg->msg + min_len;
ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr,
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 6e391308431d..3adc291d9ce1 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -42,31 +42,8 @@
#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE)
/* Max muber of elements in the array block when tuned */
#define AHASH_MAX_TUNED 64
-
#define AHASH_MAX(h) ((h)->bucketsize)
-/* Max number of elements can be tuned */
-#ifdef IP_SET_HASH_WITH_MULTI
-static u8
-tune_bucketsize(u8 curr, u32 multi)
-{
- u32 n;
-
- if (multi < curr)
- return curr;
-
- n = curr + AHASH_INIT_SIZE;
- /* Currently, at listing one hash bucket must fit into a message.
- * Therefore we have a hard limit here.
- */
- return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
-}
-#define TUNE_BUCKETSIZE(h, multi) \
- ((h)->bucketsize = tune_bucketsize((h)->bucketsize, multi))
-#else
-#define TUNE_BUCKETSIZE(h, multi)
-#endif
-
/* A hash bucket */
struct hbucket {
struct rcu_head rcu; /* for call_rcu */
@@ -936,7 +913,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
- TUNE_BUCKETSIZE(h, multi);
+#ifdef IP_SET_HASH_WITH_MULTI
+ if (h->bucketsize >= AHASH_MAX_TUNED)
+ goto set_full;
+ else if (h->bucketsize < multi)
+ h->bucketsize += AHASH_INIT_SIZE;
+#endif
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index f9b16f2b2219..fdacbc3c15be 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -599,13 +599,19 @@ static const struct seq_operations ip_vs_app_seq_ops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops,
- sizeof(struct seq_net_private));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net,
+ &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+#endif
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
unregister_ip_vs_app(ipvs, NULL /* all */);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
+#endif
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 2c467c422dc6..13534e02346c 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1265,8 +1265,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
- static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
- static char todrop_counter[9] = {0};
+ static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static signed char todrop_counter[9] = {0};
int i;
/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
@@ -1308,7 +1308,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
- unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+ unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->ipvs != ipvs)
@@ -1447,20 +1447,36 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
atomic_set(&ipvs->conn_count, 0);
- proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
- &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
- proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
- &ip_vs_conn_sync_seq_ops,
- sizeof(struct ip_vs_iter_state));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
+ &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn;
+
+ if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn_sync;
+#endif
+
return 0;
+
+#ifdef CONFIG_PROC_FS
+err_conn_sync:
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+err_conn:
+ return -ENOMEM;
+#endif
}
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
ip_vs_conn_flush(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
+#endif
}
int __init ip_vs_conn_init(void)
@@ -1495,7 +1511,7 @@ int __init ip_vs_conn_init(void)
pr_info("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
ip_vs_conn_tab_size,
- (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
+ (long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024);
IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
sizeof(struct ip_vs_conn));
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 39c523bd775c..988222fff9f0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -960,8 +960,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Create a destination for the given service
*/
static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
- struct ip_vs_dest **dest_p)
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
unsigned int atype, i;
@@ -1021,8 +1020,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
- *dest_p = dest;
-
LeaveFunction(2);
return 0;
@@ -1096,7 +1093,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Allocate and initialize the dest structure
*/
- ret = ip_vs_new_dest(svc, udest, &dest);
+ ret = ip_vs_new_dest(svc, udest);
}
LeaveFunction(2);
@@ -1770,8 +1767,6 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
#ifdef CONFIG_SYSCTL
-static int three = 3;
-
static int
proc_do_defense_mode(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -1980,7 +1975,7 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &three,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "nat_icmp_send",
@@ -2616,7 +2611,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
+ strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2810,13 +2805,13 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
mutex_lock(&ipvs->sync_mutex);
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
+ strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
sizeof(d[0].mcast_ifn));
d[0].syncid = ipvs->mcfg.syncid;
}
if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
+ strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
sizeof(d[1].mcast_ifn));
d[1].syncid = ipvs->bcfg.syncid;
}
@@ -3566,7 +3561,7 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
sizeof(c.mcast_ifn));
c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
@@ -4010,6 +4005,7 @@ static struct genl_family ip_vs_genl_family __ro_after_init = {
.module = THIS_MODULE,
.small_ops = ip_vs_genl_ops,
.n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .resv_start_op = IPVS_CMD_FLUSH + 1,
};
static int __init ip_vs_genl_register(void)
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index da0280cec506..e3d7f5c879ce 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -174,8 +174,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
return 0;
}
- table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
- sizeof(unsigned long), GFP_KERNEL);
+ table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -227,7 +226,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
}
out:
- kfree(table);
+ bitmap_free(table);
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 9d43277b8b4f..a56fd0b5a430 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1280,12 +1280,12 @@ static void set_sock_size(struct sock *sk, int mode, int val)
lock_sock(sk);
if (mode) {
val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
- sysctl_wmem_max);
+ READ_ONCE(sysctl_wmem_max));
sk->sk_sndbuf = val * 2;
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
} else {
val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
- sysctl_rmem_max);
+ READ_ONCE(sysctl_rmem_max));
sk->sk_rcvbuf = val * 2;
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
}
diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c
index acb55d8393ef..f2579fc9c75b 100644
--- a/net/netfilter/ipvs/ip_vs_twos.c
+++ b/net/netfilter/ipvs/ip_vs_twos.c
@@ -71,8 +71,8 @@ static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc,
* from 0 to total_weight
*/
total_weight += 1;
- rweight1 = prandom_u32() % total_weight;
- rweight2 = prandom_u32() % total_weight;
+ rweight1 = prandom_u32_max(total_weight);
+ rweight2 = prandom_u32_max(total_weight);
/* Pick two weighted servers */
list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index d2e5a8f644b8..029171379884 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -610,7 +610,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
nf_reset_ct(skb);
skb_forward_csum(skb);
if (skb->dev)
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
}
return ret;
}
@@ -652,7 +652,7 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
skb_forward_csum(skb);
if (skb->dev)
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -674,7 +674,7 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
if (skb->dev)
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 82f36beb2e76..5d8ed6c90b7e 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -132,6 +132,9 @@ static int __nf_conncount_add(struct net *net,
struct nf_conn *found_ct;
unsigned int collect = 0;
+ if (time_is_after_eq_jiffies((unsigned long)list->last_gc))
+ goto add_new_node;
+
/* check the saved connections */
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
if (collect > CONNCOUNT_GC_MAX_NODES)
@@ -177,6 +180,7 @@ static int __nf_conncount_add(struct net *net,
nf_ct_put(found_ct);
}
+add_new_node:
if (WARN_ON_ONCE(list->count > INT_MAX))
return -EOVERFLOW;
@@ -190,6 +194,7 @@ static int __nf_conncount_add(struct net *net,
conn->jiffies32 = (u32)jiffies;
list_add_tail(&conn->node, &list->head);
list->count++;
+ list->last_gc = (u32)jiffies;
return 0;
}
@@ -214,6 +219,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
spin_lock_init(&list->list_lock);
INIT_LIST_HEAD(&list->head);
list->count = 0;
+ list->last_gc = (u32)jiffies;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
@@ -227,6 +233,10 @@ bool nf_conncount_gc_list(struct net *net,
unsigned int collected = 0;
bool ret = false;
+ /* don't bother if we just did GC */
+ if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc)))
+ return false;
+
/* don't bother if other cpu is already doing GC */
if (!spin_trylock(&list->list_lock))
return false;
@@ -258,6 +268,7 @@ bool nf_conncount_gc_list(struct net *net,
if (!list->count)
ret = true;
+ list->last_gc = (u32)jiffies;
spin_unlock(&list->list_lock);
return ret;
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 91bc8df3e4b0..385a5f458aba 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -22,26 +22,7 @@ static bool nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
-static const struct nf_ct_ext_type acct_extend = {
- .len = sizeof(struct nf_conn_acct),
- .align = __alignof__(struct nf_conn_acct),
- .id = NF_CT_EXT_ACCT,
-};
-
void nf_conntrack_acct_pernet_init(struct net *net)
{
net->ct.sysctl_acct = nf_ct_acct;
}
-
-int nf_conntrack_acct_init(void)
-{
- int ret = nf_ct_extend_register(&acct_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_acct_fini(void)
-{
- nf_ct_extend_unregister(&acct_extend);
-}
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
new file mode 100644
index 000000000000..8639e7efd0e2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Conntrack Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/btf_ids.h>
+#include <linux/net_namespace.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx (xdp_md, __sk_buff)
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - Passed NULL for bpf_tuple pointer
+ * -EINVAL - opts->reserved is not 0
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12)
+ * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ * -ENONET - No network namespace found for netns_id
+ * -ENOENT - Conntrack lookup could not find entry for tuple
+ * -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
+ * or sizeof(tuple->ipv6)
+ * @l4proto - Layer 4 protocol
+ * Values:
+ * IPPROTO_TCP, IPPROTO_UDP
+ * @dir: - connection tracking tuple direction.
+ * @reserved - Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ */
+struct bpf_ct_opts {
+ s32 netns_id;
+ s32 error;
+ u8 l4proto;
+ u8 dir;
+ u8 reserved[2];
+};
+
+enum {
+ NF_BPF_CT_OPTS_SZ = 12,
+};
+
+static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, u8 protonum, u8 dir,
+ struct nf_conntrack_tuple *tuple)
+{
+ union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
+ union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
+ union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
+ : &tuple->src.u;
+ union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
+ : (void *)&tuple->dst.u;
+
+ if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+ return -EPROTO;
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ switch (tuple_len) {
+ case sizeof(bpf_tuple->ipv4):
+ tuple->src.l3num = AF_INET;
+ src->ip = bpf_tuple->ipv4.saddr;
+ sport->tcp.port = bpf_tuple->ipv4.sport;
+ dst->ip = bpf_tuple->ipv4.daddr;
+ dport->tcp.port = bpf_tuple->ipv4.dport;
+ break;
+ case sizeof(bpf_tuple->ipv6):
+ tuple->src.l3num = AF_INET6;
+ memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+ sport->tcp.port = bpf_tuple->ipv6.sport;
+ memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+ dport->tcp.port = bpf_tuple->ipv6.dport;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = dir;
+
+ return 0;
+}
+
+static struct nf_conn *
+__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
+ u32 timeout)
+{
+ struct nf_conntrack_tuple otuple, rtuple;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts_len != NF_BPF_CT_OPTS_SZ)
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &otuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_REPLY, &rtuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple,
+ GFP_ATOMIC);
+ if (IS_ERR(ct))
+ goto out;
+
+ memset(&ct->proto, 0, sizeof(ct->proto));
+ __nf_ct_set_timeout(ct, timeout * HZ);
+
+out:
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return ct;
+}
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+ struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts,
+ u32 opts_len)
+{
+ struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] ||
+ opts_len != NF_BPF_CT_OPTS_SZ)
+ return ERR_PTR(-EINVAL);
+ if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
+ return ERR_PTR(-EPROTO);
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &tuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
+ if (opts->netns_id >= 0)
+ put_net(net);
+ if (!hash)
+ return ERR_PTR(-ENOENT);
+
+ ct = nf_ct_tuplehash_to_ctrack(hash);
+ opts->dir = NF_CT_DIRECTION(hash);
+
+ return ct;
+}
+
+BTF_ID_LIST(btf_nf_conn_ids)
+BTF_ID(struct, nf_conn)
+BTF_ID(struct, nf_conn___init)
+
+/* Check writes into `struct nf_conn` */
+static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
+{
+ const struct btf_type *ncit;
+ const struct btf_type *nct;
+ size_t end;
+
+ ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]);
+ nct = btf_type_by_id(btf, btf_nf_conn_ids[0]);
+
+ if (t != nct && t != ncit) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ /* `struct nf_conn` and `struct nf_conn___init` have the same layout
+ * so we are safe to simply merge offset checks here
+ */
+ switch (off) {
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ case offsetof(struct nf_conn, mark):
+ end = offsetofend(struct nf_conn, mark);
+ break;
+#endif
+ default:
+ bpf_log(log, "no write support to nf_conn at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_conntrack BTF");
+
+/* bpf_xdp_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct nf_conn *nfct;
+
+ nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
+ opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = dev_net(ctx->rxq->dev);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_skb_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn___init *
+bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct nf_conn *nfct;
+ struct net *net;
+
+ net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (12)
+ */
+struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_insert_entry - Add the provided entry into a CT map
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID.
+ *
+ * @nfct - Pointer to referenced nf_conn___init object, obtained
+ * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ */
+struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+{
+ struct nf_conn *nfct = (struct nf_conn *)nfct_i;
+ int err;
+
+ nfct->status |= IPS_CONFIRMED;
+ err = nf_conntrack_hash_check_insert(nfct);
+ if (err < 0) {
+ nf_conntrack_free(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @nf_conn - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+void bpf_ct_release(struct nf_conn *nfct)
+{
+ if (!nfct)
+ return;
+ nf_ct_put(nfct);
+}
+
+/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
+ *
+ * Sets the default timeout of newly allocated nf_conn before insertion.
+ * This helper must be invoked for refcounted pointer to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @timeout - Timeout in msecs.
+ */
+void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+{
+ __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
+ *
+ * Change timeout associated of the inserted or looked up nf_conn.
+ * This helper must be invoked for refcounted pointer to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
+ * @timeout - New timeout in msecs.
+ */
+int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+{
+ return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_set_status - Set status field of allocated nf_conn
+ *
+ * Set the status field of the newly allocated nf_conn before insertion.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @status - New status value.
+ */
+int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+{
+ return nf_ct_change_status_common((struct nf_conn *)nfct, status);
+}
+
+/* bpf_ct_change_status - Change status of inserted nf_conn
+ *
+ * Change the status field of the provided connection tracking entry.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ * @status - New status value.
+ */
+int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+{
+ return nf_ct_change_status_common(nfct, status);
+}
+
+__diag_pop()
+
+BTF_SET8_START(nf_ct_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
+BTF_SET8_END(nf_ct_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ct_kfunc_set,
+};
+
+int register_nf_conntrack_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
+ if (!ret) {
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
+ mutex_unlock(&nf_conn_btf_access_lock);
+ }
+
+ return ret;
+}
+
+void cleanup_nf_conntrack_bpf(void)
+{
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = NULL;
+ mutex_unlock(&nf_conn_btf_access_lock);
+}
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 1ba6becc3079..9fb9b8031298 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -20,6 +20,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
unsigned int timeout)
{
+ const struct nf_conntrack_helper *helper;
struct nf_conntrack_expect *exp;
struct iphdr *iph = ip_hdr(skb);
struct rtable *rt = skb_rtable(skb);
@@ -58,7 +59,10 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
goto out;
exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ exp->tuple.src.u.udp.port = helper->tuple.src.u.udp.port;
exp->mask.src.u3.ip = mask;
exp->mask.src.u.udp.port = htons(0xFFFF);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4712a90a1820..f97bda06d2a9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -34,10 +34,10 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -66,6 +66,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
struct conntrack_gc_work {
struct delayed_work dwork;
u32 next_bucket;
+ u32 avg_timeout;
+ u32 count;
+ u32 start_time;
bool exiting;
bool early_drop;
};
@@ -77,8 +80,21 @@ static __read_mostly bool nf_conntrack_locks_all;
/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);
-#define GC_SCAN_INTERVAL (120u * HZ)
+#define GC_SCAN_INTERVAL_MAX (60ul * HZ)
+#define GC_SCAN_INTERVAL_MIN (1ul * HZ)
+
+/* clamp timeouts to this value (TCP unacked) */
+#define GC_SCAN_INTERVAL_CLAMP (300ul * HZ)
+
+/* Initial bias pretending we have 100 entries at the upper bound so we don't
+ * wakeup often just because we have three entries with a 1s timeout while still
+ * allowing non-idle machines to wakeup more often when needed.
+ */
+#define GC_SCAN_INITIAL_COUNT 100
+#define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX
+
#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
+#define GC_SCAN_EXPIRED_MAX (64000u / HZ)
#define MIN_CHAINLEN 8u
#define MAX_CHAINLEN (32u - MIN_CHAINLEN)
@@ -189,7 +205,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
-static siphash_key_t nf_conntrack_hash_rnd __read_mostly;
+static siphash_aligned_key_t nf_conntrack_hash_rnd;
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
unsigned int zoneid,
@@ -316,20 +332,18 @@ nf_ct_get_tuple(const struct sk_buff *skb,
return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
case IPPROTO_TCP:
- case IPPROTO_UDP: /* fallthrough */
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
+ case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
case IPPROTO_UDPLITE:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
case IPPROTO_SCTP:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
case IPPROTO_DCCP:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
+ /* fallthrough */
+ return nf_ct_get_tuple_ports(skb, dataoff, tuple);
default:
break;
}
@@ -482,7 +496,7 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
*/
u32 nf_ct_get_id(const struct nf_conn *ct)
{
- static __read_mostly siphash_key_t ct_id_seed;
+ static siphash_aligned_key_t ct_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
@@ -512,53 +526,9 @@ clean_from_lists(struct nf_conn *ct)
nf_ct_remove_expectations(ct);
}
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_dying_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) dying list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->dying);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) unconfirmed list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->unconfirmed);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* We overload first tuple to link into unconfirmed or dying list.*/
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- spin_unlock(&pcpu->lock);
-}
-
#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
-/* Released via destroy_conntrack() */
+/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
gfp_t flags)
@@ -585,7 +555,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
tmpl->status = IPS_TEMPLATE;
write_pnet(&tmpl->ct_net, net);
nf_ct_zone_add(tmpl, zone);
- atomic_set(&tmpl->ct_general.use, 0);
+ refcount_set(&tmpl->ct_general.use, 1);
return tmpl;
}
@@ -593,7 +563,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
- nf_ct_ext_destroy(tmpl);
+ kfree(tmpl->ext);
if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -612,13 +582,12 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
#endif
}
-static void
-destroy_conntrack(struct nf_conntrack *nfct)
+void nf_ct_destroy(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- pr_debug("destroy_conntrack(%p)\n", ct);
- WARN_ON(atomic_read(&nfct->use) != 0);
+ pr_debug("%s(%p)\n", __func__, ct);
+ WARN_ON(refcount_read(&nfct->use) != 0);
if (unlikely(nf_ct_is_template(ct))) {
nf_ct_tmpl_free(ct);
@@ -628,7 +597,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
destroy_gre_conntrack(ct);
- local_bh_disable();
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
@@ -636,26 +604,20 @@ destroy_conntrack(struct nf_conntrack *nfct)
*/
nf_ct_remove_expectations(ct);
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
- local_bh_enable();
-
if (ct->master)
nf_ct_put(ct->master);
- pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+ pr_debug("%s: returning ct=%p to slab\n", __func__, ct);
nf_conntrack_free(ct);
}
+EXPORT_SYMBOL(nf_ct_destroy);
-static void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
unsigned int sequence;
- nf_ct_helper_destroy(ct);
-
- local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
@@ -668,12 +630,30 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
clean_from_lists(ct);
nf_conntrack_double_unlock(hash, reply_hash);
+}
- nf_ct_add_to_dying_list(ct);
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
+{
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+
+ __nf_ct_delete_from_lists(ct);
local_bh_enable();
}
+static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));
+
+ spin_lock(&cnet->ecache.dying_lock);
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &cnet->ecache.dying_list);
+ spin_unlock(&cnet->ecache.dying_lock);
+#endif
+}
+
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
@@ -696,7 +676,12 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
/* destroy event was not delivered. nf_ct_put will
* be done by event cache worker on redelivery.
*/
- nf_ct_delete_from_lists(ct);
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+ __nf_ct_delete_from_lists(ct);
+ nf_ct_add_to_ecache_list(ct);
+ local_bh_enable();
+
nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
return false;
}
@@ -742,9 +727,12 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
- if (!atomic_inc_not_zero(&ct->ct_general.use))
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
return;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct))
nf_ct_kill(ct);
@@ -810,7 +798,10 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
* in, try to obtain a reference and re-check tuple
*/
ct = nf_ct_tuplehash_to_ctrack(h);
- if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
+ if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
+ /* re-check key after refcount */
+ smp_acquire__after_ctrl_dep();
+
if (likely(nf_ct_key_equal(h, tuple, zone, net)))
goto found;
@@ -857,6 +848,33 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
&nf_conntrack_hash[reply_hash]);
}
+static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
+{
+ /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
+ * may contain stale pointers to e.g. helper that has been removed.
+ *
+ * The helper can't clear this because the nf_conn object isn't in
+ * any hash and synchronize_rcu() isn't enough because associated skb
+ * might sit in a queue.
+ */
+ return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
+}
+
+static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
+{
+ if (!ext)
+ return true;
+
+ if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
+ return false;
+
+ /* inserted into conntrack table, nf_ct_iterate_cleanup()
+ * will find it. Disable nf_ct_ext_find() id check.
+ */
+ WRITE_ONCE(ext->gen_id, 0);
+ return true;
+}
+
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
@@ -872,6 +890,11 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
zone = nf_ct_zone(ct);
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ return -ETIMEDOUT;
+ }
+
local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
@@ -907,11 +930,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
smp_wmb();
/* The caller holds a reference to this object */
- atomic_set(&ct->ct_general.use, 2);
+ refcount_set(&ct->ct_general.use, 2);
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert);
local_bh_enable();
+
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC(net, drop);
+ return -ETIMEDOUT;
+ }
+
return 0;
chaintoolong:
NF_CT_STAT_INC(net, chaintoolong);
@@ -958,8 +988,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
struct nf_conn_tstamp *tstamp;
- atomic_inc(&ct->ct_general.use);
- ct->status |= IPS_CONFIRMED;
+ refcount_inc(&ct->ct_general.use);
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
@@ -988,8 +1017,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
nf_conntrack_get(&ct->ct_general);
nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_ct_add_to_dying_list(loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_put(loser_ct);
nf_ct_set(skb, ct, ctinfo);
NF_CT_STAT_INC(net, clash_resolve);
@@ -1121,7 +1149,6 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
return ret;
drop:
- nf_ct_add_to_dying_list(loser_ct);
NF_CT_STAT_INC(net, drop);
NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
@@ -1182,16 +1209,20 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_DROP;
}
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ goto dying;
+ }
+
pr_debug("Confirming conntrack %p\n", ct);
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
* further use of that particular connection -JM.
*/
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
+ ct->status |= IPS_CONFIRMED;
if (unlikely(nf_ct_is_dying(ct))) {
- nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
@@ -1215,7 +1246,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
goto out;
if (chainlen++ > max_chainlen) {
chaintoolong:
- nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, chaintoolong);
NF_CT_STAT_INC(net, insert_failed);
ret = NF_DROP;
@@ -1239,6 +1269,16 @@ chaintoolong:
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
+ /* ext area is still valid (rcu read lock is held,
+ * but will go out of scope soon, we need to remove
+ * this conntrack again.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+ }
+
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
@@ -1351,9 +1391,12 @@ static unsigned int early_drop_list(struct net *net,
nf_ct_is_dying(tmp))
continue;
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->ct_net and ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
/* kill only if still in same netns -- might have moved due to
* SLAB_TYPESAFE_BY_RCU rules.
*
@@ -1420,16 +1463,31 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
static void gc_worker(struct work_struct *work)
{
- unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
unsigned int i, hashsz, nf_conntrack_max95 = 0;
- unsigned long next_run = GC_SCAN_INTERVAL;
+ u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
+ unsigned int expired_count = 0;
+ unsigned long next_run;
+ s32 delta_time;
+ long count;
+
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
i = gc_work->next_bucket;
if (gc_work->early_drop)
nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+ if (i == 0) {
+ gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
+ gc_work->count = GC_SCAN_INITIAL_COUNT;
+ gc_work->start_time = start_time;
+ }
+
+ next_run = gc_work->avg_timeout;
+ count = gc_work->count;
+
+ end_time = start_time + GC_SCAN_MAX_DURATION;
+
do {
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
@@ -1447,6 +1505,7 @@ static void gc_worker(struct work_struct *work)
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
struct nf_conntrack_net *cnet;
struct net *net;
+ long expires;
tmp = nf_ct_tuplehash_to_ctrack(h);
@@ -1455,11 +1514,30 @@ static void gc_worker(struct work_struct *work)
continue;
}
+ if (expired_count > GC_SCAN_EXPIRED_MAX) {
+ rcu_read_unlock();
+
+ gc_work->next_bucket = i;
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
+
+ delta_time = nfct_time_stamp - gc_work->start_time;
+
+ /* re-sched immediately if total cycle time is exceeded */
+ next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
+ goto early_exit;
+ }
+
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
+ expired_count++;
continue;
}
+ expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
+ expires = (expires - (long)next_run) / ++count;
+ next_run += expires;
+
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
@@ -1469,16 +1547,21 @@ static void gc_worker(struct work_struct *work)
continue;
/* need to take reference to avoid possible races */
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (gc_worker_skip_ct(tmp)) {
nf_ct_put(tmp);
continue;
}
- if (gc_worker_can_early_drop(tmp))
+ if (gc_worker_can_early_drop(tmp)) {
nf_ct_kill(tmp);
+ expired_count++;
+ }
nf_ct_put(tmp);
}
@@ -1491,33 +1574,39 @@ static void gc_worker(struct work_struct *work)
cond_resched();
i++;
- if (time_after(jiffies, end_time) && i < hashsz) {
+ delta_time = nfct_time_stamp - end_time;
+ if (delta_time > 0 && i < hashsz) {
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
gc_work->next_bucket = i;
next_run = 0;
- break;
+ goto early_exit;
}
} while (i < hashsz);
+ gc_work->next_bucket = 0;
+
+ next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
+
+ delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
+ if (next_run > (unsigned long)delta_time)
+ next_run -= delta_time;
+ else
+ next_run = 1;
+
+early_exit:
if (gc_work->exiting)
return;
- /*
- * Eviction will normally happen from the packet path, and not
- * from this gc worker.
- *
- * This worker is only here to reap expired entries when system went
- * idle after a busy period.
- */
- if (next_run) {
+ if (next_run)
gc_work->early_drop = false;
- gc_work->next_bucket = 0;
- }
+
queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
- INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
+ INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
gc_work->exiting = false;
}
@@ -1562,16 +1651,14 @@ __nf_conntrack_alloc(struct net *net,
ct->status = 0;
WRITE_ONCE(ct->timeout, 0);
write_pnet(&ct->ct_net, net);
- memset(&ct->__nfct_init_offset, 0,
- offsetof(struct nf_conn, proto) -
- offsetof(struct nf_conn, __nfct_init_offset));
+ memset_after(ct, 0, __nfct_init_offset);
nf_ct_zone_add(ct, zone);
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
- atomic_set(&ct->ct_general.use, 0);
+ refcount_set(&ct->ct_general.use, 0);
return ct;
out:
atomic_dec(&cnet->count);
@@ -1596,9 +1683,19 @@ void nf_conntrack_free(struct nf_conn *ct)
/* A freed object has refcnt == 0, that's
* the golden rule for SLAB_TYPESAFE_BY_RCU
*/
- WARN_ON(atomic_read(&ct->ct_general.use) != 0);
+ WARN_ON(refcount_read(&ct->ct_general.use) != 0);
+
+ if (ct->status & IPS_SRC_NAT_DONE) {
+ const struct nf_nat_hook *nat_hook;
+
+ rcu_read_lock();
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (nat_hook)
+ nat_hook->remove_nat_bysrc(ct);
+ rcu_read_unlock();
+ }
- nf_ct_ext_destroy(ct);
+ kfree(ct->ext);
kmem_cache_free(nf_conntrack_cachep, ct);
cnet = nf_ct_pernet(net);
@@ -1619,7 +1716,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct nf_conntrack_ecache *ecache;
+#endif
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
@@ -1652,15 +1751,21 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
- nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
- ecache ? ecache->expmask : 0,
- GFP_ATOMIC);
- local_bh_disable();
+ if ((ecache || net->ct.sysctl_events) &&
+ !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
+ }
+#endif
+
cnet = nf_ct_pernet(net);
if (cnet->expect_count) {
- spin_lock(&nf_conntrack_expect_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
pr_debug("expectation arrives ct=%p exp=%p\n",
@@ -1683,16 +1788,23 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
#endif
NF_CT_STAT_INC(net, expect_new);
}
- spin_unlock(&nf_conntrack_expect_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
- if (!exp)
+ if (!exp && tmpl)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
- /* Now it is inserted into the unconfirmed list, bump refcount */
- nf_conntrack_get(&ct->ct_general);
- nf_ct_add_to_unconfirmed_list(ct);
+ /* Other CPU might have obtained a pointer to this object before it was
+ * released. Because refcount is 0, refcount_inc_not_zero() will fail.
+ *
+ * After refcount_set(1) it will succeed; ensure that zeroing of
+ * ct->status and the correct ct->net pointer are visible; else other
+ * core might observe CONFIRMED bit which means the entry is valid and
+ * in the hash table, but its not (anymore).
+ */
+ smp_wmb();
- local_bh_enable();
+ /* Now it is going to be associated with an sk_buff, set refcount to 1. */
+ refcount_set(&ct->ct_general.use, 1);
if (exp) {
if (exp->expectfn)
@@ -1920,17 +2032,19 @@ repeat:
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
- nf_conntrack_put(&ct->ct_general);
+ nf_ct_put(ct);
skb->_nfct = 0;
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- if (ret == -NF_DROP)
- NF_CT_STAT_INC_ATOMIC(state->net, drop);
/* Special case: TCP tracker reports an attempt to reopen a
* closed/aborted connection. We have to go back and create a
* fresh conntrack.
*/
if (ret == -NF_REPEAT)
goto repeat;
+
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+ if (ret == -NF_DROP)
+ NF_CT_STAT_INC_ATOMIC(state->net, drop);
+
ret = -ret;
goto out;
}
@@ -1962,10 +2076,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
if (ct->master || (help && !hlist_empty(&help->expectations)))
return;
-
- rcu_read_lock();
- __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
@@ -2084,9 +2194,9 @@ static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
+ const struct nf_nat_hook *nat_hook;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- struct nf_nat_hook *nat_hook;
unsigned int status;
int dataoff;
u16 l3num;
@@ -2258,7 +2368,7 @@ static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
- void *data, unsigned int *bucket)
+ const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -2289,7 +2399,12 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
* tuple while iterating.
*/
ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
+
+ if (iter_data->net &&
+ !net_eq(iter_data->net, nf_ct_net(ct)))
+ continue;
+
+ if (iter(ct, iter_data->data))
goto found;
}
spin_unlock(lockp);
@@ -2299,14 +2414,14 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
return NULL;
found:
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
spin_unlock(lockp);
local_bh_enable();
return ct;
}
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
+ const struct nf_ct_iter_data *iter_data)
{
unsigned int bucket = 0;
struct nf_conn *ct;
@@ -2314,91 +2429,28 @@ static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
might_sleep();
mutex_lock(&nf_conntrack_mutex);
- while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
+ while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
/* Time to push up daises... */
- nf_ct_delete(ct, portid, report);
+ nf_ct_delete(ct, iter_data->portid, iter_data->report);
nf_ct_put(ct);
cond_resched();
}
mutex_unlock(&nf_conntrack_mutex);
}
-struct iter_data {
- int (*iter)(struct nf_conn *i, void *data);
- void *data;
- struct net *net;
-};
-
-static int iter_net_only(struct nf_conn *i, void *data)
-{
- struct iter_data *d = data;
-
- if (!net_eq(d->net, nf_ct_net(i)))
- return 0;
-
- return d->iter(i, d->data);
-}
-
-static void
-__nf_ct_unconfirmed_destroy(struct net *net)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct nf_conntrack_tuple_hash *h;
- struct hlist_nulls_node *n;
- struct ct_pcpu *pcpu;
-
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_bh(&pcpu->lock);
- hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
- struct nf_conn *ct;
-
- ct = nf_ct_tuplehash_to_ctrack(h);
-
- /* we cannot call iter() on unconfirmed list, the
- * owning cpu can reallocate ct->ext at any time.
- */
- set_bit(IPS_DYING_BIT, &ct->status);
- }
- spin_unlock_bh(&pcpu->lock);
- cond_resched();
- }
-}
-
-void nf_ct_unconfirmed_destroy(struct net *net)
+void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
+ const struct nf_ct_iter_data *iter_data)
{
+ struct net *net = iter_data->net;
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
might_sleep();
- if (atomic_read(&cnet->count) > 0) {
- __nf_ct_unconfirmed_destroy(net);
- nf_queue_nf_hook_drop(net);
- synchronize_net();
- }
-}
-EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
-
-void nf_ct_iterate_cleanup_net(struct net *net,
- int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
-{
- struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- struct iter_data d;
-
- might_sleep();
-
if (atomic_read(&cnet->count) == 0)
return;
- d.iter = iter;
- d.data = data;
- d.net = net;
-
- nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
+ nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
@@ -2416,6 +2468,7 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
down_read(&net_rwsem);
@@ -2424,37 +2477,47 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
if (atomic_read(&cnet->count) == 0)
continue;
- __nf_ct_unconfirmed_destroy(net);
nf_queue_nf_hook_drop(net);
}
up_read(&net_rwsem);
/* Need to wait for netns cleanup worker to finish, if its
* running -- it might have deleted a net namespace from
- * the global list, so our __nf_ct_unconfirmed_destroy() might
- * not have affected all namespaces.
+ * the global list, so hook drop above might not have
+ * affected all namespaces.
*/
net_ns_barrier();
- /* a conntrack could have been unlinked from unconfirmed list
- * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy().
+ /* a skb w. unconfirmed conntrack could have been reinjected just
+ * before we called nf_queue_nf_hook_drop().
+ *
* This makes sure its inserted into conntrack table.
*/
synchronize_net();
- nf_ct_iterate_cleanup(iter, data, 0, 0);
+ nf_ct_ext_bump_genid();
+ iter_data.data = data;
+ nf_ct_iterate_cleanup(iter, &iter_data);
+
+ /* Another cpu might be in a rcu read section with
+ * rcu protected pointer cleared in iter callback
+ * or hidden via nf_ct_ext_bump_genid() above.
+ *
+ * Wait until those are done.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
static int kill_all(struct nf_conn *i, void *data)
{
- return net_eq(nf_ct_net(i), data);
+ return 1;
}
void nf_conntrack_cleanup_start(void)
{
+ cleanup_nf_conntrack_bpf();
conntrack_gc_work.exiting = true;
- RCU_INIT_POINTER(ip_ct_attach, NULL);
}
void nf_conntrack_cleanup_end(void)
@@ -2464,13 +2527,7 @@ void nf_conntrack_cleanup_end(void)
kvfree(nf_conntrack_hash);
nf_conntrack_proto_fini();
- nf_conntrack_seqadj_fini();
- nf_conntrack_labels_fini();
nf_conntrack_helper_fini();
- nf_conntrack_timeout_fini();
- nf_conntrack_ecache_fini();
- nf_conntrack_tstamp_fini();
- nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2490,8 +2547,9 @@ void nf_conntrack_cleanup_net(struct net *net)
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
- int busy;
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
+ int busy;
/*
* This makes sure all current packets have passed through
@@ -2504,7 +2562,8 @@ i_see_dead_people:
list_for_each_entry(net, net_exit_list, exit_list) {
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- nf_ct_iterate_cleanup(kill_all, net, 0, 0);
+ iter_data.net = net;
+ nf_ct_iterate_cleanup_net(kill_all, &iter_data);
if (atomic_read(&cnet->count) != 0)
busy = 1;
}
@@ -2517,7 +2576,6 @@ i_see_dead_people:
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
- free_percpu(net->ct.pcpu_lists);
}
}
@@ -2590,7 +2648,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = nf_conntrack_htable_size;
old_hash = nf_conntrack_hash;
nf_conntrack_hash = hash;
@@ -2626,36 +2683,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
return nf_conntrack_hash_resize(hashsize);
}
-static __always_inline unsigned int total_extension_size(void)
-{
- /* remember to add new extensions below */
- BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
-
- return sizeof(struct nf_ct_ext) +
- sizeof(struct nf_conn_help)
-#if IS_ENABLED(CONFIG_NF_NAT)
- + sizeof(struct nf_conn_nat)
-#endif
- + sizeof(struct nf_conn_seqadj)
- + sizeof(struct nf_conn_acct)
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- + sizeof(struct nf_conntrack_ecache)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- + sizeof(struct nf_conn_tstamp)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- + sizeof(struct nf_conn_timeout)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
- + sizeof(struct nf_conn_labels)
-#endif
-#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
- + sizeof(struct nf_conn_synproxy)
-#endif
- ;
-};
-
int nf_conntrack_init_start(void)
{
unsigned long nr_pages = totalram_pages();
@@ -2663,9 +2690,6 @@ int nf_conntrack_init_start(void)
int ret = -ENOMEM;
int i;
- /* struct nf_ct_ext uses u8 to store offsets/size */
- BUILD_BUG_ON(total_extension_size() > 255u);
-
seqcount_spinlock_init(&nf_conntrack_generation,
&nf_conntrack_locks_all_lock);
@@ -2710,34 +2734,10 @@ int nf_conntrack_init_start(void)
if (ret < 0)
goto err_expect;
- ret = nf_conntrack_acct_init();
- if (ret < 0)
- goto err_acct;
-
- ret = nf_conntrack_tstamp_init();
- if (ret < 0)
- goto err_tstamp;
-
- ret = nf_conntrack_ecache_init();
- if (ret < 0)
- goto err_ecache;
-
- ret = nf_conntrack_timeout_init();
- if (ret < 0)
- goto err_timeout;
-
ret = nf_conntrack_helper_init();
if (ret < 0)
goto err_helper;
- ret = nf_conntrack_labels_init();
- if (ret < 0)
- goto err_labels;
-
- ret = nf_conntrack_seqadj_init();
- if (ret < 0)
- goto err_seqadj;
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -2745,23 +2745,18 @@ int nf_conntrack_init_start(void)
conntrack_gc_work_init(&conntrack_gc_work);
queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
+ ret = register_nf_conntrack_bpf();
+ if (ret < 0)
+ goto err_kfunc;
+
return 0;
+err_kfunc:
+ cancel_delayed_work_sync(&conntrack_gc_work.dwork);
+ nf_conntrack_proto_fini();
err_proto:
- nf_conntrack_seqadj_fini();
-err_seqadj:
- nf_conntrack_labels_fini();
-err_labels:
nf_conntrack_helper_fini();
err_helper:
- nf_conntrack_timeout_fini();
-err_timeout:
- nf_conntrack_ecache_fini();
-err_ecache:
- nf_conntrack_tstamp_fini();
-err_tstamp:
- nf_conntrack_acct_fini();
-err_acct:
nf_conntrack_expect_fini();
err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2770,16 +2765,15 @@ err_cachep:
return ret;
}
-static struct nf_ct_hook nf_conntrack_hook = {
+static const struct nf_ct_hook nf_conntrack_hook = {
.update = nf_conntrack_update,
- .destroy = destroy_conntrack,
+ .destroy = nf_ct_destroy,
.get_tuple_skb = nf_conntrack_get_tuple_skb,
+ .attach = nf_conntrack_attach,
};
void nf_conntrack_init_end(void)
{
- /* For use by REJECT target */
- RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}
@@ -2787,33 +2781,19 @@ void nf_conntrack_init_end(void)
* We need to use special "null" values, not used in hash table
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
-#define DYING_NULLS_VAL ((1<<30)+1)
int nf_conntrack_init_net(struct net *net)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
int ret = -ENOMEM;
- int cpu;
BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
atomic_set(&cnet->count, 0);
- net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
- if (!net->ct.pcpu_lists)
- goto err_stat;
-
- for_each_possible_cpu(cpu) {
- struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_init(&pcpu->lock);
- INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
- }
-
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat)
- goto err_pcpu_lists;
+ return ret;
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
@@ -2822,15 +2802,67 @@ int nf_conntrack_init_net(struct net *net)
nf_conntrack_acct_pernet_init(net);
nf_conntrack_tstamp_pernet_init(net);
nf_conntrack_ecache_pernet_init(net);
- nf_conntrack_helper_pernet_init(net);
nf_conntrack_proto_pernet_init(net);
return 0;
err_expect:
free_percpu(net->ct.stat);
-err_pcpu_lists:
- free_percpu(net->ct.pcpu_lists);
-err_stat:
return ret;
}
+
+/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+
+int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
+{
+ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return -EPERM;
+
+ __nf_ct_set_timeout(ct, timeout);
+
+ if (test_bit(IPS_DYING_BIT, &ct->status))
+ return -ETIME;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
+
+void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
+{
+ unsigned int bit;
+
+ /* Ignore these unchangable bits */
+ on &= ~IPS_UNCHANGEABLE_MASK;
+ off &= ~IPS_UNCHANGEABLE_MASK;
+
+ for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
+ if (on & (1 << bit))
+ set_bit(bit, &ct->status);
+ else if (off & (1 << bit))
+ clear_bit(bit, &ct->status);
+ }
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_status);
+
+int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
+{
+ unsigned long d;
+
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EBUSY;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EBUSY;
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EBUSY;
+
+ __nf_ct_change_status(ct, status, 0);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 41768ff19464..8698b3424646 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -16,7 +16,6 @@
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
-#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
@@ -29,8 +28,9 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-#define ECACHE_RETRY_WAIT (HZ/10)
-#define ECACHE_STACK_ALLOC (256 / sizeof(void *))
+#define DYING_NULLS_VAL ((1 << 30) + 1)
+#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10)
+#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10)
enum retry_state {
STATE_CONGESTED,
@@ -38,106 +38,100 @@ enum retry_state {
STATE_DONE,
};
-static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
+struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
- struct nf_conn *refs[ECACHE_STACK_ALLOC];
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return &cnet->ecache;
+}
+#if IS_MODULE(CONFIG_NF_CT_NETLINK)
+EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
+#endif
+
+static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
+{
+ unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
+ struct hlist_nulls_head evicted_list;
enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int evicted = 0;
+ unsigned int sent;
+
+ INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);
- spin_lock(&pcpu->lock);
+next:
+ sent = 0;
+ spin_lock_bh(&cnet->ecache.dying_lock);
- hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+ hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
- struct nf_conntrack_ecache *e;
-
- if (!nf_ct_is_confirmed(ct))
- continue;
-
- /* This ecache access is safe because the ct is on the
- * pcpu dying list and we hold the spinlock -- the entry
- * cannot be free'd until after the lock is released.
- *
- * This is true even if ct has a refcount of 0: the
- * cpu that is about to free the entry must remove it
- * from the dying list and needs the lock to do so.
- */
- e = nf_ct_ecache_find(ct);
- if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
- continue;
- /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
- * the worker owns this entry: the ct will remain valid
- * until the worker puts its ct reference.
+ /* The worker owns all entries, ct remains valid until nf_ct_put
+ * in the loop below.
*/
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
}
- e->state = NFCT_ECACHE_DESTROY_SENT;
- refs[evicted] = ct;
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);
- if (++evicted >= ARRAY_SIZE(refs)) {
+ if (time_after(stop, jiffies)) {
ret = STATE_RESTART;
break;
}
+
+ if (sent++ > 16) {
+ spin_unlock_bh(&cnet->ecache.dying_lock);
+ cond_resched();
+ goto next;
+ }
}
- spin_unlock(&pcpu->lock);
+ spin_unlock_bh(&cnet->ecache.dying_lock);
- /* can't _put while holding lock */
- while (evicted)
- nf_ct_put(refs[--evicted]);
+ hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
+ nf_ct_put(ct);
+
+ cond_resched();
+ }
return ret;
}
static void ecache_work(struct work_struct *work)
{
- struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache_dwork.work);
- struct netns_ct *ctnet = cnet->ct_net;
- int cpu, delay = -1;
- struct ct_pcpu *pcpu;
-
- local_bh_disable();
-
- for_each_possible_cpu(cpu) {
- enum retry_state ret;
-
- pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);
-
- ret = ecache_work_evict_list(pcpu);
-
- switch (ret) {
- case STATE_CONGESTED:
- delay = ECACHE_RETRY_WAIT;
- goto out;
- case STATE_RESTART:
- delay = 0;
- break;
- case STATE_DONE:
- break;
- }
+ struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
+ int ret, delay = -1;
+
+ ret = ecache_work_evict_list(cnet);
+ switch (ret) {
+ case STATE_CONGESTED:
+ delay = ECACHE_RETRY_JIFFIES;
+ break;
+ case STATE_RESTART:
+ delay = 0;
+ break;
+ case STATE_DONE:
+ break;
}
- out:
- local_bh_enable();
-
- ctnet->ecache_dwork_pending = delay > 0;
if (delay >= 0)
- schedule_delayed_work(&cnet->ecache_dwork, delay);
+ schedule_delayed_work(&cnet->ecache.dwork, delay);
}
static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
- const unsigned int events,
- const unsigned long missed,
+ const u32 events,
+ const u32 missed,
const struct nf_ct_event *item)
{
- struct nf_conn *ct = item->ct;
struct net *net = nf_ct_net(item->ct);
struct nf_ct_event_notifier *notify;
+ u32 old, want;
int ret;
if (!((events | missed) & e->ctmask))
@@ -157,12 +151,13 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
if (likely(ret >= 0 && missed == 0))
return 0;
- spin_lock_bh(&ct->lock);
- if (ret < 0)
- e->missed |= events;
- else
- e->missed &= ~missed;
- spin_unlock_bh(&ct->lock);
+ do {
+ old = READ_ONCE(e->missed);
+ if (ret < 0)
+ want = old | events;
+ else
+ want = old & ~missed;
+ } while (cmpxchg(&e->missed, old, want) != old);
return ret;
}
@@ -172,7 +167,7 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
{
struct nf_conntrack_ecache *e;
struct nf_ct_event item;
- unsigned long missed;
+ unsigned int missed;
int ret;
if (!nf_ct_is_confirmed(ct))
@@ -198,7 +193,6 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
*/
if (e->portid == 0 && portid != 0)
e->portid = portid;
- e->state = NFCT_ECACHE_DESTROY_FAIL;
}
return ret;
@@ -211,7 +205,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
struct nf_conntrack_ecache *e;
struct nf_ct_event item;
- unsigned long events;
+ unsigned int events;
if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
return;
@@ -292,52 +286,73 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
if (state == NFCT_ECACHE_DESTROY_FAIL &&
- !delayed_work_pending(&cnet->ecache_dwork)) {
- schedule_delayed_work(&cnet->ecache_dwork, HZ);
+ !delayed_work_pending(&cnet->ecache.dwork)) {
+ schedule_delayed_work(&cnet->ecache.dwork, HZ);
net->ct.ecache_dwork_pending = true;
} else if (state == NFCT_ECACHE_DESTROY_SENT) {
- net->ct.ecache_dwork_pending = false;
- mod_delayed_work(system_wq, &cnet->ecache_dwork, 0);
+ if (!hlist_nulls_empty(&cnet->ecache.dying_list))
+ mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
+ else
+ net->ct.ecache_dwork_pending = false;
}
}
-#define NF_CT_EVENTS_DEFAULT 1
-static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *e;
-static const struct nf_ct_ext_type event_extend = {
- .len = sizeof(struct nf_conntrack_ecache),
- .align = __alignof__(struct nf_conntrack_ecache),
- .id = NF_CT_EXT_ECACHE,
-};
+ switch (net->ct.sysctl_events) {
+ case 0:
+ /* assignment via template / ruleset? ignore sysctl. */
+ if (ctmask || expmask)
+ break;
+ return true;
+ case 2: /* autodetect: no event listener, don't allocate extension. */
+ if (!READ_ONCE(net->ct.ctnetlink_has_listener))
+ return true;
+ fallthrough;
+ case 1:
+ /* always allocate an extension. */
+ if (!ctmask && !expmask) {
+ ctmask = ~0;
+ expmask = ~0;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return true;
+ }
-void nf_conntrack_ecache_pernet_init(struct net *net)
-{
- struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
+ if (e) {
+ e->ctmask = ctmask;
+ e->expmask = expmask;
+ }
- net->ct.sysctl_events = nf_ct_events;
- cnet->ct_net = &net->ct;
- INIT_DELAYED_WORK(&cnet->ecache_dwork, ecache_work);
+ return e != NULL;
}
+EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);
-void nf_conntrack_ecache_pernet_fini(struct net *net)
+#define NF_CT_EVENTS_DEFAULT 2
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+
+void nf_conntrack_ecache_pernet_init(struct net *net)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- cancel_delayed_work_sync(&cnet->ecache_dwork);
-}
-
-int nf_conntrack_ecache_init(void)
-{
- int ret = nf_ct_extend_register(&event_extend);
- if (ret < 0)
- pr_err("Unable to register event extension\n");
+ net->ct.sysctl_events = nf_ct_events;
- BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */
+ INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
+ INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
+ spin_lock_init(&cnet->ecache.dying_lock);
- return ret;
+ BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
-void nf_conntrack_ecache_fini(void)
+void nf_conntrack_ecache_pernet_fini(struct net *net)
{
- nf_ct_extend_unregister(&event_extend);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ cancel_delayed_work_sync(&cnet->ecache.dwork);
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index f562eeef4234..96948e98ec53 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
-static siphash_key_t nf_ct_expect_hashrnd __read_mostly;
+static siphash_aligned_key_t nf_ct_expect_hashrnd;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@@ -203,12 +203,12 @@ nf_ct_find_expectation(struct net *net,
* about to invoke ->destroy(), or nf_ct_delete() via timeout
* or early_drop().
*
- * The atomic_inc_not_zero() check tells: If that fails, we
+ * The refcount_inc_not_zero() check tells: If that fails, we
* know that the ct is being destroyed. If it succeeds, we
* can be sure the ct cannot disappear underneath.
*/
if (unlikely(nf_ct_is_dying(exp->master) ||
- !atomic_inc_not_zero(&exp->master->ct_general.use)))
+ !refcount_inc_not_zero(&exp->master->ct_general.use)))
return NULL;
if (exp->flags & NF_CT_EXPECT_PERMANENT) {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 3dbe2329c3f1..0b513f7bf9f3 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -13,40 +13,92 @@
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>
-static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
-static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_nat.h>
+
#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */
-void nf_ct_ext_destroy(struct nf_conn *ct)
+atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1);
+
+static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
+ [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
+#if IS_ENABLED(CONFIG_NF_NAT)
+ [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat),
+#endif
+ [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj),
+ [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct),
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_acct),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_tstamp),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy),
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext),
+#endif
+};
+
+static __always_inline unsigned int total_extension_size(void)
{
- unsigned int i;
- struct nf_ct_ext_type *t;
-
- for (i = 0; i < NF_CT_EXT_NUM; i++) {
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[i]);
-
- /* Here the nf_ct_ext_type might have been unregisterd.
- * I.e., it has responsible to cleanup private
- * area in all conntracks when it is unregisterd.
- */
- if (t && t->destroy)
- t->destroy(ct);
- rcu_read_unlock();
- }
-
- kfree(ct->ext);
+ /* remember to add new extensions below */
+ BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
+
+ return sizeof(struct nf_ct_ext) +
+ sizeof(struct nf_conn_help)
+#if IS_ENABLED(CONFIG_NF_NAT)
+ + sizeof(struct nf_conn_nat)
+#endif
+ + sizeof(struct nf_conn_seqadj)
+ + sizeof(struct nf_conn_acct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ + sizeof(struct nf_conntrack_ecache)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + sizeof(struct nf_conn_tstamp)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ + sizeof(struct nf_conn_timeout)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ + sizeof(struct nf_conn_labels)
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ + sizeof(struct nf_conn_synproxy)
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ + sizeof(struct nf_conn_act_ct_ext)
+#endif
+ ;
}
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int newlen, newoff, oldlen, alloc;
- struct nf_ct_ext_type *t;
struct nf_ct_ext *new;
/* Conntrack must not be confirmed to avoid races on reallocation. */
WARN_ON(nf_ct_is_confirmed(ct));
+ /* struct nf_ct_ext uses u8 to store offsets/size */
+ BUILD_BUG_ON(total_extension_size() > 255u);
if (ct->ext) {
const struct nf_ct_ext *old = ct->ext;
@@ -58,24 +110,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
oldlen = sizeof(*new);
}
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[id]);
- if (!t) {
- rcu_read_unlock();
- return NULL;
- }
-
- newoff = ALIGN(oldlen, t->align);
- newlen = newoff + t->len;
- rcu_read_unlock();
+ newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext));
+ newlen = newoff + nf_ct_ext_type_len[id];
alloc = max(newlen, NF_CT_EXT_PREALLOC);
new = krealloc(ct->ext, alloc, gfp);
if (!new)
return NULL;
- if (!ct->ext)
+ if (!ct->ext) {
memset(new->offset, 0, sizeof(new->offset));
+ new->gen_id = atomic_read(&nf_conntrack_ext_genid);
+ }
new->offset[id] = newoff;
new->len = newlen;
@@ -86,30 +132,28 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
}
EXPORT_SYMBOL(nf_ct_ext_add);
-/* This MUST be called in process context. */
-int nf_ct_extend_register(const struct nf_ct_ext_type *type)
+/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */
+void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id)
{
- int ret = 0;
+ unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid);
+ unsigned int this_id = READ_ONCE(ext->gen_id);
- mutex_lock(&nf_ct_ext_type_mutex);
- if (nf_ct_ext_types[type->id]) {
- ret = -EBUSY;
- goto out;
- }
+ if (!__nf_ct_ext_exist(ext, id))
+ return NULL;
- rcu_assign_pointer(nf_ct_ext_types[type->id], type);
-out:
- mutex_unlock(&nf_ct_ext_type_mutex);
- return ret;
+ if (this_id == 0 || ext->gen_id == gen_id)
+ return (void *)ext + ext->offset[id];
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_register);
+EXPORT_SYMBOL(__nf_ct_ext_find);
-/* This MUST be called in process context. */
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type)
+void nf_ct_ext_bump_genid(void)
{
- mutex_lock(&nf_ct_ext_type_mutex);
- RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
- mutex_unlock(&nf_ct_ext_type_mutex);
- synchronize_rcu();
+ unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid);
+
+ if (value == UINT_MAX)
+ atomic_set(&nf_conntrack_ext_genid, 1);
+
+ msleep(HZ);
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index a414274338cf..617f744a2e3a 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -33,10 +33,6 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
MODULE_ALIAS("ip_conntrack_ftp");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
-
-/* This is slow, but it's simple. --RR */
-static char *ftp_buffer;
-
static DEFINE_SPINLOCK(nf_ftp_lock);
#define MAX_PORTS 8
@@ -398,6 +394,9 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
}
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
if (th == NULL)
return NF_ACCEPT;
@@ -411,12 +410,9 @@ static int help(struct sk_buff *skb,
}
datalen = skb->len - dataoff;
+ /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */
spin_lock_bh(&nf_ftp_lock);
- fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
- if (!fb_ptr) {
- spin_unlock_bh(&nf_ftp_lock);
- return NF_ACCEPT;
- }
+ fb_ptr = skb->data + dataoff;
ends_in_nl = (fb_ptr[datalen - 1] == '\n');
seq = ntohl(th->seq) + datalen;
@@ -571,7 +567,6 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
static void __exit nf_conntrack_ftp_fini(void)
{
nf_conntrack_helpers_unregister(ftp, ports_c * 2);
- kfree(ftp_buffer);
}
static int __init nf_conntrack_ftp_init(void)
@@ -580,10 +575,6 @@ static int __init nf_conntrack_ftp_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master));
- ftp_buffer = kmalloc(65536, GFP_KERNEL);
- if (!ftp_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = FTP_PORT;
@@ -603,7 +594,6 @@ static int __init nf_conntrack_ftp_init(void)
ret = nf_conntrack_helpers_register(ftp, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(ftp_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 2eb31ffb3d14..5a9bce24f3c3 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -34,6 +34,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_conntrack_h323.h>
+#define H323_MAX_SIZE 65535
+
/* Parameters */
static unsigned int default_rrq_ttl __read_mostly = 300;
module_param(default_rrq_ttl, uint, 0600);
@@ -49,64 +51,8 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
"if both endpoints are on different sides "
"(determined by routing information)");
-/* Hooks for NAT */
-int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_sig_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*set_ras_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- __be16 port, __be16 rtp_port,
- struct nf_conntrack_expect *rtp_exp,
- struct nf_conntrack_expect *rtcp_exp) __read_mostly;
-int (*nat_t120_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_h245_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_callforwarding_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_q931_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, TransportAddress *taddr, int idx,
- __be16 port, struct nf_conntrack_expect *exp)
- __read_mostly;
+const struct nfct_h323_nat_hooks __rcu *nfct_h323_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfct_h323_nat_hook);
static DEFINE_SPINLOCK(nf_h323_lock);
static char *h323_buffer;
@@ -142,6 +88,9 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen <= 0) /* No TCP data */
goto clear_out;
+ if (tcpdatalen > H323_MAX_SIZE)
+ tcpdatalen = H323_MAX_SIZE;
+
if (*data == NULL) { /* first TPKT */
/* Get first TPKT pointer */
tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
@@ -259,6 +208,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
@@ -266,7 +216,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
union nf_inet_addr addr;
struct nf_conntrack_expect *rtp_exp;
struct nf_conntrack_expect *rtcp_exp;
- typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
/* Read RTP or RTCP address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -296,15 +245,16 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
IPPROTO_UDP, NULL, &rtcp_port);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
- taddr, port, rtp_port, rtp_exp, rtcp_exp);
+ ret = nathook->nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ taddr, port, rtp_port, rtp_exp, rtcp_exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(rtp_exp, 0) == 0) {
if (nf_ct_expect_related(rtcp_exp, 0) == 0) {
@@ -333,12 +283,12 @@ static int expect_t120(struct sk_buff *skb,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_t120_hook) nat_t120;
/* Read T.120 address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -355,15 +305,16 @@ static int expect_t120(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_t120(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_h323: expect T.120 ");
@@ -664,18 +615,19 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,
return 1;
}
+EXPORT_SYMBOL_GPL(get_h225_addr);
static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int protoff, unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_h245_hook) nat_h245;
/* Read h245Address */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) ||
@@ -692,15 +644,16 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
IPPROTO_TCP, NULL, &port);
exp->helper = &nf_conntrack_helper_h245;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_h245 = rcu_dereference(nat_h245_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_h245(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect H.245 ");
@@ -785,13 +738,13 @@ static int expect_callforwarding(struct sk_buff *skb,
unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
struct net *net = nf_ct_net(ct);
- typeof(nat_callforwarding_hook) nat_callforwarding;
/* Read alternativeAddress */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0)
@@ -815,16 +768,17 @@ static int expect_callforwarding(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->helper = nf_conntrack_helper_q931;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Need NAT */
- ret = nat_callforwarding(skb, ct, ctinfo,
- protoff, data, dataoff,
- taddr, port, exp);
+ ret = nathook->nat_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect Call Forwarding ");
@@ -844,12 +798,12 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
Setup_UUIE *setup)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
int i;
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_q931: Setup\n");
@@ -860,9 +814,9 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
return -1;
}
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->destCallSignalAddress,
&addr, &port) &&
@@ -870,16 +824,16 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.src.u3,
- ct->tuplehash[!dir].tuple.src.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ ct->tuplehash[!dir].tuple.src.u.tcp.port);
if (ret < 0)
return -1;
}
if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
&addr, &port) &&
@@ -887,10 +841,10 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->sourceCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- ct->tuplehash[!dir].tuple.dst.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->sourceCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.dst.u.tcp.port);
if (ret < 0)
return -1;
}
@@ -1220,6 +1174,9 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NULL;
*datalen = skb->len - dataoff;
+ if (*datalen > H323_MAX_SIZE)
+ *datalen = H323_MAX_SIZE;
+
return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
}
@@ -1249,13 +1206,13 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
TransportAddress *taddr, int count)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
int i;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_q931_hook) nat_q931;
/* Look for the first related address */
for (i = 0; i < count; i++) {
@@ -1279,11 +1236,11 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
exp->helper = nf_conntrack_helper_q931;
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
- nat_q931 = rcu_dereference(nat_q931_hook);
- if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) { /* Need NAT */
- ret = nat_q931(skb, ct, ctinfo, protoff, data,
- taddr, i, port, exp);
+ ret = nathook->nat_q931(skb, ct, ctinfo, protoff, data,
+ taddr, i, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
@@ -1305,15 +1262,15 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, GatekeeperRequest *grq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: GRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) /* NATed */
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &grq->rasAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &grq->rasAddress, 1);
return 0;
}
@@ -1367,8 +1324,8 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationRequest *rrq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
pr_debug("nf_ct_ras: RRQ\n");
@@ -1378,12 +1335,12 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
if (ret < 0)
return -1;
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- rrq->rasAddress.item,
- rrq->rasAddress.count);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ rrq->rasAddress.item,
+ rrq->rasAddress.count);
if (ret < 0)
return -1;
}
@@ -1403,19 +1360,19 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationConfirm *rcf)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: RCF\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- rcf->callSignalAddress.item,
- rcf->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ rcf->callSignalAddress.item,
+ rcf->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1454,18 +1411,18 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, UnregistrationRequest *urq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: URQ\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- urq->callSignalAddress.item,
- urq->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ urq->callSignalAddress.item,
+ urq->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1487,39 +1444,42 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, AdmissionRequest *arq)
{
const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_ras: ARQ\n");
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (!nathook)
+ return 0;
+
if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->destCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
port == info->sig_port[dir] &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- set_h225_addr && ct->status & IPS_NAT_MASK) {
+ ct->status & IPS_NAT_MASK) {
/* Answering ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- info->sig_port[!dir]);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir]);
}
if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
- set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Calling ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->srcCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- port);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->srcCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ port);
}
return 0;
@@ -1535,7 +1495,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: ACF\n");
@@ -1544,12 +1503,15 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
return 0;
if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
+ const struct nfct_h323_nat_hooks *nathook;
+
/* Answering ACF */
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_sig_addr(skb, ct, ctinfo, protoff, data,
- &acf->destCallSignalAddress, 1);
+ return nathook->set_sig_addr(skb, ct, ctinfo, protoff,
+ data,
+ &acf->destCallSignalAddress, 1);
return 0;
}
@@ -1578,15 +1540,15 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, LocationRequest *lrq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: LRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &lrq->replyAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &lrq->replyAddress, 1);
return 0;
}
@@ -1634,27 +1596,22 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, InfoRequestResponse *irr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: IRR\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- &irr->rasAddress, 1);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &irr->rasAddress, 1);
if (ret < 0)
return -1;
- }
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- irr->callSignalAddress.item,
- irr->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ irr->callSignalAddress.item,
+ irr->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1821,7 +1778,7 @@ static int __init nf_conntrack_h323_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master));
- h323_buffer = kmalloc(65536, GFP_KERNEL);
+ h323_buffer = kmalloc(H323_MAX_SIZE + 1, GFP_KERNEL);
if (!h323_buffer)
return -ENOMEM;
ret = h323_helper_init();
@@ -1837,17 +1794,6 @@ err1:
module_init(nf_conntrack_h323_init);
module_exit(nf_conntrack_h323_fini);
-EXPORT_SYMBOL_GPL(get_h225_addr);
-EXPORT_SYMBOL_GPL(set_h245_addr_hook);
-EXPORT_SYMBOL_GPL(set_h225_addr_hook);
-EXPORT_SYMBOL_GPL(set_sig_addr_hook);
-EXPORT_SYMBOL_GPL(set_ras_addr_hook);
-EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
-EXPORT_SYMBOL_GPL(nat_t120_hook);
-EXPORT_SYMBOL_GPL(nat_h245_hook);
-EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
-EXPORT_SYMBOL_GPL(nat_q931_hook);
-
MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
MODULE_DESCRIPTION("H.323 connection tracking helper");
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index ae4488a13c70..ff737a76052e 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -35,11 +35,6 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static bool nf_ct_auto_assign_helper __read_mostly = false;
-module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
-MODULE_PARM_DESC(nf_conntrack_helper,
- "Enable automatic conntrack helper assignment (default 0)");
-
static DEFINE_MUTEX(nf_ct_nat_helpers_mutex);
static struct list_head nf_ct_nat_helpers __read_mostly;
@@ -51,24 +46,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
(__force __u16)tuple->src.u.all) % nf_ct_helper_hsize;
}
-static struct nf_conntrack_helper *
-__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
- unsigned int h;
-
- if (!nf_ct_helper_count)
- return NULL;
-
- h = helper_hash(tuple);
- hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {
- if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
- return helper;
- }
- return NULL;
-}
-
struct nf_conntrack_helper *
__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
{
@@ -165,7 +142,7 @@ nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
if (!nat) {
snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name);
rcu_read_unlock();
- request_module(mod_name);
+ request_module("%s", mod_name);
rcu_read_lock();
nat = nf_conntrack_nat_helper_find(mod_name);
@@ -209,33 +186,11 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
-static struct nf_conntrack_helper *
-nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
-{
- struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-
- if (!cnet->sysctl_auto_assign_helper) {
- if (cnet->auto_assign_helper_warned)
- return NULL;
- if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
- return NULL;
- pr_info("nf_conntrack: default automatic helper assignment "
- "has been turned off for security reasons and CT-based "
- "firewall rule not found. Use the iptables CT target "
- "to attach helpers instead.\n");
- cnet->auto_assign_helper_warned = true;
- return NULL;
- }
-
- return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-}
-
int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
gfp_t flags)
{
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
- struct net *net = nf_ct_net(ct);
/* We already got a helper explicitly attached. The function
* nf_conntrack_alter_reply - in case NAT is in use - asks for looking
@@ -246,23 +201,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
if (test_bit(IPS_HELPER_BIT, &ct->status))
return 0;
- if (tmpl != NULL) {
- help = nfct_help(tmpl);
- if (help != NULL) {
- helper = help->helper;
- set_bit(IPS_HELPER_BIT, &ct->status);
- }
+ if (WARN_ON_ONCE(!tmpl))
+ return 0;
+
+ help = nfct_help(tmpl);
+ if (help != NULL) {
+ helper = rcu_dereference(help->helper);
+ set_bit(IPS_HELPER_BIT, &ct->status);
}
help = nfct_help(ct);
if (helper == NULL) {
- helper = nf_ct_lookup_helper(ct, net);
- if (helper == NULL) {
- if (help)
- RCU_INIT_POINTER(help->helper, NULL);
- return 0;
- }
+ if (help)
+ RCU_INIT_POINTER(help->helper, NULL);
+ return 0;
}
if (help == NULL) {
@@ -468,11 +421,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
nf_ct_iterate_destroy(unhelp, me);
-
- /* Maybe someone has gotten the helper already when unhelp above.
- * So need to wait it.
- */
- synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
@@ -550,43 +498,19 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat)
}
EXPORT_SYMBOL_GPL(nf_nat_helper_unregister);
-static const struct nf_ct_ext_type helper_extend = {
- .len = sizeof(struct nf_conn_help),
- .align = __alignof__(struct nf_conn_help),
- .id = NF_CT_EXT_HELPER,
-};
-
-void nf_conntrack_helper_pernet_init(struct net *net)
-{
- struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-
- cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
-}
-
int nf_conntrack_helper_init(void)
{
- int ret;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
nf_ct_helper_hash =
nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
- ret = nf_ct_extend_register(&helper_extend);
- if (ret < 0) {
- pr_err("nf_ct_helper: Unable to register helper extension.\n");
- goto out_extend;
- }
-
INIT_LIST_HEAD(&nf_ct_nat_helpers);
return 0;
-out_extend:
- kvfree(nf_ct_helper_hash);
- return ret;
}
void nf_conntrack_helper_fini(void)
{
- nf_ct_extend_unregister(&helper_extend);
kvfree(nf_ct_helper_hash);
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 08ee4e760a3d..5703846bea3b 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -39,6 +39,7 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
#define HELPER_NAME "irc"
+#define MAX_SEARCH_SIZE 4095
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
@@ -121,6 +122,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
int i, ret = NF_ACCEPT;
char *addr_beg_p, *addr_end_p;
typeof(nf_nat_irc_hook) nf_nat_irc;
+ unsigned int datalen;
/* If packet is coming from IRC server */
if (dir == IP_CT_DIR_REPLY)
@@ -140,8 +142,12 @@ static int help(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
+ datalen = skb->len - dataoff;
+ if (datalen > MAX_SEARCH_SIZE)
+ datalen = MAX_SEARCH_SIZE;
+
spin_lock_bh(&irc_buffer_lock);
- ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
+ ib_ptr = skb_header_pointer(skb, dataoff, datalen,
irc_buffer);
if (!ib_ptr) {
spin_unlock_bh(&irc_buffer_lock);
@@ -149,17 +155,39 @@ static int help(struct sk_buff *skb, unsigned int protoff,
}
data = ib_ptr;
- data_limit = ib_ptr + skb->len - dataoff;
+ data_limit = ib_ptr + datalen;
+
+ /* Skip any whitespace */
+ while (data < data_limit - 10) {
+ if (*data == ' ' || *data == '\r' || *data == '\n')
+ data++;
+ else
+ break;
+ }
- /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
- * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
- while (data < data_limit - (19 + MINMATCHLEN)) {
- if (memcmp(data, "\1DCC ", 5)) {
+ /* strlen("PRIVMSG x ")=10 */
+ if (data < data_limit - 10) {
+ if (strncasecmp("PRIVMSG ", data, 8))
+ goto out;
+ data += 8;
+ }
+
+ /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26
+ * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26
+ */
+ while (data < data_limit - (21 + MINMATCHLEN)) {
+ /* Find first " :", the start of message */
+ if (memcmp(data, " :", 2)) {
data++;
continue;
}
+ data += 2;
+
+ /* then check that place only for the DCC command */
+ if (memcmp(data, "\1DCC ", 5))
+ goto out;
data += 5;
- /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+ /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */
iph = ip_hdr(skb);
pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
@@ -175,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
pr_debug("DCC %s detected\n", dccprotos[i]);
/* we have at least
- * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+ * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid
* data left (== 14/13 bytes) */
if (parse_dcc(data, data_limit, &dcc_ip,
&dcc_port, &addr_beg_p, &addr_end_p)) {
@@ -188,8 +216,9 @@ static int help(struct sk_buff *skb, unsigned int protoff,
/* dcc_ip can be the internal OR external (NAT'ed) IP */
tuple = &ct->tuplehash[dir].tuple;
- if (tuple->src.u3.ip != dcc_ip &&
- tuple->dst.u3.ip != dcc_ip) {
+ if ((tuple->src.u3.ip != dcc_ip &&
+ ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) ||
+ dcc_port == 0) {
net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
&tuple->src.u3.ip,
&dcc_ip, dcc_port);
@@ -251,7 +280,7 @@ static int __init nf_conntrack_irc_init(void)
irc_exp_policy.max_expected = max_dcc_channels;
irc_exp_policy.timeout = dcc_timeout;
- irc_buffer = kmalloc(65536, GFP_KERNEL);
+ irc_buffer = kmalloc(MAX_SEARCH_SIZE + 1, GFP_KERNEL);
if (!irc_buffer)
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 522792556632..6e70e137a0a6 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -67,6 +67,8 @@ int nf_connlabels_get(struct net *net, unsigned int bits)
net->ct.labels_used++;
spin_unlock(&nf_connlabels_lock);
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
return 0;
}
EXPORT_SYMBOL_GPL(nf_connlabels_get);
@@ -78,21 +80,3 @@ void nf_connlabels_put(struct net *net)
spin_unlock(&nf_connlabels_lock);
}
EXPORT_SYMBOL_GPL(nf_connlabels_put);
-
-static const struct nf_ct_ext_type labels_extend = {
- .len = sizeof(struct nf_conn_labels),
- .align = __alignof__(struct nf_conn_labels),
- .id = NF_CT_EXT_LABELS,
-};
-
-int nf_conntrack_labels_init(void)
-{
- BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
-
- return nf_ct_extend_register(&labels_extend);
-}
-
-void nf_conntrack_labels_fini(void)
-{
- nf_ct_extend_unregister(&labels_extend);
-}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index 7f19ee259609..55415f011943 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -20,13 +20,14 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#define HELPER_NAME "netbios-ns"
#define NMBD_PORT 137
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_netbios_ns");
-MODULE_ALIAS_NFCT_HELPER("netbios_ns");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
static unsigned int timeout __read_mostly = 3;
module_param(timeout, uint, 0400);
@@ -44,7 +45,7 @@ static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
}
static struct nf_conntrack_helper helper __read_mostly = {
- .name = "netbios-ns",
+ .name = HELPER_NAME,
.tuple.src.l3num = NFPROTO_IPV4,
.tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 81d03acf68d4..7562b215b932 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -58,6 +58,12 @@
MODULE_LICENSE("GPL");
+struct ctnetlink_list_dump_ctx {
+ struct nf_conn *last;
+ unsigned int cpu;
+ bool done;
+};
+
static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l4proto *l4proto)
@@ -508,7 +514,7 @@ nla_put_failure:
static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
- if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
+ if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use))))
goto nla_put_failure;
return 0;
@@ -1195,12 +1201,11 @@ restart:
}
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
hnnode) {
- if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_is_expired(ct)) {
+ /* need to defer nf_ct_kill() until lock is released */
if (i < ARRAY_SIZE(nf_ct_evict) &&
- atomic_inc_not_zero(&ct->ct_general.use))
+ refcount_inc_not_zero(&ct->ct_general.use))
nf_ct_evict[i++] = ct;
continue;
}
@@ -1208,6 +1213,9 @@ restart:
if (!net_eq(net, nf_ct_net(ct)))
continue;
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+
if (cb->args[1]) {
if (ct != last)
continue;
@@ -1552,6 +1560,11 @@ static int ctnetlink_flush_conntrack(struct net *net,
u32 portid, int report, u8 family)
{
struct ctnetlink_filter *filter = NULL;
+ struct nf_ct_iter_data iter = {
+ .net = net,
+ .portid = portid,
+ .report = report,
+ };
if (ctnetlink_needs_filter(family, cda)) {
if (cda[CTA_FILTER])
@@ -1560,10 +1573,11 @@ static int ctnetlink_flush_conntrack(struct net *net,
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
+
+ iter.data = filter;
}
- nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
- portid, report);
+ nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);
kfree(filter);
return 0;
@@ -1693,87 +1707,109 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
static int ctnetlink_done_list(struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+
+ if (ctx->last)
+ nf_ct_put(ctx->last);
+
return 0;
}
-static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int ctnetlink_dump_one_entry(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nf_conn *ct,
+ bool dying)
{
- struct nf_conn *ct, *last;
- struct nf_conntrack_tuple_hash *h;
- struct hlist_nulls_node *n;
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- u_int8_t l3proto = nfmsg->nfgen_family;
+ u8 l3proto = nfmsg->nfgen_family;
int res;
- int cpu;
- struct hlist_nulls_head *list;
- struct net *net = sock_net(skb->sk);
- if (cb->args[2])
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
return 0;
- last = (struct nf_conn *)cb->args[1];
-
- for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
- struct ct_pcpu *pcpu;
+ if (ctx->last) {
+ if (ct != ctx->last)
+ return 0;
- if (!cpu_possible(cpu))
- continue;
+ ctx->last = NULL;
+ }
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
- spin_lock_bh(&pcpu->lock);
- list = dying ? &pcpu->dying : &pcpu->unconfirmed;
-restart:
- hlist_nulls_for_each_entry(h, n, list, hnnode) {
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (l3proto && nf_ct_l3num(ct) != l3proto)
- continue;
- if (cb->args[1]) {
- if (ct != last)
- continue;
- cb->args[1] = 0;
- }
+ /* We can't dump extension info for the unconfirmed
+ * list because unconfirmed conntracks can have
+ * ct->ext reallocated (and thus freed).
+ *
+ * In the dying list case ct->ext can't be free'd
+ * until after we drop pcpu->lock.
+ */
+ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, dying, 0);
+ if (res < 0) {
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return 0;
- /* We can't dump extension info for the unconfirmed
- * list because unconfirmed conntracks can have
- * ct->ext reallocated (and thus freed).
- *
- * In the dying list case ct->ext can't be free'd
- * until after we drop pcpu->lock.
- */
- res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct, dying ? true : false, 0);
- if (res < 0) {
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- continue;
- cb->args[0] = cpu;
- cb->args[1] = (unsigned long)ct;
- spin_unlock_bh(&pcpu->lock);
- goto out;
- }
- }
- if (cb->args[1]) {
- cb->args[1] = 0;
- goto restart;
- }
- spin_unlock_bh(&pcpu->lock);
+ ctx->last = ct;
}
- cb->args[2] = 1;
-out:
- if (last)
- nf_ct_put(last);
- return skb->len;
+ return res;
+}
+#endif
+
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return 0;
}
static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
- return ctnetlink_dump_list(skb, cb, true);
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+ struct nf_conn *last = ctx->last;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ const struct net *net = sock_net(skb->sk);
+ struct nf_conntrack_net_ecache *ecache_net;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+#endif
+
+ if (ctx->done)
+ return 0;
+
+ ctx->last = NULL;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ ecache_net = nf_conn_pernet_ecache(net);
+ spin_lock_bh(&ecache_net->dying_lock);
+
+ hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
+ struct nf_conn *ct;
+ int res;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (last && last != ct)
+ continue;
+
+ res = ctnetlink_dump_one_entry(skb, cb, ct, true);
+ if (res < 0) {
+ spin_unlock_bh(&ecache_net->dying_lock);
+ nf_ct_put(last);
+ return skb->len;
+ }
+
+ nf_ct_put(last);
+ last = NULL;
+ }
+
+ spin_unlock_bh(&ecache_net->dying_lock);
+#endif
+ ctx->done = true;
+ nf_ct_put(last);
+
+ return skb->len;
}
static int ctnetlink_get_ct_dying(struct sk_buff *skb,
@@ -1791,12 +1827,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
return -EOPNOTSUPP;
}
-static int
-ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, false);
-}
-
static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const cda[])
@@ -1819,7 +1849,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
const struct nlattr *attr)
__must_hold(RCU)
{
- struct nf_nat_hook *nat_hook;
+ const struct nf_nat_hook *nat_hook;
int err;
nat_hook = rcu_dereference(nf_nat_hook);
@@ -1861,45 +1891,10 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static void
-__ctnetlink_change_status(struct nf_conn *ct, unsigned long on,
- unsigned long off)
-{
- unsigned int bit;
-
- /* Ignore these unchangable bits */
- on &= ~IPS_UNCHANGEABLE_MASK;
- off &= ~IPS_UNCHANGEABLE_MASK;
-
- for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
- if (on & (1 << bit))
- set_bit(bit, &ct->status);
- else if (off & (1 << bit))
- clear_bit(bit, &ct->status);
- }
-}
-
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
- unsigned long d;
- unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
- d = ct->status ^ status;
-
- if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
- /* unchangeable */
- return -EBUSY;
-
- if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
- /* SEEN_REPLY bit can only be set */
- return -EBUSY;
-
- if (d & IPS_ASSURED && !(status & IPS_ASSURED))
- /* ASSURED bit can only be set */
- return -EBUSY;
-
- __ctnetlink_change_status(ct, status, 0);
- return 0;
+ return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS])));
}
static int
@@ -1975,7 +1970,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
}
if (help) {
- if (help->helper == helper) {
+ if (rcu_access_pointer(help->helper) == helper) {
/* update private helper data if allowed. */
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
@@ -1994,16 +1989,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
static int ctnetlink_change_timeout(struct nf_conn *ct,
const struct nlattr * const cda[])
{
- u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
-
- if (test_bit(IPS_DYING_BIT, &ct->status))
- return -ETIME;
-
- return 0;
+ return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
@@ -2263,9 +2249,7 @@ ctnetlink_create_conntrack(struct net *net,
goto err1;
timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = (u32)timeout + nfct_time_stamp;
+ __nf_ct_set_timeout(ct, timeout);
rcu_read_lock();
if (cda[CTA_HELP]) {
@@ -2310,14 +2294,10 @@ ctnetlink_create_conntrack(struct net *net,
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
- /* not in hash table yet so not strictly necessary */
+ /* disable helper auto-assignment for this entry */
+ ct->status |= IPS_HELPER;
RCU_INIT_POINTER(help->helper, helper);
}
- } else {
- /* try an implicit helper assignation */
- err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- if (err < 0)
- goto err2;
}
err = ctnetlink_setup_nat(ct, cda);
@@ -2806,7 +2786,7 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
* unchangeable bits but do not error out. Also user programs
* are allowed to clear the bits that they are allowed to change.
*/
- __ctnetlink_change_status(ct, status, ~status);
+ __nf_ct_change_status(ct, status, ~status);
return 0;
}
@@ -2921,7 +2901,7 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
}
-static struct nfnl_ct_hook ctnetlink_glue_hook = {
+static const struct nfnl_ct_hook ctnetlink_glue_hook = {
.build_size = ctnetlink_glue_build_size,
.build = ctnetlink_glue_build,
.parse = ctnetlink_glue_parse,
@@ -2995,7 +2975,7 @@ static const union nf_inet_addr any_addr;
static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
{
- static __read_mostly siphash_key_t exp_id_seed;
+ static siphash_aligned_key_t exp_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));
@@ -3382,12 +3362,17 @@ static int ctnetlink_get_expect(struct sk_buff *skb,
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data)
{
+ struct nf_conntrack_helper *helper;
const struct nf_conn_help *m_help;
const char *name = data;
m_help = nfct_help(exp->master);
- return strcmp(m_help->helper->name, name) == 0;
+ helper = rcu_dereference(m_help->helper);
+ if (!helper)
+ return false;
+
+ return strcmp(helper->name, name) == 0;
}
static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
@@ -3876,6 +3861,8 @@ static int __init ctnetlink_init(void)
{
int ret;
+ BUILD_BUG_ON(sizeof(struct ctnetlink_list_dump_ctx) > sizeof_field(struct netlink_callback, ctx));
+
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 7d5708b92138..4c679638df06 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -45,30 +45,8 @@ MODULE_ALIAS_NFCT_HELPER("pptp");
static DEFINE_SPINLOCK(nf_pptp_lock);
-int
-(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
-
-int
-(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
-
-void
-(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
- struct nf_conntrack_expect *expect_reply)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
-
-void
-(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
- struct nf_conntrack_expect *exp) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook);
#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
/* PptpControlMessageType names */
@@ -111,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name);
static void pptp_expectfn(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
+ const struct nf_nat_pptp_hook *hook;
struct net *net = nf_ct_net(ct);
- typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
pr_debug("increasing timeouts\n");
/* increase timeout of GRE data channel conntrack entry */
@@ -122,9 +100,9 @@ static void pptp_expectfn(struct nf_conn *ct,
/* Can you see how rusty this code is, compared with the pre-2.6.11
* one? That's what happened to my shiny newnat of 2002 ;( -HW */
- nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
- if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK)
- nf_nat_pptp_expectfn(ct, exp);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->master->status & IPS_NAT_MASK)
+ hook->expectfn(ct, exp);
else {
struct nf_conntrack_tuple inv_t;
struct nf_conntrack_expect *exp_other;
@@ -209,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
{
struct nf_conntrack_expect *exp_orig, *exp_reply;
+ const struct nf_nat_pptp_hook *hook;
enum ip_conntrack_dir dir;
int ret = 1;
- typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
exp_orig = nf_ct_expect_alloc(ct);
if (exp_orig == NULL)
@@ -239,9 +217,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
IPPROTO_GRE, &callid, &peer_callid);
exp_reply->expectfn = pptp_expectfn;
- nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
- if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
- nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ hook->exp_gre(exp_orig, exp_reply);
if (nf_ct_expect_related(exp_orig, 0) != 0)
goto out_put_both;
if (nf_ct_expect_related(exp_reply, 0) != 0)
@@ -279,9 +257,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
msg = ntohs(ctlh->messageType);
pr_debug("inbound control message %s\n", pptp_msg_name(msg));
@@ -383,10 +361,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
- if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_inbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
@@ -407,9 +384,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
msg = ntohs(ctlh->messageType);
pr_debug("outbound control message %s\n", pptp_msg_name(msg));
@@ -479,10 +456,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
- if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_outbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index d1f2d3c8d2b1..895b09cbd7cf 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -538,9 +538,13 @@ retry:
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
- if (fixup_needed)
- nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup,
- (void *)(unsigned long)nfproto, 0, 0);
+ if (fixup_needed) {
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = (void *)(unsigned long)nfproto,
+ };
+ nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
+ }
return err;
}
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 2394238d01c9..5a936334b517 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -489,6 +489,15 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
pr_debug("Setting vtag %x for dir %d\n",
ih->init_tag, !dir);
ct->proto.sctp.vtag[!dir] = ih->init_tag;
+
+ /* don't renew timeout on init retransmit so
+ * port reuse by client or NAT middlebox cannot
+ * keep entry alive indefinitely (incl. nat info).
+ */
+ if (new_state == SCTP_CONNTRACK_CLOSED &&
+ old_state == SCTP_CONNTRACK_CLOSED &&
+ nf_ct_is_confirmed(ct))
+ ignore = true;
}
ct->proto.sctp.state = new_state;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index af5115e127cf..656631083177 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -47,6 +47,12 @@ static const char *const tcp_conntrack_names[] = {
"SYN_SENT2",
};
+enum nf_ct_tcp_action {
+ NFCT_TCP_IGNORE,
+ NFCT_TCP_INVALID,
+ NFCT_TCP_ACCEPT,
+};
+
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
@@ -341,8 +347,8 @@ static void tcp_options(const struct sk_buff *skb,
if (!ptr)
return;
- state->td_scale =
- state->flags = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
while (length > 0) {
int opcode=*ptr++;
@@ -446,24 +452,71 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
}
}
-static bool tcp_in_window(struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- unsigned int index,
- const struct sk_buff *skb,
- unsigned int dataoff,
- const struct tcphdr *tcph,
- const struct nf_hook_state *hook_state)
+static void tcp_init_sender(struct ip_ct_tcp_state *sender,
+ struct ip_ct_tcp_state *receiver,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *tcph,
+ u32 end, u32 win)
+{
+ /* SYN-ACK in reply to a SYN
+ * or SYN from reply direction in simultaneous open.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ /* RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+ receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
+ sender->td_scale = 0;
+ receiver->td_scale = 0;
+ }
+}
+
+__printf(6, 7)
+static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb,
+ const struct nf_conn *ct,
+ const struct nf_hook_state *state,
+ const struct ip_ct_tcp_state *sender,
+ enum nf_ct_tcp_action ret,
+ const char *fmt, ...)
+{
+ const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct));
+ struct va_format vaf;
+ va_list args;
+ bool be_liberal;
+
+ be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal;
+ if (be_liberal)
+ return NFCT_TCP_ACCEPT;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf);
+ va_end(args);
+
+ return ret;
+}
+
+static enum nf_ct_tcp_action
+tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir,
+ unsigned int index, const struct sk_buff *skb,
+ unsigned int dataoff, const struct tcphdr *tcph,
+ const struct nf_hook_state *hook_state)
{
struct ip_ct_tcp *state = &ct->proto.tcp;
- struct net *net = nf_ct_net(ct);
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
struct ip_ct_tcp_state *sender = &state->seen[dir];
struct ip_ct_tcp_state *receiver = &state->seen[!dir];
- const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
__u32 seq, ack, sack, end, win, swin;
- u16 win_raw;
+ bool in_recv_win, seq_ok;
s32 receiver_offset;
- bool res, in_recv_win;
+ u16 win_raw;
/*
* Get the required data from the packet.
@@ -482,44 +535,17 @@ static bool tcp_in_window(struct nf_conn *ct,
ack -= receiver_offset;
sack -= receiver_offset;
- pr_debug("tcp_in_window: START\n");
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
if (sender->td_maxwin == 0) {
/*
* Initialize sender data.
*/
if (tcph->syn) {
- /*
- * SYN-ACK in reply to a SYN
- * or SYN from reply direction in simultaneous open.
- */
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
-
- tcp_options(skb, dataoff, tcph, sender);
- /*
- * RFC 1323:
- * Both sides must send the Window Scale option
- * to enable window scaling in either direction.
- */
- if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
- && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
- sender->td_scale =
- receiver->td_scale = 0;
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win);
if (!tcph->ack)
/* Simultaneous open */
- return true;
+ return NFCT_TCP_ACCEPT;
} else {
/*
* We are in the middle of a connection,
@@ -545,21 +571,24 @@ static bool tcp_in_window(struct nf_conn *ct,
}
}
- } else if (((state->state == TCP_CONNTRACK_SYN_SENT
- && dir == IP_CT_DIR_ORIGINAL)
- || (state->state == TCP_CONNTRACK_SYN_RECV
- && dir == IP_CT_DIR_REPLY))
- && after(end, sender->td_end)) {
+ } else if (tcph->syn &&
+ after(end, sender->td_end) &&
+ (state->state == TCP_CONNTRACK_SYN_SENT ||
+ state->state == TCP_CONNTRACK_SYN_RECV)) {
/*
* RFC 793: "if a TCP is reinitialized ... then it need
* not wait at all; it must only be sure to use sequence
* numbers larger than those recently used."
+ *
+ * Re-init state for this direction, just like for the first
+ * syn(-ack) reply, it might differ in seq, ack or tcp options.
*/
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win);
- tcp_options(skb, dataoff, tcph, sender);
+ if (dir == IP_CT_DIR_REPLY && !tcph->ack)
+ return NFCT_TCP_ACCEPT;
}
if (!(tcph->ack)) {
@@ -583,113 +612,166 @@ static bool tcp_in_window(struct nf_conn *ct,
*/
seq = end = sender->td_end;
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
+ seq_ok = before(seq, sender->td_maxend + 1);
+ if (!seq_ok) {
+ u32 overshot = end - sender->td_maxend + 1;
+ bool ack_ok;
+
+ ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1);
+ in_recv_win = receiver->td_maxwin &&
+ after(end, sender->td_end - receiver->td_maxwin - 1);
+
+ if (in_recv_win &&
+ ack_ok &&
+ overshot <= receiver->td_maxwin &&
+ before(sack, receiver->td_end + 1)) {
+ /* Work around TCPs that send more bytes than allowed by
+ * the receive window.
+ *
+ * If the (marked as invalid) packet is allowed to pass by
+ * the ruleset and the peer acks this data, then its possible
+ * all future packets will trigger 'ACK is over upper bound' check.
+ *
+ * Thus if only the sequence check fails then do update td_end so
+ * possible ACK for this data can update internal state.
+ */
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "%u bytes more than expected", overshot);
+ }
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "SEQ is over upper bound %u (over the window of the receiver)",
+ sender->td_maxend + 1);
+ }
+
+ if (!before(sack, receiver->td_end + 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "ACK is over upper bound %u (ACKed data not seen yet)",
+ receiver->td_end + 1);
/* Is the ending sequence in the receive window (if available)? */
in_recv_win = !receiver->td_maxwin ||
after(end, sender->td_end - receiver->td_maxwin - 1);
+ if (!in_recv_win)
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "SEQ is under lower bound %u (already ACKed data retransmitted)",
+ sender->td_end - receiver->td_maxwin - 1);
+ if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "ignored ACK under lower bound %u (possible overly delayed)",
+ receiver->td_end - MAXACKWINDOW(sender) - 1);
+
+ /* Take into account window scaling (RFC 1323). */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /* Update sender data. */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end)) {
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ }
+ if (tcph->ack) {
+ if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+ sender->td_maxack = ack;
+ sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+ } else if (after(ack, sender->td_maxack)) {
+ sender->td_maxack = ack;
+ }
+ }
- pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
- before(seq, sender->td_maxend + 1),
- (in_recv_win ? 1 : 0),
- before(sack, receiver->td_end + 1),
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
+ /* Update receiver data. */
+ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+ if (ack == receiver->td_end)
+ receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ /* Check retransmissions. */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir &&
+ state->last_seq == seq &&
+ state->last_ack == ack &&
+ state->last_end == end &&
+ state->last_win == win_raw) {
+ state->retrans++;
+ } else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->last_win = win_raw;
+ state->retrans = 0;
+ }
+ }
- if (before(seq, sender->td_maxend + 1) &&
- in_recv_win &&
- before(sack, receiver->td_end + 1) &&
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
- /*
- * Take into account window scaling (RFC 1323).
- */
- if (!tcph->syn)
- win <<= sender->td_scale;
+ return NFCT_TCP_ACCEPT;
+}
- /*
- * Update sender data.
- */
- swin = win + (sack - ack);
- if (sender->td_maxwin < swin)
- sender->td_maxwin = swin;
- if (after(end, sender->td_end)) {
- sender->td_end = end;
- sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
- }
- if (tcph->ack) {
- if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
- sender->td_maxack = ack;
- sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
- } else if (after(ack, sender->td_maxack))
- sender->td_maxack = ack;
- }
+static void __cold nf_tcp_handle_invalid(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ int index,
+ const struct sk_buff *skb,
+ const struct nf_hook_state *hook_state)
+{
+ const unsigned int *timeouts;
+ const struct nf_tcp_net *tn;
+ unsigned int timeout;
+ u32 expires;
- /*
- * Update receiver data.
- */
- if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
- receiver->td_maxwin += end - sender->td_maxend;
- if (after(sack + win, receiver->td_maxend - 1)) {
- receiver->td_maxend = sack + win;
- if (win == 0)
- receiver->td_maxend++;
- }
- if (ack == receiver->td_end)
- receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ if (!test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return;
- /*
- * Check retransmissions.
- */
- if (index == TCP_ACK_SET) {
- if (state->last_dir == dir
- && state->last_seq == seq
- && state->last_ack == ack
- && state->last_end == end
- && state->last_win == win_raw)
- state->retrans++;
- else {
- state->last_dir = dir;
- state->last_seq = seq;
- state->last_ack = ack;
- state->last_end = end;
- state->last_win = win_raw;
- state->retrans = 0;
- }
- }
- res = true;
- } else {
- res = false;
- if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
- tn->tcp_be_liberal)
- res = true;
- if (!res) {
- nf_ct_l4proto_log_invalid(skb, ct, hook_state,
- "%s",
- before(seq, sender->td_maxend + 1) ?
- in_recv_win ?
- before(sack, receiver->td_end + 1) ?
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
- : "ACK is under the lower bound (possible overly delayed ACK)"
- : "ACK is over the upper bound (ACKed data not seen yet)"
- : "SEQ is under the lower bound (already ACKed data retransmitted)"
- : "SEQ is over the upper bound (over the window of the receiver)");
- }
+ /* We don't want to have connections hanging around in ESTABLISHED
+ * state for long time 'just because' conntrack deemed a FIN/RST
+ * out-of-window.
+ *
+ * Shrink the timeout just like when there is unacked data.
+ * This speeds up eviction of 'dead' connections where the
+ * connection and conntracks internal state are out of sync.
+ */
+ switch (index) {
+ case TCP_RST_SET:
+ case TCP_FIN_SET:
+ break;
+ default:
+ return;
}
- pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
- "receiver end=%u maxend=%u maxwin=%u\n",
- res, sender->td_end, sender->td_maxend, sender->td_maxwin,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+ if (ct->proto.tcp.last_dir != dir &&
+ (ct->proto.tcp.last_index == TCP_FIN_SET ||
+ ct->proto.tcp.last_index == TCP_RST_SET)) {
+ expires = nf_ct_expires(ct);
+ if (expires < 120 * HZ)
+ return;
+
+ tn = nf_tcp_pernet(nf_ct_net(ct));
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = tn->timeouts;
+
+ timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]);
+ if (expires > timeout) {
+ nf_ct_l4proto_log_invalid(skb, ct, hook_state,
+ "packet (index %d, dir %d) response for index %d lower timeout to %u",
+ index, dir, ct->proto.tcp.last_index, timeout);
- return res;
+ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
+ }
+ } else {
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
+ }
}
/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
@@ -758,8 +840,6 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
enum tcp_conntrack new_state;
struct net *net = nf_ct_net(ct);
const struct nf_tcp_net *tn = nf_tcp_pernet(net);
- const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
- const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
/* Don't need lock here: this conntrack not in circulation yet */
new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
@@ -812,14 +892,6 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* tcp_packet will set them */
ct->proto.tcp.last_index = TCP_NONE_SET;
-
- pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- __func__,
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
return true;
}
@@ -839,6 +911,16 @@ static bool tcp_can_early_drop(const struct nf_conn *ct)
return false;
}
+static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
+{
+ state->td_end = 0;
+ state->td_maxend = 0;
+ state->td_maxwin = 0;
+ state->td_maxack = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
+}
+
/* Returns verdict for packet, or -1 for invalid. */
int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -851,6 +933,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
unsigned int index, *timeouts;
+ enum nf_ct_tcp_action res;
enum ip_conntrack_dir dir;
const struct tcphdr *th;
struct tcphdr _tcph;
@@ -945,8 +1028,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
ct->proto.tcp.last_flags;
- memset(&ct->proto.tcp.seen[dir], 0,
- sizeof(struct ip_ct_tcp_state));
+ nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
break;
}
ct->proto.tcp.last_index = index;
@@ -1009,10 +1091,11 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
}
/* Invalid packet */
- pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
- dir, get_conntrack_index(th), old_state);
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, state, "invalid state");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "packet (index %d) in dir %d invalid, state %s",
+ index, dir,
+ tcp_conntrack_names[old_state]);
return -NF_ACCEPT;
case TCP_CONNTRACK_TIME_WAIT:
/* RFC5961 compliance cause stack to send "challenge-ACK"
@@ -1116,10 +1199,18 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
break;
}
- if (!tcp_in_window(ct, dir, index,
- skb, dataoff, th, state)) {
+ res = tcp_in_window(ct, dir, index,
+ skb, dataoff, th, state);
+ switch (res) {
+ case NFCT_TCP_IGNORE:
+ spin_unlock_bh(&ct->lock);
+ return NF_ACCEPT;
+ case NFCT_TCP_INVALID:
+ nf_tcp_handle_invalid(ct, dir, index, skb, state);
spin_unlock_bh(&ct->lock);
return -NF_ACCEPT;
+ case NFCT_TCP_ACCEPT:
+ break;
}
in_window:
/* From now on we have got in-window packets */
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index fcb33b1d5456..13dc421fc4f5 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -34,10 +34,6 @@ MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
MODULE_DESCRIPTION("SANE connection tracking helper");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
-static char *sane_buffer;
-
-static DEFINE_SPINLOCK(nf_sane_lock);
-
#define MAX_PORTS 8
static u_int16_t ports[MAX_PORTS];
static unsigned int ports_c;
@@ -67,14 +63,16 @@ static int help(struct sk_buff *skb,
unsigned int dataoff, datalen;
const struct tcphdr *th;
struct tcphdr _tcph;
- void *sb_ptr;
int ret = NF_ACCEPT;
int dir = CTINFO2DIR(ctinfo);
struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple *tuple;
- struct sane_request *req;
struct sane_reply_net_start *reply;
+ union {
+ struct sane_request req;
+ struct sane_reply_net_start repl;
+ } buf;
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
@@ -92,59 +90,62 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
datalen = skb->len - dataoff;
-
- spin_lock_bh(&nf_sane_lock);
- sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
- if (!sb_ptr) {
- spin_unlock_bh(&nf_sane_lock);
- return NF_ACCEPT;
- }
-
if (dir == IP_CT_DIR_ORIGINAL) {
+ const struct sane_request *req;
+
if (datalen != sizeof(struct sane_request))
- goto out;
+ return NF_ACCEPT;
+
+ req = skb_header_pointer(skb, dataoff, datalen, &buf.req);
+ if (!req)
+ return NF_ACCEPT;
- req = sb_ptr;
if (req->RPC_code != htonl(SANE_NET_START)) {
/* Not an interesting command */
- ct_sane_info->state = SANE_STATE_NORMAL;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
+ return NF_ACCEPT;
}
/* We're interested in the next reply */
- ct_sane_info->state = SANE_STATE_START_REQUESTED;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_START_REQUESTED);
+ return NF_ACCEPT;
}
+ /* IP_CT_DIR_REPLY */
+
/* Is it a reply to an uninteresting command? */
- if (ct_sane_info->state != SANE_STATE_START_REQUESTED)
- goto out;
+ if (READ_ONCE(ct_sane_info->state) != SANE_STATE_START_REQUESTED)
+ return NF_ACCEPT;
/* It's a reply to SANE_NET_START. */
- ct_sane_info->state = SANE_STATE_NORMAL;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
if (datalen < sizeof(struct sane_reply_net_start)) {
pr_debug("NET_START reply too short\n");
- goto out;
+ return NF_ACCEPT;
}
- reply = sb_ptr;
+ datalen = sizeof(struct sane_reply_net_start);
+
+ reply = skb_header_pointer(skb, dataoff, datalen, &buf.repl);
+ if (!reply)
+ return NF_ACCEPT;
+
if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
/* saned refused the command */
pr_debug("unsuccessful SANE_STATUS = %u\n",
ntohl(reply->status));
- goto out;
+ return NF_ACCEPT;
}
/* Invalid saned reply? Ignore it. */
if (reply->zero != 0)
- goto out;
+ return NF_ACCEPT;
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
nf_ct_helper_log(skb, ct, "cannot alloc expectation");
- ret = NF_DROP;
- goto out;
+ return NF_DROP;
}
tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
@@ -162,9 +163,6 @@ static int help(struct sk_buff *skb,
}
nf_ct_expect_put(exp);
-
-out:
- spin_unlock_bh(&nf_sane_lock);
return ret;
}
@@ -178,7 +176,6 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
static void __exit nf_conntrack_sane_fini(void)
{
nf_conntrack_helpers_unregister(sane, ports_c * 2);
- kfree(sane_buffer);
}
static int __init nf_conntrack_sane_init(void)
@@ -187,10 +184,6 @@ static int __init nf_conntrack_sane_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master));
- sane_buffer = kmalloc(65536, GFP_KERNEL);
- if (!sane_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = SANE_PORT;
@@ -210,7 +203,6 @@ static int __init nf_conntrack_sane_init(void)
ret = nf_conntrack_helpers_register(sane, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(sane_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index 3066449f8bd8..7ab2b25b57bc 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -232,19 +232,3 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct,
this_way->offset_after : this_way->offset_before;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
-
-static const struct nf_ct_ext_type nf_ct_seqadj_extend = {
- .len = sizeof(struct nf_conn_seqadj),
- .align = __alignof__(struct nf_conn_seqadj),
- .id = NF_CT_EXT_SEQADJ,
-};
-
-int nf_conntrack_seqadj_init(void)
-{
- return nf_ct_extend_register(&nf_ct_seqadj_extend);
-}
-
-void nf_conntrack_seqadj_fini(void)
-{
- nf_ct_extend_unregister(&nf_ct_seqadj_extend);
-}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index b83dc9bf0a5d..77f5e82d8e3f 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -60,7 +60,7 @@ module_param(sip_external_media, int, 0600);
MODULE_PARM_DESC(sip_external_media, "Expect Media streams between external "
"endpoints (default 0)");
-const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+const struct nf_nat_sip_hooks __rcu *nf_nat_sip_hooks;
EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
static int string_len(const struct nf_conn *ct, const char *dptr,
@@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
return ret;
if (ret == 0)
break;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
*in_header = 0;
}
@@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
break;
if (ret == 0)
return ret;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
if (in_header)
@@ -1229,6 +1229,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
struct nf_conntrack_expect *exp;
union nf_inet_addr *saddr, daddr;
const struct nf_nat_sip_hooks *hooks;
+ struct nf_conntrack_helper *helper;
__be16 port;
u8 proto;
unsigned int expires = 0;
@@ -1289,10 +1290,14 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
if (sip_direct_signalling)
saddr = &ct->tuplehash[!dir].tuple.src.u3;
+ helper = rcu_dereference(nfct_help(ct)->helper);
+ if (!helper)
+ return NF_DROP;
+
nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
saddr, &daddr, proto, NULL, &port);
exp->timeout.expires = sip_timeout * HZ;
- exp->helper = nfct_help(ct)->helper;
+ exp->helper = helper;
exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
hooks = rcu_dereference(nf_nat_sip_hooks);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 80f675d884b2..4ffe84c5a82c 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -303,9 +303,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
int ret = 0;
WARN_ON(!ct);
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use)))
return 0;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct)) {
nf_ct_kill(ct);
goto release;
@@ -370,7 +373,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR);
ct_show_delta_time(s, ct);
- seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+ seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use));
if (seq_has_overflowed(s))
goto release;
@@ -558,7 +561,6 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_LOG_INVALID,
NF_SYSCTL_CT_EXPECT_MAX,
NF_SYSCTL_CT_ACCT,
- NF_SYSCTL_CT_HELPER,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
NF_SYSCTL_CT_EVENTS,
#endif
@@ -677,14 +679,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- [NF_SYSCTL_CT_HELPER] = {
- .procname = "nf_conntrack_helper",
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
#ifdef CONFIG_NF_CONNTRACK_EVENTS
[NF_SYSCTL_CT_EVENTS] = {
.procname = "nf_conntrack_events",
@@ -693,7 +687,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_TWO,
},
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
@@ -823,7 +817,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
-#if IS_ENABLED(CONFIG_NFT_FLOW_OFFLOAD)
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = {
.procname = "nf_flowtable_udp_timeout",
.maxlen = sizeof(unsigned int),
@@ -1097,7 +1091,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
- table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events;
#endif
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 14387e0b8008..0cc584d3dbb1 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -22,19 +22,21 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-struct nf_ct_timeout *
-(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
-
-void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_hook);
static int untimeout(struct nf_conn *ct, void *timeout)
{
struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
- if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
- RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ if (timeout_ext) {
+ const struct nf_ct_timeout *t;
+
+ t = rcu_access_pointer(timeout_ext->timeout);
+
+ if (!timeout || t == timeout)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ }
/* We are not intended to delete this conntrack. */
return 0;
@@ -42,37 +44,41 @@ static int untimeout(struct nf_conn *ct, void *timeout)
void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
{
- nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = timeout,
+ };
+
+ nf_ct_iterate_cleanup_net(untimeout, &iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_untimeout);
static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout)
{
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook);
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
- if (timeout_put)
- timeout_put(timeout);
+ if (h)
+ h->timeout_put(timeout);
}
int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
u8 l3num, u8 l4num, const char *timeout_name)
{
- typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+ const struct nf_ct_timeout_hooks *h;
struct nf_ct_timeout *timeout;
struct nf_conn_timeout *timeout_ext;
const char *errmsg = NULL;
int ret = 0;
rcu_read_lock();
- timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
- if (!timeout_find_get) {
+ h = rcu_dereference(nf_ct_timeout_hook);
+ if (!h) {
ret = -ENOENT;
errmsg = "Timeout policy base is empty";
goto out;
}
- timeout = timeout_find_get(net, timeout_name);
+ timeout = h->timeout_find_get(net, timeout_name);
if (!timeout) {
ret = -ENOENT;
pr_info_ratelimited("No such timeout policy \"%s\"\n",
@@ -119,37 +125,22 @@ EXPORT_SYMBOL_GPL(nf_ct_set_timeout);
void nf_ct_destroy_timeout(struct nf_conn *ct)
{
struct nf_conn_timeout *timeout_ext;
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h;
rcu_read_lock();
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+ h = rcu_dereference(nf_ct_timeout_hook);
- if (timeout_put) {
+ if (h) {
timeout_ext = nf_ct_timeout_find(ct);
if (timeout_ext) {
- timeout_put(timeout_ext->timeout);
+ struct nf_ct_timeout *t;
+
+ t = rcu_dereference(timeout_ext->timeout);
+ if (t)
+ h->timeout_put(t);
RCU_INIT_POINTER(timeout_ext->timeout, NULL);
}
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout);
-
-static const struct nf_ct_ext_type timeout_extend = {
- .len = sizeof(struct nf_conn_timeout),
- .align = __alignof__(struct nf_conn_timeout),
- .id = NF_CT_EXT_TIMEOUT,
-};
-
-int nf_conntrack_timeout_init(void)
-{
- int ret = nf_ct_extend_register(&timeout_extend);
- if (ret < 0)
- pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
- return ret;
-}
-
-void nf_conntrack_timeout_fini(void)
-{
- nf_ct_extend_unregister(&timeout_extend);
-}
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
index f656d393fa92..9e43a0a59e73 100644
--- a/net/netfilter/nf_conntrack_timestamp.c
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -19,27 +19,7 @@ static bool nf_ct_tstamp __read_mostly;
module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
-static const struct nf_ct_ext_type tstamp_extend = {
- .len = sizeof(struct nf_conn_tstamp),
- .align = __alignof__(struct nf_conn_tstamp),
- .id = NF_CT_EXT_TSTAMP,
-};
-
void nf_conntrack_tstamp_pernet_init(struct net *net)
{
net->ct.sysctl_tstamp = nf_ct_tstamp;
}
-
-int nf_conntrack_tstamp_init(void)
-{
- int ret;
- ret = nf_ct_extend_register(&tstamp_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_tstamp_fini(void)
-{
- nf_ct_extend_unregister(&tstamp_extend);
-}
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index a579e59ee5c5..a8e2425e43b0 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,14 +13,31 @@
#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+#define NF_RECURSION_LIMIT 2
+
+static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
+
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
+ enum nf_dev_hooks hook)
{
- if (skb_mac_header_was_set(skb))
+ if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
+ goto err;
+
+ if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
+ if (skb_cow_head(skb, skb->mac_len))
+ goto err;
+
skb_push(skb, skb->mac_len);
+ }
skb->dev = dev;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
+ __this_cpu_inc(nf_dup_skb_recursion);
dev_queue_xmit(skb);
+ __this_cpu_dec(nf_dup_skb_recursion);
+ return;
+err:
+ kfree_skb(skb);
}
void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
@@ -33,7 +50,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
return;
}
- nf_do_netdev_egress(pkt->skb, dev);
+ nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
@@ -48,7 +65,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
skb = skb_clone(pkt->skb, GFP_ATOMIC);
if (skb)
- nf_do_netdev_egress(skb, dev);
+ nf_do_netdev_egress(skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index ed37bb9b4e58..81c26a96c30b 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -39,22 +39,28 @@ flow_offload_fill_dir(struct flow_offload *flow,
ft->l3proto = ctt->src.l3num;
ft->l4proto = ctt->dst.protonum;
- ft->src_port = ctt->src.u.tcp.port;
- ft->dst_port = ctt->dst.u.tcp.port;
+
+ switch (ctt->dst.protonum) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+ break;
+ }
}
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
struct flow_offload *flow;
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(nf_ct_is_dying(ct)))
return NULL;
flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
if (!flow)
- goto err_ct_refcnt;
+ return NULL;
+ refcount_inc(&ct->ct_general.use);
flow->ct = ct;
flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
@@ -66,11 +72,6 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
__set_bit(NF_FLOW_DNAT, &flow->flags);
return flow;
-
-err_ct_refcnt:
- nf_ct_put(ct);
-
- return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);
@@ -173,12 +174,11 @@ EXPORT_SYMBOL_GPL(flow_offload_route_init);
static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
- tcp->state = TCP_CONNTRACK_ESTABLISHED;
tcp->seen[0].td_maxwin = 0;
tcp->seen[1].td_maxwin = 0;
}
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
@@ -187,7 +187,9 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
if (l4num == IPPROTO_TCP) {
struct nf_tcp_net *tn = nf_tcp_pernet(net);
- timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ flow_offload_fixup_tcp(&ct->proto.tcp);
+
+ timeout = tn->timeouts[ct->proto.tcp.state];
timeout -= tn->offload_timeout;
} else if (l4num == IPPROTO_UDP) {
struct nf_udp_net *tn = nf_udp_pernet(net);
@@ -205,18 +207,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
}
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
- if (nf_ct_protonum(ct) == IPPROTO_TCP)
- flow_offload_fixup_tcp(&ct->proto.tcp);
-}
-
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
- flow_offload_fixup_ct_state(ct);
- flow_offload_fixup_ct_timeout(ct);
-}
-
static void flow_offload_route_release(struct flow_offload *flow)
{
nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
@@ -329,8 +319,10 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
u32 timeout;
timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
- if (READ_ONCE(flow->timeout) != timeout)
+ if (timeout - READ_ONCE(flow->timeout) > HZ)
WRITE_ONCE(flow->timeout, timeout);
+ else
+ return;
if (likely(!nf_flowtable_hw_offload(flow_table)))
return;
@@ -353,22 +345,14 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
nf_flow_offload_rhash_params);
-
- clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
-
- if (nf_flow_has_expired(flow))
- flow_offload_fixup_ct(flow->ct);
- else
- flow_offload_fixup_ct_timeout(flow->ct);
-
flow_offload_free(flow);
}
void flow_offload_teardown(struct flow_offload *flow)
{
+ clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
set_bit(NF_FLOW_TEARDOWN, &flow->flags);
-
- flow_offload_fixup_ct_state(flow->ct);
+ flow_offload_fixup_ct(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -399,7 +383,8 @@ EXPORT_SYMBOL_GPL(flow_offload_lookup);
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
- void (*iter)(struct flow_offload *flow, void *data),
+ void (*iter)(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data),
void *data)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -423,7 +408,7 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
- iter(flow, data);
+ iter(flow_table, flow, data);
}
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
@@ -431,34 +416,12 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
return err;
}
-static bool flow_offload_stale_dst(struct flow_offload_tuple *tuple)
-{
- struct dst_entry *dst;
-
- if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
- tuple->xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
- dst = tuple->dst_cache;
- if (!dst_check(dst, tuple->dst_cookie))
- return true;
- }
-
- return false;
-}
-
-static bool nf_flow_has_stale_dst(struct flow_offload *flow)
-{
- return flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple) ||
- flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple);
-}
-
-static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
+static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
{
- struct nf_flowtable *flow_table = data;
-
if (nf_flow_has_expired(flow) ||
- nf_ct_is_dying(flow->ct) ||
- nf_flow_has_stale_dst(flow))
- set_bit(NF_FLOW_TEARDOWN, &flow->flags);
+ nf_ct_is_dying(flow->ct))
+ flow_offload_teardown(flow);
if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
if (test_bit(NF_FLOW_HW, &flow->flags)) {
@@ -474,12 +437,17 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
}
}
+void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
+{
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
+}
+
static void nf_flow_offload_work_gc(struct work_struct *work)
{
struct nf_flowtable *flow_table;
flow_table = container_of(work, struct nf_flowtable, gc_work.work);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
+ nf_flow_table_gc_run(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
@@ -595,7 +563,8 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);
-static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
@@ -636,24 +605,83 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
mutex_unlock(&flowtable_lock);
cancel_delayed_work_sync(&flow_table->gc_work);
- nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
nf_flow_table_offload_flush(flow_table);
- if (nf_flowtable_hw_offload(flow_table))
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step,
- flow_table);
+ /* ... no more pending work after this stage ... */
+ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+ nf_flow_table_gc_run(flow_table);
+ nf_flow_table_offload_flush_cleanup(flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
+static int nf_flow_table_init_net(struct net *net)
+{
+ net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
+ return net->ft.stat ? 0 : -ENOMEM;
+}
+
+static void nf_flow_table_fini_net(struct net *net)
+{
+ free_percpu(net->ft.stat);
+}
+
+static int nf_flow_table_pernet_init(struct net *net)
+{
+ int ret;
+
+ ret = nf_flow_table_init_net(net);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_init_proc(net);
+ if (ret < 0)
+ goto out_proc;
+
+ return 0;
+
+out_proc:
+ nf_flow_table_fini_net(net);
+ return ret;
+}
+
+static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_flow_table_fini_proc(net);
+ nf_flow_table_fini_net(net);
+ }
+}
+
+static struct pernet_operations nf_flow_table_net_ops = {
+ .init = nf_flow_table_pernet_init,
+ .exit_batch = nf_flow_table_pernet_exit,
+};
+
static int __init nf_flow_table_module_init(void)
{
- return nf_flow_table_offload_init();
+ int ret;
+
+ ret = register_pernet_subsys(&nf_flow_table_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_offload_init();
+ if (ret)
+ goto out_offload;
+
+ return 0;
+
+out_offload:
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
+ return ret;
}
static void __exit nf_flow_table_module_exit(void)
{
nf_flow_table_offload_exit();
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
}
module_init(nf_flow_table_module_init);
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index bc4126d8ef65..0ccabf3fa6aa 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -6,12 +6,29 @@
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
+#include <linux/if_vlan.h>
static unsigned int
nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
+ struct vlan_ethhdr *veth;
+ __be16 proto;
+
switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ proto = veth->h_vlan_encapsulated_proto;
+ break;
+ case htons(ETH_P_PPP_SES):
+ proto = nf_flow_pppoe_proto(skb);
+ break;
+ default:
+ proto = skb->protocol;
+ break;
+ }
+
+ switch (proto) {
case htons(ETH_P_IP):
return nf_flow_offload_ip_hook(priv, skb, state);
case htons(ETH_P_IPV6):
@@ -54,8 +71,30 @@ static struct nf_flowtable_type flowtable_inet = {
.owner = THIS_MODULE,
};
+static struct nf_flowtable_type flowtable_ipv4 = {
+ .family = NFPROTO_IPV4,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv4,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ip_hook,
+ .owner = THIS_MODULE,
+};
+
+static struct nf_flowtable_type flowtable_ipv6 = {
+ .family = NFPROTO_IPV6,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv6,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ipv6_hook,
+ .owner = THIS_MODULE,
+};
+
static int __init nf_flow_inet_module_init(void)
{
+ nft_register_flowtable_type(&flowtable_ipv4);
+ nft_register_flowtable_type(&flowtable_ipv6);
nft_register_flowtable_type(&flowtable_inet);
return 0;
@@ -64,6 +103,8 @@ static int __init nf_flow_inet_module_init(void)
static void __exit nf_flow_inet_module_exit(void)
{
nft_unregister_flowtable_type(&flowtable_inet);
+ nft_unregister_flowtable_type(&flowtable_ipv6);
+ nft_unregister_flowtable_type(&flowtable_ipv4);
}
module_init(nf_flow_inet_module_init);
@@ -71,5 +112,7 @@ module_exit(nf_flow_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);
MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */
MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module");
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 889cf88d3dba..b350fe9d00b0 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -8,8 +8,6 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
-#include <linux/if_pppox.h>
-#include <linux/ppp_defs.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
@@ -172,6 +170,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
struct flow_ports *ports;
unsigned int thoff;
struct iphdr *iph;
+ u8 ipproto;
if (!pskb_may_pull(skb, sizeof(*iph) + offset))
return -1;
@@ -185,13 +184,19 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
thoff += offset;
- switch (iph->protocol) {
+ ipproto = iph->protocol;
+ switch (ipproto) {
case IPPROTO_TCP:
*hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
*hdrsize = sizeof(struct udphdr);
break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ *hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
default:
return -1;
}
@@ -202,15 +207,29 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
iph = (struct iphdr *)(skb_network_header(skb) + offset);
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v4.s_addr = iph->saddr;
tuple->dst_v4.s_addr = iph->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET;
- tuple->l4proto = iph->protocol;
+ tuple->l4proto = ipproto;
tuple->iifidx = dev->ifindex;
nf_flow_tuple_encap(skb, tuple);
@@ -229,6 +248,15 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
return true;
}
+static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple)
+{
+ if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH &&
+ tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM)
+ return true;
+
+ return dst_check(tuple->dst_cache, tuple->dst_cookie);
+}
+
static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
const struct nf_hook_state *state,
struct dst_entry *dst)
@@ -239,22 +267,6 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
-static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
-{
- __be16 proto;
-
- proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
- sizeof(struct pppoe_hdr)));
- switch (proto) {
- case htons(PPP_IP):
- return htons(ETH_P_IP);
- case htons(PPP_IPV6):
- return htons(ETH_P_IPV6);
- }
-
- return 0;
-}
-
static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
u32 *offset)
{
@@ -364,6 +376,11 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
+ flow_offload_teardown(flow);
+ return NF_ACCEPT;
+ }
+
if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
@@ -376,7 +393,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
nf_flow_nat_ip(flow, skb, thoff, dir, iph);
ip_decrease_ttl(iph);
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
@@ -521,6 +538,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
struct flow_ports *ports;
struct ipv6hdr *ip6h;
unsigned int thoff;
+ u8 nexthdr;
thoff = sizeof(*ip6h) + offset;
if (!pskb_may_pull(skb, thoff))
@@ -528,13 +546,19 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
- switch (ip6h->nexthdr) {
+ nexthdr = ip6h->nexthdr;
+ switch (nexthdr) {
case IPPROTO_TCP:
*hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
*hdrsize = sizeof(struct udphdr);
break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ *hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
default:
return -1;
}
@@ -545,15 +569,29 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v6 = ip6h->saddr;
tuple->dst_v6 = ip6h->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET6;
- tuple->l4proto = ip6h->nexthdr;
+ tuple->l4proto = nexthdr;
tuple->iifidx = dev->ifindex;
nf_flow_tuple_encap(skb, tuple);
@@ -600,6 +638,11 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
return NF_ACCEPT;
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
+ flow_offload_teardown(flow);
+ return NF_ACCEPT;
+ }
+
if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
@@ -611,7 +654,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
nf_flow_nat_ipv6(flow, skb, dir, ip6h);
ip6h->hop_limit--;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index b561e0a44a45..b04645ced89b 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -20,7 +20,6 @@ static struct workqueue_struct *nf_flow_offload_stats_wq;
struct flow_offload_work {
struct list_head list;
enum flow_cls_command cmd;
- int priority;
struct nf_flowtable *flowtable;
struct flow_offload *flow;
struct work_struct work;
@@ -110,7 +109,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
nf_flow_rule_lwt_match(match, tun_info);
}
- key->meta.ingress_ifindex = tuple->iifidx;
+ if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC)
+ key->meta.ingress_ifindex = tuple->tc.iifidx;
+ else
+ key->meta.ingress_ifindex = tuple->iifidx;
+
mask->meta.ingress_ifindex = 0xffffffff;
if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) &&
@@ -170,6 +173,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP);
break;
case IPPROTO_UDP:
+ case IPPROTO_GRE:
break;
default:
return -EOPNOTSUPP;
@@ -178,15 +182,22 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
key->basic.ip_proto = tuple->l4proto;
mask->basic.ip_proto = 0xff;
- key->tp.src = tuple->src_port;
- mask->tp.src = 0xffff;
- key->tp.dst = tuple->dst_port;
- mask->tp.dst = 0xffff;
-
match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) |
BIT(FLOW_DISSECTOR_KEY_CONTROL) |
- BIT(FLOW_DISSECTOR_KEY_BASIC) |
- BIT(FLOW_DISSECTOR_KEY_PORTS);
+ BIT(FLOW_DISSECTOR_KEY_BASIC);
+
+ switch (tuple->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ key->tp.src = tuple->src_port;
+ mask->tp.src = 0xffff;
+ key->tp.dst = tuple->dst_port;
+ mask->tp.dst = 0xffff;
+
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_PORTS);
+ break;
+ }
+
return 0;
}
@@ -862,7 +873,8 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
return nf_flow_offload_tuple(offload->flowtable, offload->flow,
- flow_rule, dir, offload->priority,
+ flow_rule, dir,
+ offload->flowtable->priority,
FLOW_CLS_REPLACE, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -871,7 +883,8 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
- offload->priority, FLOW_CLS_DESTROY, NULL,
+ offload->flowtable->priority,
+ FLOW_CLS_DESTROY, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -922,7 +935,8 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload,
struct flow_stats *stats)
{
nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
- offload->priority, FLOW_CLS_STATS, stats,
+ offload->flowtable->priority,
+ FLOW_CLS_STATS, stats,
&offload->flowtable->flow_block.cb_list);
}
@@ -953,17 +967,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
static void flow_offload_work_handler(struct work_struct *work)
{
struct flow_offload_work *offload;
+ struct net *net;
offload = container_of(work, struct flow_offload_work, work);
+ net = read_pnet(&offload->flowtable->net);
switch (offload->cmd) {
case FLOW_CLS_REPLACE:
flow_offload_work_add(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_add);
break;
case FLOW_CLS_DESTROY:
flow_offload_work_del(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_del);
break;
case FLOW_CLS_STATS:
flow_offload_work_stats(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_stats);
break;
default:
WARN_ON_ONCE(1);
@@ -975,12 +994,18 @@ static void flow_offload_work_handler(struct work_struct *work)
static void flow_offload_queue_work(struct flow_offload_work *offload)
{
- if (offload->cmd == FLOW_CLS_REPLACE)
+ struct net *net = read_pnet(&offload->flowtable->net);
+
+ if (offload->cmd == FLOW_CLS_REPLACE) {
+ NF_FLOW_TABLE_STAT_INC(net, count_wq_add);
queue_work(nf_flow_offload_add_wq, &offload->work);
- else if (offload->cmd == FLOW_CLS_DESTROY)
+ } else if (offload->cmd == FLOW_CLS_DESTROY) {
+ NF_FLOW_TABLE_STAT_INC(net, count_wq_del);
queue_work(nf_flow_offload_del_wq, &offload->work);
- else
+ } else {
+ NF_FLOW_TABLE_STAT_INC(net, count_wq_stats);
queue_work(nf_flow_offload_stats_wq, &offload->work);
+ }
}
static struct flow_offload_work *
@@ -1000,7 +1025,6 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
offload->cmd = cmd;
offload->flow = flow;
- offload->priority = flowtable->priority;
offload->flowtable = flowtable;
INIT_WORK(&offload->work, flow_offload_work_handler);
@@ -1050,6 +1074,14 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
flow_offload_queue_work(offload);
}
+void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable)
+{
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_del_wq);
+ nf_flow_table_gc_run(flowtable);
+ }
+}
+
void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
{
if (nf_flowtable_hw_offload(flowtable)) {
diff --git a/net/netfilter/nf_flow_table_procfs.c b/net/netfilter/nf_flow_table_procfs.c
new file mode 100644
index 000000000000..159b033a43e6
--- /dev/null
+++ b/net/netfilter/nf_flow_table_procfs.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <net/netfilter/nf_flow_table.h>
+
+static void *nf_flow_table_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos - 1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ft.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *nf_flow_table_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ft.stat, cpu);
+ }
+ (*pos)++;
+ return NULL;
+}
+
+static void nf_flow_table_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int nf_flow_table_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ const struct nf_flow_table_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "wq_add wq_del wq_stats\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%8d %8d %8d\n",
+ st->count_wq_add,
+ st->count_wq_del,
+ st->count_wq_stats
+ );
+ return 0;
+}
+
+static const struct seq_operations nf_flow_table_cpu_seq_ops = {
+ .start = nf_flow_table_cpu_seq_start,
+ .next = nf_flow_table_cpu_seq_next,
+ .stop = nf_flow_table_cpu_seq_stop,
+ .show = nf_flow_table_cpu_seq_show,
+};
+
+int nf_flow_table_init_proc(struct net *net)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_net("nf_flowtable", 0444, net->proc_net_stat,
+ &nf_flow_table_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+ return pde ? 0 : -ENOMEM;
+}
+
+void nf_flow_table_fini_proc(struct net *net)
+{
+ remove_proc_entry("nf_flowtable", net->proc_net_stat);
+}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index edee7fa944c1..8a29290149bd 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -443,9 +443,9 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
mutex_lock(&nf_log_mutex);
logger = nft_log_dereference(net->nf.nf_loggers[tindex]);
if (!logger)
- strlcpy(buf, "NONE", sizeof(buf));
+ strscpy(buf, "NONE", sizeof(buf));
else
- strlcpy(buf, logger->name, sizeof(buf));
+ strscpy(buf, logger->name, sizeof(buf));
mutex_unlock(&nf_log_mutex);
r = proc_dostring(&tmp, write, buffer, lenp, ppos);
}
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
index 13234641cdb3..cb894f0d63e9 100644
--- a/net/netfilter/nf_log_syslog.c
+++ b/net/netfilter/nf_log_syslog.c
@@ -40,6 +40,12 @@ struct arppayload {
unsigned char ip_dst[4];
};
+/* Guard against containers flooding syslog. */
+static bool nf_log_allowed(const struct net *net)
+{
+ return net_eq(net, &init_net) || sysctl_nf_log_all_netns;
+}
+
static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb)
{
u16 vid;
@@ -61,7 +67,7 @@ dump_arp_packet(struct nf_log_buf *m,
unsigned int logflags;
struct arphdr _arph;
- ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ ah = skb_header_pointer(skb, nhoff, sizeof(_arph), &_arph);
if (!ah) {
nf_log_buf_add(m, "TRUNCATED");
return;
@@ -90,7 +96,7 @@ dump_arp_packet(struct nf_log_buf *m,
ah->ar_pln != sizeof(__be32))
return;
- ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
+ ap = skb_header_pointer(skb, nhoff + sizeof(_arph), sizeof(_arpp), &_arpp);
if (!ap) {
nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
skb->len - sizeof(_arph));
@@ -133,8 +139,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
{
struct nf_log_buf *m;
- /* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
+ if (!nf_log_allowed(net))
return;
m = nf_log_buf_open();
@@ -144,7 +149,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
prefix);
- dump_arp_packet(m, loginfo, skb, 0);
+ dump_arp_packet(m, loginfo, skb, skb_network_offset(skb));
nf_log_buf_close(m);
}
@@ -766,9 +771,9 @@ dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
}
-static void dump_ipv4_mac_header(struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb)
+static void dump_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
unsigned int logflags = 0;
@@ -798,9 +803,26 @@ fallback:
const unsigned char *p = skb_mac_header(skb);
unsigned int i;
- nf_log_buf_add(m, "%02x", *p++);
- for (i = 1; i < dev->hard_header_len; i++, p++)
- nf_log_buf_add(m, ":%02x", *p);
+ if (dev->type == ARPHRD_SIT) {
+ p -= ETH_HLEN;
+
+ if (p < skb->head)
+ p = NULL;
+ }
+
+ if (p) {
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++)
+ nf_log_buf_add(m, ":%02x", *p++);
+ }
+
+ if (dev->type == ARPHRD_SIT) {
+ const struct iphdr *iph =
+ (struct iphdr *)skb_mac_header(skb);
+
+ nf_log_buf_add(m, " TUNNEL=%pI4->%pI4", &iph->saddr,
+ &iph->daddr);
+ }
}
nf_log_buf_add(m, " ");
}
@@ -814,8 +836,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
{
struct nf_log_buf *m;
- /* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
+ if (!nf_log_allowed(net))
return;
m = nf_log_buf_open();
@@ -827,9 +848,9 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
out, loginfo, prefix);
if (in)
- dump_ipv4_mac_header(m, loginfo, skb);
+ dump_mac_header(m, loginfo, skb);
- dump_ipv4_packet(net, m, loginfo, skb, 0);
+ dump_ipv4_packet(net, m, loginfo, skb, skb_network_offset(skb));
nf_log_buf_close(m);
}
@@ -841,64 +862,6 @@ static struct nf_logger nf_ip_logger __read_mostly = {
.me = THIS_MODULE,
};
-static void dump_ipv6_mac_header(struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- unsigned int logflags = 0;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
-
- if (!(logflags & NF_LOG_MACDECODE))
- goto fallback;
-
- switch (dev->type) {
- case ARPHRD_ETHER:
- nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
- nf_log_dump_vlan(m, skb);
- nf_log_buf_add(m, "MACPROTO=%04x ",
- ntohs(eth_hdr(skb)->h_proto));
- return;
- default:
- break;
- }
-
-fallback:
- nf_log_buf_add(m, "MAC=");
- if (dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- const unsigned char *p = skb_mac_header(skb);
- unsigned int len = dev->hard_header_len;
- unsigned int i;
-
- if (dev->type == ARPHRD_SIT) {
- p -= ETH_HLEN;
-
- if (p < skb->head)
- p = NULL;
- }
-
- if (p) {
- nf_log_buf_add(m, "%02x", *p++);
- for (i = 1; i < len; i++)
- nf_log_buf_add(m, ":%02x", *p++);
- }
- nf_log_buf_add(m, " ");
-
- if (dev->type == ARPHRD_SIT) {
- const struct iphdr *iph =
- (struct iphdr *)skb_mac_header(skb);
- nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr,
- &iph->daddr);
- }
- } else {
- nf_log_buf_add(m, " ");
- }
-}
-
static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
unsigned int hooknum, const struct sk_buff *skb,
const struct net_device *in,
@@ -908,8 +871,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
{
struct nf_log_buf *m;
- /* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
+ if (!nf_log_allowed(net))
return;
m = nf_log_buf_open();
@@ -921,7 +883,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
loginfo, prefix);
if (in)
- dump_ipv6_mac_header(m, loginfo, skb);
+ dump_mac_header(m, loginfo, skb);
dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
@@ -935,6 +897,32 @@ static struct nf_logger nf_ip6_logger __read_mostly = {
.me = THIS_MODULE,
};
+static void nf_log_unknown_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix);
+
+ dump_mac_header(m, loginfo, skb);
+
+ nf_log_buf_close(m);
+}
+
static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
@@ -954,6 +942,10 @@ static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
case htons(ETH_P_RARP):
nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
break;
+ default:
+ nf_log_unknown_packet(net, pf, hooknum, skb,
+ in, out, loginfo, prefix);
+ break;
}
}
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index 3bc7e0854efe..98deef6cde69 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -44,19 +44,7 @@ static unsigned int help(struct sk_buff *skb,
exp->expectfn = nf_nat_follow_master;
/* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int res;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- res = nf_ct_expect_related(exp, 0);
- if (res == 0)
- break;
- else if (res != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
new file mode 100644
index 000000000000..0fa5a0bbb0ff
--- /dev/null
+++ b/net/netfilter/nf_nat_bpf.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable NAT Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_nat BTF");
+
+/* bpf_ct_set_nat_info - Set source or destination nat address
+ *
+ * Set source or destination nat address of the newly allocated
+ * nf_conn before insertion. This must be invoked for referenced
+ * PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @addr - Nat source/destination address
+ * @port - Nat source/destination port. Non-positive values are
+ * interpreted as select a random port.
+ * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
+ */
+int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+ union nf_inet_addr *addr, int port,
+ enum nf_nat_manip_type manip)
+{
+ struct nf_conn *ct = (struct nf_conn *)nfct;
+ u16 proto = nf_ct_l3num(ct);
+ struct nf_nat_range2 range;
+
+ if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6)
+ return -EINVAL;
+
+ memset(&range, 0, sizeof(struct nf_nat_range2));
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = *addr;
+ range.max_addr = range.min_addr;
+ if (port > 0) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto.all = cpu_to_be16(port);
+ range.max_proto.all = range.min_proto.all;
+ }
+
+ return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(nf_nat_kfunc_set)
+BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
+BTF_SET8_END(nf_nat_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_nat_kfunc_set,
+};
+
+int register_nf_nat_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &nf_bpf_nat_kfunc_set);
+ if (ret)
+ return ret;
+
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+ &nf_bpf_nat_kfunc_set);
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 4d50d51db796..e29e4ccb5c5a 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -16,7 +16,7 @@
#include <linux/siphash.h>
#include <linux/rtnetlink.h>
-#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
@@ -34,7 +34,7 @@ static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
-static siphash_key_t nf_nat_hash_rnd __read_mostly;
+static siphash_aligned_key_t nf_nat_hash_rnd;
struct nf_nat_lookup_hook_priv {
struct nf_hook_entries __rcu *entries;
@@ -468,7 +468,7 @@ find_free_id:
if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
else
- off = prandom_u32();
+ off = get_random_u16();
attempts = range_size;
if (attempts > max_attempts)
@@ -490,7 +490,7 @@ another_round:
if (attempts >= range_size || attempts < 16)
return;
attempts /= 2;
- off = prandom_u32();
+ off = get_random_u16();
goto another_round;
}
@@ -801,7 +801,7 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
return i->status & IPS_NAT_MASK ? 1 : 0;
}
-static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
unsigned int h;
@@ -823,7 +823,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table.
*/
if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
- __nf_nat_cleanup_conntrack(ct);
+ nf_nat_cleanup_conntrack(ct);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -831,20 +831,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
return 0;
}
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
- if (ct->status & IPS_SRC_NAT_DONE)
- __nf_nat_cleanup_conntrack(ct);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
- .len = sizeof(struct nf_conn_nat),
- .align = __alignof__(struct nf_conn_nat),
- .destroy = nf_nat_cleanup_conntrack,
- .id = NF_CT_EXT_NAT,
-};
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -1130,12 +1116,13 @@ static struct pernet_operations nat_net_ops = {
.size = sizeof(struct nat_net),
};
-static struct nf_nat_hook nat_hook = {
+static const struct nf_nat_hook nat_hook = {
.parse_nat_setup = nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
.decode_session = __nf_nat_decode_session,
#endif
.manip_pkt = nf_nat_manip_pkt,
+ .remove_nat_bysrc = nf_nat_cleanup_conntrack,
};
static int __init nf_nat_init(void)
@@ -1151,19 +1138,11 @@ static int __init nf_nat_init(void)
if (!nf_nat_bysource)
return -ENOMEM;
- ret = nf_ct_extend_register(&nat_extend);
- if (ret < 0) {
- kvfree(nf_nat_bysource);
- pr_err("Unable to register extension\n");
- return ret;
- }
-
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
ret = register_pernet_subsys(&nat_net_ops);
if (ret < 0) {
- nf_ct_extend_unregister(&nat_extend);
kvfree(nf_nat_bysource);
return ret;
}
@@ -1173,7 +1152,16 @@ static int __init nf_nat_init(void)
WARN_ON(nf_nat_hook != NULL);
RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
- return 0;
+ ret = register_nf_nat_bpf();
+ if (ret < 0) {
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ synchronize_net();
+ unregister_pernet_subsys(&nat_net_ops);
+ kvfree(nf_nat_bysource);
+ }
+
+ return ret;
}
static void __exit nf_nat_cleanup(void)
@@ -1182,7 +1170,6 @@ static void __exit nf_nat_cleanup(void)
nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
- nf_ct_extend_unregister(&nat_extend);
nf_ct_helper_expectfn_unregister(&follow_master_nat);
RCU_INIT_POINTER(nf_nat_hook, NULL);
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index aace6768a64e..c92a436d9c48 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -86,22 +86,9 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
* this one. */
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
- nf_ct_helper_log(skb, ct, "all ports in use");
+ nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
}
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index a263505455fc..a95a25196943 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -198,3 +198,34 @@ void nf_nat_follow_master(struct nf_conn *ct,
nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
}
EXPORT_SYMBOL(nf_nat_follow_master);
+
+u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port)
+{
+ static const unsigned int max_attempts = 128;
+ int range, attempts_left;
+ u16 min = port;
+
+ range = USHRT_MAX - port;
+ attempts_left = range;
+
+ if (attempts_left > max_attempts)
+ attempts_left = max_attempts;
+
+ /* Try to get same port: if not, try to change it. */
+ for (;;) {
+ int res;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ res = nf_ct_expect_related(exp, 0);
+ if (res == 0)
+ return port;
+
+ if (res != -EBUSY || (--attempts_left < 0))
+ break;
+
+ port = min + prandom_u32_max(range);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_exp_find_port);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
index c691ab8d234c..19c4fcc60c50 100644
--- a/net/netfilter/nf_nat_irc.c
+++ b/net/netfilter/nf_nat_irc.c
@@ -48,20 +48,8 @@ static unsigned int help(struct sk_buff *skb,
exp->dir = IP_CT_DIR_REPLY;
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp,
+ ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use");
return NF_DROP;
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index acd73f717a08..1a506b0c6511 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -12,6 +12,7 @@
struct masq_dev_work {
struct work_struct work;
struct net *net;
+ netns_tracker ns_tracker;
union nf_inet_addr addr;
int ifindex;
int (*iter)(struct nf_conn *i, void *data);
@@ -76,13 +77,16 @@ EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
static void iterate_cleanup_work(struct work_struct *work)
{
+ struct nf_ct_iter_data iter_data = {};
struct masq_dev_work *w;
w = container_of(work, struct masq_dev_work, work);
- nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0);
+ iter_data.net = w->net;
+ iter_data.data = (void *)w;
+ nf_ct_iterate_cleanup_net(w->iter, &iter_data);
- put_net(w->net);
+ put_net_track(w->net, &w->ns_tracker);
kfree(w);
atomic_dec(&masq_worker_count);
module_put(THIS_MODULE);
@@ -119,6 +123,7 @@ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
INIT_WORK(&w->work, iterate_cleanup_work);
w->ifindex = ifindex;
w->net = net;
+ netns_tracker_alloc(net, &w->ns_tracker, gfp_flags);
w->iter = iter;
if (addr)
w->addr = *addr;
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index f0a735e86851..cf4aeb299bde 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -410,19 +410,7 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
exp->dir = !dir;
exp->expectfn = nf_nat_sip_expected;
- for (; port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.udp.port = htons(port);
- ret = nf_ct_expect_related(exp, NF_CT_EXP_F_SKIP_MASTER);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, port);
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use for SIP");
return NF_DROP;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 6d12afabfe8a..63d1516816b1 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -46,6 +46,15 @@ void nf_unregister_queue_handler(void)
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
+static void nf_queue_sock_put(struct sock *sk)
+{
+#ifdef CONFIG_INET
+ sock_gen_put(sk);
+#else
+ sock_put(sk);
+#endif
+}
+
static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
@@ -54,7 +63,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
dev_put(state->in);
dev_put(state->out);
if (state->sk)
- sock_put(state->sk);
+ nf_queue_sock_put(state->sk);
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
dev_put(entry->physin);
@@ -87,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
}
/* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
+ if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+ return false;
+
dev_hold(state->in);
dev_hold(state->out);
- if (state->sk)
- sock_hold(state->sk);
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
dev_hold(entry->physin);
dev_hold(entry->physout);
#endif
+ return true;
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
@@ -169,6 +180,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
break;
}
+ if (skb_sk_is_prefetched(skb)) {
+ struct sock *sk = skb->sk;
+
+ if (!sk_is_refcounted(sk)) {
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ return -ENOTCONN;
+
+ /* drop refcount on skb_orphan */
+ skb->destructor = sock_edemux;
+ }
+ }
+
entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
if (!entry)
return -ENOMEM;
@@ -187,7 +210,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
__nf_queue_entry_init_physdevs(entry);
- nf_queue_entry_get_refs(entry);
+ if (!nf_queue_entry_get_refs(entry)) {
+ kfree(entry);
+ return -ENOTCONN;
+ }
switch (entry->state.pf) {
case AF_INET:
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 3d6d49420db8..16915f8eef2b 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -236,12 +236,6 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
return 1;
}
-static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
- .len = sizeof(struct nf_conn_synproxy),
- .align = __alignof__(struct nf_conn_synproxy),
- .id = NF_CT_EXT_SYNPROXY,
-};
-
#ifdef CONFIG_PROC_FS
static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -349,7 +343,6 @@ static int __net_init synproxy_net_init(struct net *net)
goto err2;
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
snet->tmpl = ct;
snet->stats = alloc_percpu(struct synproxy_stats);
@@ -388,28 +381,12 @@ static struct pernet_operations synproxy_net_ops = {
static int __init synproxy_core_init(void)
{
- int err;
-
- err = nf_ct_extend_register(&nf_ct_synproxy_extend);
- if (err < 0)
- goto err1;
-
- err = register_pernet_subsys(&synproxy_net_ops);
- if (err < 0)
- goto err2;
-
- return 0;
-
-err2:
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
-err1:
- return err;
+ return register_pernet_subsys(&synproxy_net_ops);
}
static void __exit synproxy_core_exit(void)
{
unregister_pernet_subsys(&synproxy_net_ops);
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
}
module_init(synproxy_core_init);
@@ -428,7 +405,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
iph->tos = 0;
iph->id = 0;
iph->frag_off = htons(IP_DF);
- iph->ttl = net->ipv4.sysctl_ip_default_ttl;
+ iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
iph->protocol = IPPROTO_TCP;
iph->check = 0;
iph->saddr = saddr;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c0851fec11d4..e7152d599d73 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -32,7 +32,6 @@ static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_destroy_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
-static u64 table_handle;
enum {
NFT_VALIDATE_SKIP = 0,
@@ -153,6 +152,7 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
if (trans == NULL)
return NULL;
+ INIT_LIST_HEAD(&trans->list);
trans->msg_type = msg_type;
trans->ctx = *ctx;
@@ -222,12 +222,18 @@ err_register:
}
static void nft_netdev_unregister_hooks(struct net *net,
- struct list_head *hook_list)
+ struct list_head *hook_list,
+ bool release_netdev)
{
- struct nft_hook *hook;
+ struct nft_hook *hook, *next;
- list_for_each_entry(hook, hook_list, list)
+ list_for_each_entry_safe(hook, next, hook_list, list) {
nf_unregister_net_hook(net, &hook->ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+ }
}
static int nf_tables_register_hook(struct net *net,
@@ -253,9 +259,10 @@ static int nf_tables_register_hook(struct net *net,
return nf_register_net_hook(net, &basechain->ops);
}
-static void nf_tables_unregister_hook(struct net *net,
- const struct nft_table *table,
- struct nft_chain *chain)
+static void __nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain,
+ bool release_netdev)
{
struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
@@ -270,11 +277,19 @@ static void nf_tables_unregister_hook(struct net *net,
return basechain->type->ops_unregister(net, ops);
if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
- nft_netdev_unregister_hooks(net, &basechain->hook_list);
+ nft_netdev_unregister_hooks(net, &basechain->hook_list,
+ release_netdev);
else
nf_unregister_net_hook(net, &basechain->ops);
}
+static void nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain)
+{
+ return __nf_tables_unregister_hook(net, table, chain, false);
+}
+
static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
{
struct nftables_pernet *nft_net = nft_pernet(net);
@@ -529,6 +544,7 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
if (msg_type == NFT_MSG_NEWFLOWTABLE)
nft_activate_next(ctx->net, flowtable);
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
nft_trans_flowtable(trans) = flowtable;
nft_trans_commit_list_add_tail(ctx->net, trans);
@@ -550,6 +566,58 @@ static int nft_delflowtable(struct nft_ctx *ctx,
return err;
}
+static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg)
+{
+ int i;
+
+ for (i = track->regs[dreg].num_reg; i > 0; i--)
+ __nft_reg_track_cancel(track, dreg - i);
+}
+
+static void __nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr,
+ u8 dreg, u8 num_reg)
+{
+ track->regs[dreg].selector = expr;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = num_reg;
+}
+
+void nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_update(track, expr, dreg, i);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_update);
+
+void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_cancel(track, dreg);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_cancel);
+
+void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg)
+{
+ track->regs[dreg].selector = NULL;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = 0;
+}
+EXPORT_SYMBOL_GPL(__nft_reg_track_cancel);
+
/*
* Tables
*/
@@ -674,7 +742,7 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt,
return -ENOMEM;
req->done = false;
- strlcpy(req->module, module_name, MODULE_NAME_LEN);
+ strscpy(req->module, module_name, MODULE_NAME_LEN);
list_add_tail(&req->list, &nft_net->module_list);
return -EAGAIN;
@@ -820,7 +888,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = nft_net->base_seq;
+ cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -1072,6 +1140,30 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg,
return strcmp(obj->key.name, k->name);
}
+static bool nft_supported_family(u8 family)
+{
+ return false
+#ifdef CONFIG_NF_TABLES_INET
+ || family == NFPROTO_INET
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ || family == NFPROTO_IPV4
+#endif
+#ifdef CONFIG_NF_TABLES_ARP
+ || family == NFPROTO_ARP
+#endif
+#ifdef CONFIG_NF_TABLES_NETDEV
+ || family == NFPROTO_NETDEV
+#endif
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
+ || family == NFPROTO_BRIDGE
+#endif
+#ifdef CONFIG_NF_TABLES_IPV6
+ || family == NFPROTO_IPV6
+#endif
+ ;
+}
+
static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const nla[])
{
@@ -1086,6 +1178,9 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
u32 flags = 0;
int err;
+ if (!nft_supported_family(family))
+ return -EOPNOTSUPP;
+
lockdep_assert_held(&nft_net->commit_mutex);
attr = nla[NFTA_TABLE_NAME];
table = nft_table_lookup(net, attr, family, genmask,
@@ -1113,16 +1208,16 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
}
err = -ENOMEM;
- table = kzalloc(sizeof(*table), GFP_KERNEL);
+ table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
if (table == NULL)
goto err_kzalloc;
- table->name = nla_strdup(attr, GFP_KERNEL);
+ table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (table->name == NULL)
goto err_strdup;
if (nla[NFTA_TABLE_USERDATA]) {
- table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL);
+ table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
if (table->udata == NULL)
goto err_table_udata;
@@ -1139,7 +1234,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
INIT_LIST_HEAD(&table->flowtables);
table->family = family;
table->flags = flags;
- table->handle = ++table_handle;
+ table->handle = ++nft_net->table_handle;
if (table->flags & NFT_TABLE_F_OWNER)
table->nlpid = NETLINK_CB(skb).portid;
@@ -1609,7 +1704,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = nft_net->base_seq;
+ cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -1747,16 +1842,16 @@ static void nft_chain_stats_replace(struct nft_trans *trans)
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
{
- struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
- struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+ struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0);
+ struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1);
if (g0 != g1)
kvfree(g1);
kvfree(g0);
/* should be NULL either via abort or via successful commit */
- WARN_ON_ONCE(chain->rules_next);
- kvfree(chain->rules_next);
+ WARN_ON_ONCE(chain->blob_next);
+ kvfree(chain->blob_next);
}
void nf_tables_chain_destroy(struct nft_ctx *ctx)
@@ -1803,7 +1898,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
struct nft_hook *hook;
int err;
- hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL);
+ hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
if (!hook) {
err = -ENOMEM;
goto err_hook_alloc;
@@ -1820,7 +1915,6 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
goto err_hook_dev;
}
hook->ops.dev = dev;
- hook->inactive = false;
return hook;
@@ -2002,23 +2096,38 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
struct nft_rules_old {
struct rcu_head h;
- struct nft_rule **start;
+ struct nft_rule_blob *blob;
};
-static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
- unsigned int alloc)
+static void nft_last_rule(struct nft_rule_blob *blob, const void *ptr)
{
- if (alloc > INT_MAX)
+ struct nft_rule_dp *prule;
+
+ prule = (struct nft_rule_dp *)ptr;
+ prule->is_last = 1;
+ /* blob size does not include the trailer rule */
+}
+
+static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size)
+{
+ struct nft_rule_blob *blob;
+
+ /* size must include room for the last rule */
+ if (size < offsetof(struct nft_rule_dp, data))
return NULL;
- alloc += 1; /* NULL, ends rules */
- if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+ size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rules_old);
+ if (size > INT_MAX)
return NULL;
- alloc *= sizeof(struct nft_rule *);
- alloc += sizeof(struct nft_rules_old);
+ blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return NULL;
+
+ blob->size = 0;
+ nft_last_rule(blob, blob->data);
- return kvmalloc(alloc, GFP_KERNEL);
+ return blob;
}
static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
@@ -2057,8 +2166,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
chain->flags |= NFT_CHAIN_BASE | flags;
basechain->policy = NF_ACCEPT;
if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
- nft_chain_offload_priority(basechain) < 0)
+ !nft_chain_offload_support(basechain)) {
+ list_splice_init(&basechain->hook_list, &hook->list);
return -EOPNOTSUPP;
+ }
flow_block_init(&basechain->flow_block);
@@ -2088,18 +2199,19 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
struct nft_base_chain *basechain;
- struct nft_stats __percpu *stats;
struct net *net = ctx->net;
char name[NFT_NAME_MAXLEN];
+ struct nft_rule_blob *blob;
struct nft_trans *trans;
struct nft_chain *chain;
- struct nft_rule **rules;
+ unsigned int data_size;
int err;
if (table->use == UINT_MAX)
return -EOVERFLOW;
if (nla[NFTA_CHAIN_HOOK]) {
+ struct nft_stats __percpu *stats = NULL;
struct nft_chain_hook hook;
if (flags & NFT_CHAIN_BINDING)
@@ -2110,7 +2222,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
if (err < 0)
return err;
- basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+ basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
if (basechain == NULL) {
nft_chain_release_hook(&hook);
return -ENOMEM;
@@ -2125,22 +2237,24 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
return PTR_ERR(stats);
}
rcu_assign_pointer(basechain->stats, stats);
- static_branch_inc(&nft_counters_enabled);
}
err = nft_basechain_init(basechain, family, &hook, flags);
if (err < 0) {
nft_chain_release_hook(&hook);
kfree(basechain);
+ free_percpu(stats);
return err;
}
+ if (stats)
+ static_branch_inc(&nft_counters_enabled);
} else {
if (flags & NFT_CHAIN_BASE)
return -EINVAL;
if (flags & NFT_CHAIN_HW_OFFLOAD)
return -EOPNOTSUPP;
- chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
if (chain == NULL)
return -ENOMEM;
@@ -2153,7 +2267,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
chain->table = table;
if (nla[NFTA_CHAIN_NAME]) {
- chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
} else {
if (!(flags & NFT_CHAIN_BINDING)) {
err = -EINVAL;
@@ -2161,7 +2275,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
}
snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
- chain->name = kstrdup(name, GFP_KERNEL);
+ chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
}
if (!chain->name) {
@@ -2170,7 +2284,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
}
if (nla[NFTA_CHAIN_USERDATA]) {
- chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL);
+ chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
if (chain->udata == NULL) {
err = -ENOMEM;
goto err_destroy_chain;
@@ -2178,15 +2292,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
}
- rules = nf_tables_chain_alloc_rules(chain, 0);
- if (!rules) {
+ data_size = offsetof(struct nft_rule_dp, data); /* last rule */
+ blob = nf_tables_chain_alloc_rules(data_size);
+ if (!blob) {
err = -ENOMEM;
goto err_destroy_chain;
}
- *rules = NULL;
- rcu_assign_pointer(chain->rules_gen_0, rules);
- rcu_assign_pointer(chain->rules_gen_1, rules);
+ RCU_INIT_POINTER(chain->blob_gen_0, blob);
+ RCU_INIT_POINTER(chain->blob_gen_1, blob);
err = nf_tables_register_hook(net, table, chain);
if (err < 0)
@@ -2333,7 +2447,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
char *name;
err = -ENOMEM;
- name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
if (!name)
goto err;
@@ -2362,6 +2476,7 @@ err:
}
static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla)
{
struct nftables_pernet *nft_net = nft_pernet(net);
@@ -2372,6 +2487,7 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
struct nft_chain *chain = trans->ctx.chain;
if (trans->msg_type == NFT_MSG_NEWCHAIN &&
+ chain->table == table &&
id == nft_trans_chain_id(trans))
return chain;
}
@@ -2461,6 +2577,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
if (chain != NULL) {
+ if (chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
+
if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
@@ -2778,27 +2897,31 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
err = nf_tables_expr_parse(ctx, nla, &expr_info);
if (err < 0)
- goto err1;
+ goto err_expr_parse;
+
+ err = -EOPNOTSUPP;
+ if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
+ goto err_expr_stateful;
err = -ENOMEM;
- expr = kzalloc(expr_info.ops->size, GFP_KERNEL);
+ expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
if (expr == NULL)
- goto err2;
+ goto err_expr_stateful;
err = nf_tables_newexpr(ctx, &expr_info, expr);
if (err < 0)
- goto err3;
+ goto err_expr_new;
return expr;
-err3:
+err_expr_new:
kfree(expr);
-err2:
+err_expr_stateful:
owner = expr_info.ops->type->owner;
if (expr_info.ops->type->release_ops)
expr_info.ops->type->release_ops(expr_info.ops);
module_put(owner);
-err1:
+err_expr_parse:
return ERR_PTR(err);
}
@@ -3032,7 +3155,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = nft_net->base_seq;
+ cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -3226,6 +3349,8 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
if (err < 0)
return err;
}
+
+ cond_resched();
}
return 0;
@@ -3255,6 +3380,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
}
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla);
#define NFT_RULE_MAXEXPRS 128
@@ -3301,7 +3427,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
return -EOPNOTSUPP;
} else if (nla[NFTA_RULE_CHAIN_ID]) {
- chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]);
+ chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
return PTR_ERR(chain);
@@ -3343,7 +3469,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
return PTR_ERR(old_rule);
}
} else if (nla[NFTA_RULE_POSITION_ID]) {
- old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]);
+ old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
return PTR_ERR(old_rule);
@@ -3389,7 +3515,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
}
err = -ENOMEM;
- rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
+ rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
if (rule == NULL)
goto err_release_expr;
@@ -3488,6 +3614,7 @@ err_release_expr:
}
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla)
{
struct nftables_pernet *nft_net = nft_pernet(net);
@@ -3498,6 +3625,7 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
struct nft_rule *rule = nft_trans_rule(trans);
if (trans->msg_type == NFT_MSG_NEWRULE &&
+ trans->ctx.chain == chain &&
id == nft_trans_rule_id(trans))
return rule;
}
@@ -3547,7 +3675,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
err = nft_delrule(&ctx, rule);
} else if (nla[NFTA_RULE_ID]) {
- rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
+ rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
return PTR_ERR(rule);
@@ -3726,6 +3854,7 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
}
static struct nft_set *nft_set_lookup_byid(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
struct nftables_pernet *nft_net = nft_pernet(net);
@@ -3737,6 +3866,7 @@ static struct nft_set *nft_set_lookup_byid(const struct net *net,
struct nft_set *set = nft_trans_set(trans);
if (id == nft_trans_set_id(trans) &&
+ set->table == table &&
nft_active_genmask(set, genmask))
return set;
}
@@ -3757,7 +3887,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
if (!nla_set_id)
return set;
- set = nft_set_lookup_byid(net, nla_set_id, genmask);
+ set = nft_set_lookup_byid(net, table, nla_set_id, genmask);
}
return set;
}
@@ -3783,7 +3913,7 @@ cont:
list_for_each_entry(i, &ctx->table->sets, list) {
int tmp;
- if (!nft_is_active_next(ctx->net, set))
+ if (!nft_is_active_next(ctx->net, i))
continue;
if (!sscanf(i->name, name, &tmp))
continue;
@@ -3802,7 +3932,7 @@ cont:
free_page((unsigned long)inuse);
}
- set->name = kasprintf(GFP_KERNEL, name, min + n);
+ set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
if (!set->name)
return -ENOMEM;
@@ -4009,7 +4139,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = nft_net->base_seq;
+ cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC &&
@@ -4147,6 +4277,9 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
u32 len;
int err;
+ if (desc->field_count >= ARRAY_SIZE(desc->field_len))
+ return -E2BIG;
+
err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
nft_concat_policy, NULL);
if (err < 0)
@@ -4156,9 +4289,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
return -EINVAL;
len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
-
- if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT)
- return -E2BIG;
+ if (!len || len > U8_MAX)
+ return -EINVAL;
desc->field_len[desc->field_count++] = len;
@@ -4169,7 +4301,8 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
const struct nlattr *nla)
{
struct nlattr *attr;
- int rem, err;
+ u32 num_regs = 0;
+ int rem, err, i;
nla_for_each_nested(attr, nla, rem) {
if (nla_type(attr) != NFTA_LIST_ELEM)
@@ -4180,6 +4313,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
return err;
}
+ for (i = 0; i < desc->field_count; i++)
+ num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32));
+
+ if (num_regs > NFT_REG32_COUNT)
+ return -E2BIG;
+
return 0;
}
@@ -4318,6 +4457,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
if (err < 0)
return err;
+
+ if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT))
+ return -EINVAL;
+ } else if (flags & NFT_SET_CONCAT) {
+ return -EINVAL;
}
if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
@@ -4366,11 +4510,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
alloc_size = sizeof(*set) + size + udlen;
if (alloc_size < size || alloc_size > INT_MAX)
return -ENOMEM;
- set = kvzalloc(alloc_size, GFP_KERNEL);
+ set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
if (!set)
return -ENOMEM;
- name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
if (!name) {
err = -ENOMEM;
goto err_set_name;
@@ -4481,12 +4625,12 @@ struct nft_set_elem_catchall {
static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
struct nft_set *set)
{
- struct nft_set_elem_catchall *catchall;
+ struct nft_set_elem_catchall *next, *catchall;
- list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
list_del_rcu(&catchall->list);
nft_set_elem_destroy(set, catchall->elem, true);
- kfree_rcu(catchall);
+ kfree_rcu(catchall, rcu);
}
}
@@ -4928,6 +5072,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
+ cb->seq = READ_ONCE(nft_net->base_seq);
+
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
dump_ctx->ctx.family != table->family)
@@ -5063,6 +5209,9 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
if (!(set->flags & NFT_SET_INTERVAL) &&
*flags & NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
+ if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) ==
+ (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
return 0;
}
@@ -5070,19 +5219,13 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data *key, struct nlattr *attr)
{
- struct nft_data_desc desc;
- int err;
-
- err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr);
- if (err < 0)
- return err;
-
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) {
- nft_data_release(key, desc.type);
- return -EINVAL;
- }
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = NFT_DATA_VALUE_MAXLEN,
+ .len = set->klen,
+ };
- return 0;
+ return nft_data_init(ctx, key, &desc, attr);
}
static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
@@ -5090,18 +5233,19 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data *data,
struct nlattr *attr)
{
- int err;
+ u32 dtype;
- err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr);
- if (err < 0)
- return err;
+ if (set->dtype == NFT_DATA_VERDICT)
+ dtype = NFT_DATA_VERDICT;
+ else
+ dtype = NFT_DATA_VALUE;
- if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) {
- nft_data_release(data, desc->type);
- return -EINVAL;
- }
+ desc->type = dtype;
+ desc->size = NFT_DATA_VALUE_MAXLEN;
+ desc->len = set->dlen;
+ desc->flags = NFT_DATA_DESC_SETELEM;
- return 0;
+ return nft_data_init(ctx, data, desc, attr);
}
static void *nft_setelem_catchall_get(const struct net *net,
@@ -5249,8 +5393,10 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_get_set_elem(&ctx, set, attr);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
break;
+ }
}
return err;
@@ -5318,9 +5464,6 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
return expr;
err = -EOPNOTSUPP;
- if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
- goto err_set_elem_expr;
-
if (expr->ops->type->flags & NFT_EXPR_GC) {
if (set->flags & NFT_SET_TIMEOUT)
goto err_set_elem_expr;
@@ -5336,6 +5479,27 @@ err_set_elem_expr:
return ERR_PTR(err);
}
+static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len)
+{
+ len += nft_set_ext_types[id].len;
+ if (len > tmpl->ext_len[id] ||
+ len > U8_MAX)
+ return -1;
+
+ return 0;
+}
+
+static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
+ void *to, const void *from, u32 len)
+{
+ if (nft_set_ext_check(tmpl, id, len) < 0)
+ return -1;
+
+ memcpy(to, from, len);
+
+ return 0;
+}
+
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *key_end,
@@ -5346,17 +5510,26 @@ void *nft_set_elem_init(const struct nft_set *set,
elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
if (elem == NULL)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ext = nft_set_elem_ext(set, elem);
nft_set_ext_init(ext, tmpl);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY))
- memcpy(nft_set_ext_key(ext), key, set->klen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
- memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- memcpy(nft_set_ext_data(ext), data, set->dlen);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY,
+ nft_set_ext_key(ext), key, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END,
+ nft_set_ext_key_end(ext), key_end, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA,
+ nft_set_ext_data(ext), data, set->dlen) < 0)
+ goto err_ext_check;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
*nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
if (expiration == 0)
@@ -5366,6 +5539,11 @@ void *nft_set_elem_init(const struct nft_set *set,
*nft_set_ext_timeout(ext) = timeout;
return elem;
+
+err_ext_check:
+ kfree(elem);
+
+ return ERR_PTR(-EINVAL);
}
static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
@@ -5431,13 +5609,13 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
int err, i, k;
for (i = 0; i < set->num_exprs; i++) {
- expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL);
+ expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
if (!expr)
goto err_expr;
err = nft_expr_clone(expr, set->exprs[i]);
if (err < 0) {
- nft_expr_destroy(ctx, expr);
+ kfree(expr);
goto err_expr;
}
expr_array[i] = expr;
@@ -5453,14 +5631,25 @@ err_expr:
}
static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
+ const struct nft_set_ext_tmpl *tmpl,
const struct nft_set_ext *ext,
struct nft_expr *expr_array[],
u32 num_exprs)
{
struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ u32 len = sizeof(struct nft_set_elem_expr);
struct nft_expr *expr;
int i, err;
+ if (num_exprs == 0)
+ return 0;
+
+ for (i = 0; i < num_exprs; i++)
+ len += expr_array[i]->ops->size;
+
+ if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0)
+ return -EINVAL;
+
for (i = 0; i < num_exprs; i++) {
expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
err = nft_expr_clone(expr, expr_array[i]);
@@ -5653,7 +5842,7 @@ static void nft_setelem_catchall_remove(const struct net *net,
list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
if (catchall->elem == elem->priv) {
list_del_rcu(&catchall->list);
- kfree_rcu(catchall);
+ kfree_rcu(catchall, rcu);
break;
}
}
@@ -5669,6 +5858,25 @@ static void nft_setelem_remove(const struct net *net,
set->ops->remove(net, set, elem);
}
+static bool nft_setelem_valid_key_end(const struct nft_set *set,
+ struct nlattr **nla, u32 flags)
+{
+ if ((set->flags & (NFT_SET_CONCAT | NFT_SET_INTERVAL)) ==
+ (NFT_SET_CONCAT | NFT_SET_INTERVAL)) {
+ if (flags & NFT_SET_ELEM_INTERVAL_END)
+ return false;
+
+ if (nla[NFTA_SET_ELEM_KEY_END] &&
+ flags & NFT_SET_ELEM_CATCHALL)
+ return false;
+ } else {
+ if (nla[NFTA_SET_ELEM_KEY_END])
+ return false;
+ }
+
+ return true;
+}
+
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
@@ -5704,8 +5912,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
return -EINVAL;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
@@ -5716,6 +5927,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return -EINVAL;
}
+ if (set->flags & NFT_SET_OBJECT) {
+ if (!nla[NFTA_SET_ELEM_OBJREF] &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END))
+ return -EINVAL;
+ } else {
+ if (nla[NFTA_SET_ELEM_OBJREF])
+ return -EINVAL;
+ }
+
+ if (!nft_setelem_valid_key_end(set, nla, flags))
+ return -EINVAL;
+
if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
(nla[NFTA_SET_ELEM_DATA] ||
nla[NFTA_SET_ELEM_OBJREF] ||
@@ -5723,6 +5946,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nla[NFTA_SET_ELEM_EXPIRATION] ||
nla[NFTA_SET_ELEM_USERDATA] ||
nla[NFTA_SET_ELEM_EXPR] ||
+ nla[NFTA_SET_ELEM_KEY_END] ||
nla[NFTA_SET_ELEM_EXPRESSIONS]))
return -EINVAL;
@@ -5814,7 +6038,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
goto err_set_elem_expr;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto err_parse_key;
}
if (nla[NFTA_SET_ELEM_KEY_END]) {
@@ -5823,29 +6049,34 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
goto err_parse_key;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto err_parse_key_end;
}
if (timeout > 0) {
- nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
- if (timeout != set->timeout)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
+ if (err < 0)
+ goto err_parse_key_end;
+
+ if (timeout != set->timeout) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ if (err < 0)
+ goto err_parse_key_end;
+ }
}
if (num_exprs) {
for (i = 0; i < num_exprs; i++)
size += expr_array[i]->ops->size;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
- sizeof(struct nft_set_elem_expr) +
- size);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+ if (err < 0)
+ goto err_parse_key_end;
}
if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
- if (!(set->flags & NFT_SET_OBJECT)) {
- err = -EINVAL;
- goto err_parse_key_end;
- }
obj = nft_obj_lookup(ctx->net, ctx->table,
nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask);
@@ -5853,7 +6084,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = PTR_ERR(obj);
goto err_parse_key_end;
}
- nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ if (err < 0)
+ goto err_parse_key_end;
}
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
@@ -5887,7 +6120,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
NFT_VALIDATE_NEED);
}
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+ if (err < 0)
+ goto err_parse_data;
}
/* The full maximum length of userdata can exceed the maximum
@@ -5897,22 +6132,31 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
ulen = 0;
if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
- if (ulen > 0)
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
- ulen);
+ if (ulen > 0) {
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
+ ulen);
+ if (err < 0)
+ goto err_parse_data;
+ }
}
- err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, elem.data.val.data,
- timeout, expiration, GFP_KERNEL);
- if (elem.priv == NULL)
+ timeout, expiration, GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
goto err_parse_data;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;
+
if (ulen > 0) {
+ if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
+ err = -EINVAL;
+ goto err_elem_userdata;
+ }
udata = nft_set_ext_userdata(ext);
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
@@ -5921,14 +6165,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
*nft_set_ext_obj(ext) = obj;
obj->use++;
}
- err = nft_set_elem_expr_setup(ctx, ext, expr_array, num_exprs);
+ err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
if (err < 0)
- goto err_elem_expr;
+ goto err_elem_free;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL) {
err = -ENOMEM;
- goto err_elem_expr;
+ goto err_elem_free;
}
ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
@@ -5974,10 +6218,10 @@ err_set_full:
nft_setelem_remove(ctx->net, set, &elem);
err_element_clash:
kfree(trans);
-err_elem_expr:
+err_elem_free:
if (obj)
obj->use--;
-
+err_elem_userdata:
nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
@@ -6030,8 +6274,10 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return err;
+ }
}
if (nft_net->validate_state == NFT_VALIDATE_DO)
@@ -6123,10 +6369,16 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
return -EINVAL;
+ if (!nft_setelem_valid_key_end(set, nla, flags))
+ return -EINVAL;
+
nft_set_ext_prepare(&tmpl);
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
if (nla[NFTA_SET_ELEM_KEY]) {
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
@@ -6134,24 +6386,30 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto fail_elem;
}
if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
nla[NFTA_SET_ELEM_KEY_END]);
if (err < 0)
- return err;
+ goto fail_elem;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto fail_elem_key_end;
}
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, NULL, 0, 0,
- GFP_KERNEL);
- if (elem.priv == NULL)
- goto fail_elem;
+ GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
+ goto fail_elem_key_end;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
@@ -6175,6 +6433,8 @@ fail_ops:
kfree(trans);
fail_trans:
kfree(elem.priv);
+fail_elem_key_end:
+ nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
fail_elem:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
return err;
@@ -6301,8 +6561,10 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
break;
+ }
}
return err;
}
@@ -6461,7 +6723,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
}
err = -ENOMEM;
- obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
+ obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
if (!obj)
goto err2;
@@ -6535,12 +6797,15 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
{
struct nft_object *newobj;
struct nft_trans *trans;
- int err;
+ int err = -ENOMEM;
+
+ if (!try_module_get(type->owner))
+ return -ENOENT;
trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
sizeof(struct nft_trans_obj));
if (!trans)
- return -ENOMEM;
+ goto err_trans;
newobj = nft_obj_init(ctx, type, attr);
if (IS_ERR(newobj)) {
@@ -6557,6 +6822,8 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
err_free_trans:
kfree(trans);
+err_trans:
+ module_put(type->owner);
return err;
}
@@ -6622,7 +6889,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
obj->key.table = table;
obj->handle = nf_tables_alloc_handle(table);
- obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
+ obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
if (!obj->key.name) {
err = -ENOMEM;
goto err_strdup;
@@ -6721,7 +6988,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = nft_net->base_seq;
+ cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -7191,13 +7458,25 @@ static void nft_unregister_flowtable_hook(struct net *net,
FLOW_BLOCK_UNBIND);
}
-static void nft_unregister_flowtable_net_hooks(struct net *net,
- struct list_head *hook_list)
+static void __nft_unregister_flowtable_net_hooks(struct net *net,
+ struct list_head *hook_list,
+ bool release_netdev)
{
- struct nft_hook *hook;
+ struct nft_hook *hook, *next;
- list_for_each_entry(hook, hook_list, list)
+ list_for_each_entry_safe(hook, next, hook_list, list) {
nf_unregister_net_hook(net, &hook->ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+ }
+}
+
+static void nft_unregister_flowtable_net_hooks(struct net *net,
+ struct list_head *hook_list)
+{
+ __nft_unregister_flowtable_net_hooks(net, hook_list, false);
}
static int nft_register_flowtable_net_hooks(struct net *net,
@@ -7290,11 +7569,15 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
if (nla[NFTA_FLOWTABLE_FLAGS]) {
flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
- if (flags & ~NFT_FLOWTABLE_MASK)
- return -EOPNOTSUPP;
+ if (flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
- (flags & NFT_FLOWTABLE_HW_OFFLOAD))
- return -EOPNOTSUPP;
+ (flags & NFT_FLOWTABLE_HW_OFFLOAD)) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
} else {
flags = flowtable->data.flags;
}
@@ -7383,7 +7666,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
+ flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
if (!flowtable)
return -ENOMEM;
@@ -7391,7 +7674,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
flowtable->handle = nf_tables_alloc_handle(table);
INIT_LIST_HEAD(&flowtable->hook_list);
- flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+ flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
if (!flowtable->name) {
err = -ENOMEM;
goto err1;
@@ -7475,6 +7758,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
{
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
+ LIST_HEAD(flowtable_del_list);
struct nft_hook *this, *hook;
struct nft_trans *trans;
int err;
@@ -7490,7 +7774,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
err = -ENOENT;
goto err_flowtable_del_hook;
}
- hook->inactive = true;
+ list_move(&hook->list, &flowtable_del_list);
}
trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE,
@@ -7503,6 +7787,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
nft_trans_flowtable(trans) = flowtable;
nft_trans_flowtable_update(trans) = true;
INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
+ list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans));
nft_flowtable_hook_release(&flowtable_hook);
nft_trans_commit_list_add_tail(ctx->net, trans);
@@ -7510,13 +7795,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx,
return 0;
err_flowtable_del_hook:
- list_for_each_entry(this, &flowtable_hook.list, list) {
- hook = nft_hook_list_find(&flowtable->hook_list, this);
- if (!hook)
- break;
-
- hook->inactive = false;
- }
+ list_splice(&flowtable_del_list, &flowtable->hook_list);
nft_flowtable_hook_release(&flowtable_hook);
return err;
@@ -7641,7 +7920,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = nft_net->base_seq;
+ cb->seq = READ_ONCE(nft_net->base_seq);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -8169,7 +8448,7 @@ static void nft_obj_commit_update(struct nft_trans *trans)
if (obj->ops->update)
obj->ops->update(obj, newobj);
- kfree(newobj);
+ nft_obj_destroy(&trans->ctx, newobj);
}
static void nft_commit_release(struct nft_trans *trans)
@@ -8239,34 +8518,87 @@ void nf_tables_trans_destroy_flush_work(void)
}
EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
+static bool nft_expr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ return false;
+}
+
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
{
+ const struct nft_expr *expr, *last;
+ struct nft_regs_track track = {};
+ unsigned int size, data_size;
+ void *data, *data_boundary;
+ struct nft_rule_dp *prule;
struct nft_rule *rule;
- unsigned int alloc = 0;
- int i;
/* already handled or inactive chain? */
- if (chain->rules_next || !nft_is_active_next(net, chain))
+ if (chain->blob_next || !nft_is_active_next(net, chain))
return 0;
- rule = list_entry(&chain->rules, struct nft_rule, list);
- i = 0;
-
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- alloc++;
+ data_size = 0;
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule)) {
+ data_size += sizeof(*prule) + rule->dlen;
+ if (data_size > INT_MAX)
+ return -ENOMEM;
+ }
}
+ data_size += offsetof(struct nft_rule_dp, data); /* last rule */
- chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
- if (!chain->rules_next)
+ chain->blob_next = nf_tables_chain_alloc_rules(data_size);
+ if (!chain->blob_next)
return -ENOMEM;
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- chain->rules_next[i++] = rule;
+ data = (void *)chain->blob_next->data;
+ data_boundary = data + data_size;
+ size = 0;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (!nft_is_active_next(net, rule))
+ continue;
+
+ prule = (struct nft_rule_dp *)data;
+ data += offsetof(struct nft_rule_dp, data);
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ size = 0;
+ track.last = nft_expr_last(rule);
+ nft_rule_for_each_expr(expr, last, rule) {
+ track.cur = expr;
+
+ if (nft_expr_reduce(&track, expr)) {
+ expr = track.cur;
+ continue;
+ }
+
+ if (WARN_ON_ONCE(data + expr->ops->size > data_boundary))
+ return -ENOMEM;
+
+ memcpy(data + size, expr, expr->ops->size);
+ size += expr->ops->size;
+ }
+ if (WARN_ON_ONCE(size >= 1 << 12))
+ return -ENOMEM;
+
+ prule->handle = rule->handle;
+ prule->dlen = size;
+ prule->is_last = 0;
+
+ data += size;
+ size = 0;
+ chain->blob_next->size += (unsigned long)(data - (void *)prule);
}
- chain->rules_next[i] = NULL;
+ prule = (struct nft_rule_dp *)data;
+ data += offsetof(struct nft_rule_dp, data);
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ nft_last_rule(chain->blob_next, prule);
+
return 0;
}
@@ -8280,8 +8612,8 @@ static void nf_tables_commit_chain_prepare_cancel(struct net *net)
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
- kvfree(chain->rules_next);
- chain->rules_next = NULL;
+ kvfree(chain->blob_next);
+ chain->blob_next = NULL;
}
}
}
@@ -8290,38 +8622,34 @@ static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
{
struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
- kvfree(o->start);
+ kvfree(o->blob);
}
-static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob)
{
- struct nft_rule **r = rules;
struct nft_rules_old *old;
- while (*r)
- r++;
-
- r++; /* rcu_head is after end marker */
- old = (void *) r;
- old->start = rules;
+ /* rcu_head is after end marker */
+ old = (void *)blob + sizeof(*blob) + blob->size;
+ old->blob = blob;
call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
}
static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
{
- struct nft_rule **g0, **g1;
+ struct nft_rule_blob *g0, *g1;
bool next_genbit;
next_genbit = nft_gencursor_next(net);
- g0 = rcu_dereference_protected(chain->rules_gen_0,
+ g0 = rcu_dereference_protected(chain->blob_gen_0,
lockdep_commit_lock_is_held(net));
- g1 = rcu_dereference_protected(chain->rules_gen_1,
+ g1 = rcu_dereference_protected(chain->blob_gen_1,
lockdep_commit_lock_is_held(net));
/* No changes to this chain? */
- if (chain->rules_next == NULL) {
+ if (chain->blob_next == NULL) {
/* chain had no change in last or next generation */
if (g0 == g1)
return;
@@ -8330,10 +8658,10 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
* one uses same rules as current generation.
*/
if (next_genbit) {
- rcu_assign_pointer(chain->rules_gen_1, g0);
+ rcu_assign_pointer(chain->blob_gen_1, g0);
nf_tables_commit_chain_free_rules_old(g1);
} else {
- rcu_assign_pointer(chain->rules_gen_0, g1);
+ rcu_assign_pointer(chain->blob_gen_0, g1);
nf_tables_commit_chain_free_rules_old(g0);
}
@@ -8341,11 +8669,11 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
}
if (next_genbit)
- rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_1, chain->blob_next);
else
- rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_0, chain->blob_next);
- chain->rules_next = NULL;
+ chain->blob_next = NULL;
if (g0 == g1)
return;
@@ -8371,17 +8699,6 @@ void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
-static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable,
- struct list_head *hook_list)
-{
- struct nft_hook *hook, *next;
-
- list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- if (hook->inactive)
- list_move(&hook->list, hook_list);
- }
-}
-
static void nf_tables_module_autoload_cleanup(struct net *net)
{
struct nftables_pernet *nft_net = nft_pernet(net);
@@ -8533,6 +8850,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
+ unsigned int base_seq;
LIST_HEAD(adl);
int err;
@@ -8582,9 +8900,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point.
*/
- while (++nft_net->base_seq == 0)
+ base_seq = READ_ONCE(nft_net->base_seq);
+ while (++base_seq == 0)
;
+ WRITE_ONCE(nft_net->base_seq, base_seq);
+
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
@@ -8636,6 +8957,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_rule_notify(&trans->ctx,
nft_trans_rule(trans),
NFT_MSG_NEWRULE);
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_DELRULE:
@@ -8646,6 +8970,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_rule_expr_deactivate(&trans->ctx,
nft_trans_rule(trans),
NFT_TRANS_COMMIT);
+
+ if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_NEWSET:
nft_clear(net, nft_trans_set(trans));
@@ -8726,8 +9053,6 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
break;
case NFT_MSG_DELFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
- nft_flowtable_hooks_del(nft_trans_flowtable(trans),
- &nft_trans_flowtable_hooks(trans));
nf_tables_flowtable_notify(&trans->ctx,
nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans),
@@ -8808,7 +9133,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
- struct nft_hook *hook;
if (action == NFNL_ABORT_VALIDATE &&
nf_tables_validate(net) < 0)
@@ -8914,7 +9238,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
break;
case NFT_MSG_NEWOBJ:
if (nft_trans_obj_update(trans)) {
- kfree(nft_trans_obj_newobj(trans));
+ nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans));
nft_trans_destroy(trans);
} else {
trans->ctx.table->use--;
@@ -8939,8 +9263,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
break;
case NFT_MSG_DELFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
- list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list)
- hook->inactive = false;
+ list_splice(&nft_trans_flowtable_hooks(trans),
+ &nft_trans_flowtable(trans)->hook_list);
} else {
trans->ctx.table->use++;
nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
@@ -9203,17 +9527,23 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
}
EXPORT_SYMBOL_GPL(nft_parse_u32_check);
-static unsigned int nft_parse_register(const struct nlattr *attr)
+static int nft_parse_register(const struct nlattr *attr, u32 *preg)
{
unsigned int reg;
reg = ntohl(nla_get_be32(attr));
switch (reg) {
case NFT_REG_VERDICT...NFT_REG_4:
- return reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ break;
+ case NFT_REG32_00...NFT_REG32_15:
+ *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ break;
default:
- return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ return -ERANGE;
}
+
+ return 0;
}
/**
@@ -9255,7 +9585,10 @@ int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len)
u32 reg;
int err;
- reg = nft_parse_register(attr);
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
err = nft_validate_register_load(reg, len);
if (err < 0)
return err;
@@ -9310,7 +9643,10 @@ int nft_parse_register_store(const struct nft_ctx *ctx,
int err;
u32 reg;
- reg = nft_parse_register(attr);
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
err = nft_validate_register_store(ctx, reg, data, type, len);
if (err < 0)
return err;
@@ -9366,7 +9702,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
tb[NFTA_VERDICT_CHAIN],
genmask);
} else if (tb[NFTA_VERDICT_CHAIN_ID]) {
- chain = nft_chain_lookup_byid(ctx->net,
+ chain = nft_chain_lookup_byid(ctx->net, ctx->table,
tb[NFTA_VERDICT_CHAIN_ID]);
if (IS_ERR(chain))
return PTR_ERR(chain);
@@ -9378,6 +9714,11 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
return -EOPNOTSUPP;
+ if (nft_chain_is_bound(chain))
+ return -EINVAL;
+ if (desc->flags & NFT_DATA_DESC_SETELEM &&
+ chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
chain->use++;
data->verdict.chain = chain;
@@ -9385,7 +9726,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
}
desc->len = sizeof(data->verdict);
- desc->type = NFT_DATA_VERDICT;
+
return 0;
}
@@ -9438,20 +9779,25 @@ nla_put_failure:
}
static int nft_value_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
- struct nft_data_desc *desc, const struct nlattr *nla)
+ struct nft_data *data, struct nft_data_desc *desc,
+ const struct nlattr *nla)
{
unsigned int len;
len = nla_len(nla);
if (len == 0)
return -EINVAL;
- if (len > size)
+ if (len > desc->size)
return -EOVERFLOW;
+ if (desc->len) {
+ if (len != desc->len)
+ return -EINVAL;
+ } else {
+ desc->len = len;
+ }
nla_memcpy(data->data, nla, len);
- desc->type = NFT_DATA_VALUE;
- desc->len = len;
+
return 0;
}
@@ -9471,7 +9817,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
*
* @ctx: context of the expression using the data
* @data: destination struct nft_data
- * @size: maximum data length
* @desc: data description
* @nla: netlink attribute containing data
*
@@ -9481,24 +9826,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
* The caller can indicate that it only wants to accept data of type
* NFT_DATA_VALUE by passing NULL for the ctx argument.
*/
-int nft_data_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla)
{
struct nlattr *tb[NFTA_DATA_MAX + 1];
int err;
+ if (WARN_ON_ONCE(!desc->size))
+ return -EINVAL;
+
err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
nft_data_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_DATA_VALUE])
- return nft_value_init(ctx, data, size, desc,
- tb[NFTA_DATA_VALUE]);
- if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
- return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
- return -EINVAL;
+ if (tb[NFTA_DATA_VALUE]) {
+ if (desc->type != NFT_DATA_VALUE)
+ return -EINVAL;
+
+ err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) {
+ if (desc->type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ } else {
+ err = -EINVAL;
+ }
+
+ return err;
}
EXPORT_SYMBOL_GPL(nft_data_init);
@@ -9574,10 +9930,14 @@ EXPORT_SYMBOL_GPL(__nft_release_basechain);
static void __nft_release_hook(struct net *net, struct nft_table *table)
{
+ struct nft_flowtable *flowtable;
struct nft_chain *chain;
list_for_each_entry(chain, &table->chains, list)
- nf_tables_unregister_hook(net, table, chain);
+ __nf_tables_unregister_hook(net, table, chain, true);
+ list_for_each_entry(flowtable, &table->flowtables, list)
+ __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list,
+ true);
}
static void __nft_release_hooks(struct net *net)
@@ -9670,6 +10030,8 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
nft_net = nft_pernet(net);
deleted = 0;
mutex_lock(&nft_net->commit_mutex);
+ if (!list_empty(&nf_tables_destroy_list))
+ rcu_barrier();
again:
list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_has_owner(table) &&
@@ -9716,7 +10078,11 @@ static int __net_init nf_tables_init_net(struct net *net)
static void __net_exit nf_tables_pre_exit_net(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ mutex_lock(&nft_net->commit_mutex);
__nft_release_hooks(net);
+ mutex_unlock(&nft_net->commit_mutex);
}
static void __net_exit nf_tables_exit_net(struct net *net)
@@ -9724,7 +10090,8 @@ static void __net_exit nf_tables_exit_net(struct net *net)
struct nftables_pernet *nft_net = nft_pernet(net);
mutex_lock(&nft_net->commit_mutex);
- if (!list_empty(&nft_net->commit_list))
+ if (!list_empty(&nft_net->commit_list) ||
+ !list_empty(&nft_net->module_list))
__nf_tables_abort(net, NFNL_ABORT_NONE);
__nft_release_tables(net);
mutex_unlock(&nft_net->commit_mutex);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index adc348056076..cee3e4e905ec 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -25,9 +25,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
enum nft_trace_types type)
{
- const struct nft_pktinfo *pkt = info->pkt;
-
- if (!info->trace || !pkt->skb->nf_trace)
+ if (!info->trace || !info->nf_trace)
return;
info->chain = chain;
@@ -36,17 +34,28 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
nft_trace_notify(info);
}
-static inline void nft_trace_packet(struct nft_traceinfo *info,
+static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
const struct nft_chain *chain,
- const struct nft_rule *rule,
+ const struct nft_rule_dp *rule,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
+ info->nf_trace = pkt->skb->nf_trace;
info->rule = rule;
__nft_trace_packet(info, chain, type);
}
}
+static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info)
+{
+ if (static_branch_unlikely(&nft_trace_enabled)) {
+ if (info->trace)
+ info->nf_trace = pkt->skb->nf_trace;
+ }
+}
+
static void nft_bitwise_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs)
{
@@ -67,6 +76,57 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static void nft_cmp16_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ const u64 *reg_data = (const u64 *)&regs->data[priv->sreg];
+ const u64 *mask = (const u64 *)&priv->mask;
+ const u64 *data = (const u64 *)&priv->data;
+
+ if (((reg_data[0] & mask[0]) == data[0] &&
+ ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv)
+ return;
+ regs->verdict.code = NFT_BREAK;
+}
+
+static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
+ const struct nft_chain *chain,
+ const struct nft_regs *regs)
+{
+ enum nft_trace_types type;
+
+ switch (regs->verdict.code) {
+ case NFT_CONTINUE:
+ case NFT_RETURN:
+ type = NFT_TRACETYPE_RETURN;
+ break;
+ case NF_STOLEN:
+ type = NFT_TRACETYPE_RULE;
+ /* can't access skb->nf_trace; use copy */
+ break;
+ default:
+ type = NFT_TRACETYPE_RULE;
+
+ if (info->trace)
+ info->nf_trace = info->pkt->skb->nf_trace;
+ break;
+ }
+
+ __nft_trace_packet(info, chain, type);
+}
+
+static inline void nft_trace_verdict(struct nft_traceinfo *info,
+ const struct nft_chain *chain,
+ const struct nft_rule_dp *rule,
+ const struct nft_regs *regs)
+{
+ if (static_branch_unlikely(&nft_trace_enabled)) {
+ info->rule = rule;
+ __nft_trace_verdict(info, chain, regs);
+ }
+}
+
static bool nft_payload_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -110,7 +170,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
base_chain = nft_base_chain(chain);
- rcu_read_lock();
pstats = READ_ONCE(base_chain->stats);
if (pstats) {
local_bh_disable();
@@ -121,12 +180,12 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
u64_stats_update_end(&stats->syncp);
local_bh_enable();
}
- rcu_read_unlock();
}
struct nft_jumpstack {
- const struct nft_chain *chain;
- struct nft_rule *const *rules;
+ const struct nft_chain *chain;
+ const struct nft_rule_dp *rule;
+ const struct nft_rule_dp *last_rule;
};
static void expr_call_ops_eval(const struct nft_expr *expr,
@@ -141,6 +200,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr,
X(e, nft_payload_eval);
X(e, nft_cmp_eval);
+ X(e, nft_counter_eval);
X(e, nft_meta_get_eval);
X(e, nft_lookup_eval);
X(e, nft_range_eval);
@@ -154,18 +214,28 @@ static void expr_call_ops_eval(const struct nft_expr *expr,
expr->ops->eval(expr, regs, pkt);
}
+#define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0]
+#define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size
+#define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen]
+#define nft_rule_next(rule) (void *)rule + sizeof(*rule) + rule->dlen
+
+#define nft_rule_dp_for_each_expr(expr, last, rule) \
+ for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \
+ (expr) != (last); \
+ (expr) = nft_rule_expr_next(expr))
+
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
+ const struct nft_rule_dp *rule, *last_rule;
const struct net *net = nft_net(pkt);
- struct nft_rule *const *rules;
- const struct nft_rule *rule;
const struct nft_expr *expr, *last;
- struct nft_regs regs;
+ struct nft_regs regs = {};
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
bool genbit = READ_ONCE(net->nft.gencursor);
+ struct nft_rule_blob *blob;
struct nft_traceinfo info;
info.trace = false;
@@ -173,18 +243,20 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
nft_trace_init(&info, pkt, &regs.verdict, basechain);
do_chain:
if (genbit)
- rules = rcu_dereference(chain->rules_gen_1);
+ blob = rcu_dereference(chain->blob_gen_1);
else
- rules = rcu_dereference(chain->rules_gen_0);
+ blob = rcu_dereference(chain->blob_gen_0);
+ rule = (struct nft_rule_dp *)blob->data;
+ last_rule = (void *)blob->data + blob->size;
next_rule:
- rule = *rules;
regs.verdict.code = NFT_CONTINUE;
- for (; *rules ; rules++) {
- rule = *rules;
- nft_rule_for_each_expr(expr, last, rule) {
+ for (; rule < last_rule; rule = nft_rule_next(rule)) {
+ nft_rule_dp_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_cmp16_fast_ops)
+ nft_cmp16_fast_eval(expr, &regs);
else if (expr->ops == &nft_bitwise_fast_ops)
nft_bitwise_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
@@ -198,22 +270,23 @@ next_rule:
switch (regs.verdict.code) {
case NFT_BREAK:
regs.verdict.code = NFT_CONTINUE;
+ nft_trace_copy_nftrace(pkt, &info);
continue;
case NFT_CONTINUE:
- nft_trace_packet(&info, chain, rule,
+ nft_trace_packet(pkt, &info, chain, rule,
NFT_TRACETYPE_RULE);
continue;
}
break;
}
+ nft_trace_verdict(&info, chain, rule, &regs);
+
switch (regs.verdict.code & NF_VERDICT_MASK) {
case NF_ACCEPT:
case NF_DROP:
case NF_QUEUE:
case NF_STOLEN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
return regs.verdict.code;
}
@@ -222,32 +295,29 @@ next_rule:
if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
return NF_DROP;
jumpstack[stackptr].chain = chain;
- jumpstack[stackptr].rules = rules + 1;
+ jumpstack[stackptr].rule = nft_rule_next(rule);
+ jumpstack[stackptr].last_rule = last_rule;
stackptr++;
fallthrough;
case NFT_GOTO:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
-
chain = regs.verdict.chain;
goto do_chain;
case NFT_CONTINUE:
case NFT_RETURN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RETURN);
break;
default:
- WARN_ON(1);
+ WARN_ON_ONCE(1);
}
if (stackptr > 0) {
stackptr--;
chain = jumpstack[stackptr].chain;
- rules = jumpstack[stackptr].rules;
+ rule = jumpstack[stackptr].rule;
+ last_rule = jumpstack[stackptr].last_rule;
goto next_rule;
}
- nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
+ nft_trace_packet(pkt, &info, basechain, NULL, NFT_TRACETYPE_POLICY);
if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);
@@ -269,18 +339,22 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_rt_type,
&nft_exthdr_type,
&nft_last_type,
+ &nft_counter_type,
};
static struct nft_object_type *nft_basic_objects[] = {
#ifdef CONFIG_NETWORK_SECMARK
&nft_secmark_obj_type,
#endif
+ &nft_counter_obj_type,
};
int __init nf_tables_core_module_init(void)
{
int err, i, j = 0;
+ nft_counter_init_seqcount();
+
for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) {
err = nft_register_obj(nft_basic_objects[i]);
if (err)
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 9656c1646222..910ef881c3b8 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -94,7 +94,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
expr = nft_expr_first(rule);
while (nft_expr_more(rule, expr)) {
- if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION)
+ if (expr->ops->offload_action &&
+ expr->ops->offload_action(expr))
num_actions++;
expr = nft_expr_next(expr);
@@ -207,7 +208,7 @@ static int nft_setup_cb_call(enum tc_setup_type type, void *type_data,
return 0;
}
-int nft_chain_offload_priority(struct nft_base_chain *basechain)
+static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
{
if (basechain->ops.priority <= 0 ||
basechain->ops.priority > USHRT_MAX)
@@ -216,6 +217,27 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain)
return 0;
}
+bool nft_chain_offload_support(const struct nft_base_chain *basechain)
+{
+ struct net_device *dev;
+ struct nft_hook *hook;
+
+ if (nft_chain_offload_priority(basechain) < 0)
+ return false;
+
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (hook->ops.pf != NFPROTO_NETDEV ||
+ hook->ops.hooknum != NF_NETDEV_INGRESS)
+ return false;
+
+ dev = hook->ops.dev;
+ if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists())
+ return false;
+ }
+
+ return true;
+}
+
static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
const struct nft_base_chain *basechain,
const struct nft_rule *rule,
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 84a7dea46efa..1163ba9c1401 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -7,7 +7,7 @@
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/hash.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/skbuff.h>
@@ -25,22 +25,6 @@
DEFINE_STATIC_KEY_FALSE(nft_trace_enabled);
EXPORT_SYMBOL_GPL(nft_trace_enabled);
-static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb)
-{
- __be32 id;
-
- /* using skb address as ID results in a limited number of
- * values (and quick reuse).
- *
- * So we attempt to use as many skb members that will not
- * change while skb is with netfilter.
- */
- id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb),
- skb->skb_iif);
-
- return nla_put_be32(nlskb, NFTA_TRACE_ID, id);
-}
-
static int trace_fill_header(struct sk_buff *nlskb, u16 type,
const struct sk_buff *skb,
int off, unsigned int len)
@@ -142,7 +126,7 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
const struct nft_traceinfo *info)
{
- if (!info->rule)
+ if (!info->rule || info->rule->is_last)
return 0;
/* a continue verdict with ->type == RETURN means that this is
@@ -186,6 +170,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
struct nlmsghdr *nlh;
struct sk_buff *skb;
unsigned int size;
+ u32 mark = 0;
u16 event;
if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
@@ -229,7 +214,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
goto nla_put_failure;
- if (trace_fill_id(skb, pkt->skb))
+ if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
@@ -249,16 +234,24 @@ void nft_trace_notify(struct nft_traceinfo *info)
case NFT_TRACETYPE_RULE:
if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
goto nla_put_failure;
+
+ /* pkt->skb undefined iff NF_STOLEN, disable dump */
+ if (info->verdict->code == NF_STOLEN)
+ info->packet_dumped = true;
+ else
+ mark = pkt->skb->mark;
+
break;
case NFT_TRACETYPE_POLICY:
+ mark = pkt->skb->mark;
+
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
- if (pkt->skb->mark &&
- nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark)))
+ if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark)))
goto nla_put_failure;
if (!info->packet_dumped) {
@@ -283,9 +276,20 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
const struct nft_verdict *verdict,
const struct nft_chain *chain)
{
+ static siphash_key_t trace_key __read_mostly;
+ struct sk_buff *skb = pkt->skb;
+
info->basechain = nft_base_chain(chain);
info->trace = true;
+ info->nf_trace = pkt->skb->nf_trace;
info->packet_dumped = false;
info->pkt = pkt;
info->verdict = verdict;
+
+ net_get_random_once(&trace_key, sizeof(trace_key));
+
+ info->skbid = (u32)siphash_3u32(hash32_ptr(skb),
+ skb_get_hash(skb),
+ skb->skb_iif,
+ &trace_key);
}
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 7e2c8dd01408..6d18fb346868 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -44,6 +44,10 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket");
static unsigned int nfnetlink_pernet_id __read_mostly;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static DEFINE_SPINLOCK(nfnl_grp_active_lock);
+#endif
+
struct nfnl_net {
struct sock *nfnl;
};
@@ -290,6 +294,7 @@ replay:
nfnl_lock(subsys_id);
if (nfnl_dereference_protected(subsys_id) != ss ||
nfnetlink_find_client(type, ss) != nc) {
+ nfnl_unlock(subsys_id);
err = -EAGAIN;
break;
}
@@ -626,7 +631,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
nfgenmsg = nlmsg_data(nlh);
skb_pull(skb, msglen);
/* Work around old nft using host byte order */
- if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+ if (nfgenmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES)
res_id = NFNL_SUBSYS_NFTABLES;
else
res_id = ntohs(nfgenmsg->res_id);
@@ -654,7 +659,44 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
}
-#ifdef CONFIG_MODULES
+static void nfnetlink_bind_event(struct net *net, unsigned int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+ u8 v;
+
+ /* All NFNLGRP_CONNTRACK_* group bits fit into u8.
+ * The other groups are not relevant and can be ignored.
+ */
+ if (group >= 8)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ v = READ_ONCE(net->ct.ctnetlink_has_listener);
+ if ((v & group_bit) == 0) {
+ v |= group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
+ }
+
+ spin_unlock(&nfnl_grp_active_lock);
+#endif
+}
+
static int nfnetlink_bind(struct net *net, int group)
{
const struct nfnetlink_subsystem *ss;
@@ -670,9 +712,48 @@ static int nfnetlink_bind(struct net *net, int group)
rcu_read_unlock();
if (!ss)
request_module_nowait("nfnetlink-subsys-%d", type);
+
+ nfnetlink_bind_event(net, group);
return 0;
}
+
+static void nfnetlink_unbind(struct net *net, int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+
+ if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ /* ctnetlink_has_listener is u8 */
+ if (group >= 8)
+ return;
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ if (!nfnetlink_has_listeners(net, group)) {
+ u8 v = READ_ONCE(net->ct.ctnetlink_has_listener);
+
+ v &= ~group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(net->ct.ctnetlink_has_listener, v);
+ }
+ spin_unlock(&nfnl_grp_active_lock);
#endif
+}
static int __net_init nfnetlink_net_init(struct net *net)
{
@@ -680,9 +761,8 @@ static int __net_init nfnetlink_net_init(struct net *net)
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
-#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
-#endif
+ .unbind = nfnetlink_unbind,
};
nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 5c622f55c9d6..97248963a7d3 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -96,11 +96,13 @@ static int
nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
{
struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
if (attr == NULL)
return -EINVAL;
- if (help->helper->data_len == 0)
+ helper = rcu_dereference(help->helper);
+ if (!helper || helper->data_len == 0)
return -EINVAL;
nla_memcpy(help->data, attr, sizeof(help->data));
@@ -111,9 +113,11 @@ static int
nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
{
const struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
- if (help->helper->data_len &&
- nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+ helper = rcu_dereference(help->helper);
+ if (helper && helper->data_len &&
+ nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data))
goto nla_put_failure;
return 0;
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index c57673d499be..f466af4f8531 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -33,8 +33,20 @@
static unsigned int nfct_timeout_id __read_mostly;
+struct ctnl_timeout {
+ struct list_head head;
+ struct list_head free_head;
+ struct rcu_head rcu_head;
+ refcount_t refcnt;
+ char name[CTNL_TIMEOUT_NAME_MAX];
+
+ /* must be at the end */
+ struct nf_ct_timeout timeout;
+};
+
struct nfct_timeout_pernet {
struct list_head nfct_timeout_list;
+ struct list_head nfct_timeout_freelist;
};
MODULE_LICENSE("GPL");
@@ -158,6 +170,7 @@ static int cttimeout_new_timeout(struct sk_buff *skb,
timeout->timeout.l3num = l3num;
timeout->timeout.l4proto = l4proto;
refcount_set(&timeout->refcnt, 1);
+ __module_get(THIS_MODULE);
list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list);
return 0;
@@ -506,13 +519,8 @@ static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- if (!try_module_get(THIS_MODULE))
- goto err;
-
- if (!refcount_inc_not_zero(&timeout->refcnt)) {
- module_put(THIS_MODULE);
+ if (!refcount_inc_not_zero(&timeout->refcnt))
goto err;
- }
matching = timeout;
break;
}
@@ -525,10 +533,10 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t)
struct ctnl_timeout *timeout =
container_of(t, struct ctnl_timeout, timeout);
- if (refcount_dec_and_test(&timeout->refcnt))
+ if (refcount_dec_and_test(&timeout->refcnt)) {
kfree_rcu(timeout, rcu_head);
-
- module_put(THIS_MODULE);
+ module_put(THIS_MODULE);
+ }
}
static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
@@ -578,20 +586,36 @@ static int __net_init cttimeout_net_init(struct net *net)
struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
INIT_LIST_HEAD(&pernet->nfct_timeout_list);
+ INIT_LIST_HEAD(&pernet->nfct_timeout_freelist);
return 0;
}
+static void __net_exit cttimeout_net_pre_exit(struct net *net)
+{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+ struct ctnl_timeout *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
+ list_del_rcu(&cur->head);
+ list_add(&cur->free_head, &pernet->nfct_timeout_freelist);
+ }
+
+ /* core calls synchronize_rcu() after this */
+}
+
static void __net_exit cttimeout_net_exit(struct net *net)
{
struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *cur, *tmp;
- nf_ct_unconfirmed_destroy(net);
+ if (list_empty(&pernet->nfct_timeout_freelist))
+ return;
+
nf_ct_untimeout(net, NULL);
- list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
- list_del_rcu(&cur->head);
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) {
+ list_del(&cur->free_head);
if (refcount_dec_and_test(&cur->refcnt))
kfree_rcu(cur, rcu_head);
@@ -600,11 +624,17 @@ static void __net_exit cttimeout_net_exit(struct net *net)
static struct pernet_operations cttimeout_ops = {
.init = cttimeout_net_init,
+ .pre_exit = cttimeout_net_pre_exit,
.exit = cttimeout_net_exit,
.id = &nfct_timeout_id,
.size = sizeof(struct nfct_timeout_pernet),
};
+static const struct nf_ct_timeout_hooks hooks = {
+ .timeout_find_get = ctnl_timeout_find_get,
+ .timeout_put = ctnl_timeout_put,
+};
+
static int __init cttimeout_init(void)
{
int ret;
@@ -619,8 +649,7 @@ static int __init cttimeout_init(void)
"nfnetlink.\n");
goto err_out;
}
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+ RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks);
return 0;
err_out:
@@ -628,14 +657,24 @@ err_out:
return ret;
}
+static int untimeout(struct nf_conn *ct, void *timeout)
+{
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+
+ if (timeout_ext)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+
+ return 0;
+}
+
static void __exit cttimeout_exit(void)
{
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
- synchronize_rcu();
+ RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
+
+ nf_ct_iterate_destroy(untimeout, NULL);
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
index d5c719c9e36c..8120aadf6a0f 100644
--- a/net/netfilter/nfnetlink_hook.c
+++ b/net/netfilter/nfnetlink_hook.c
@@ -6,6 +6,7 @@
*/
#include <linux/module.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/skbuff.h>
@@ -214,13 +215,6 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de
hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
break;
-#if IS_ENABLED(CONFIG_DECNET)
- case NFPROTO_DECNET:
- if (hook >= ARRAY_SIZE(net->nf.hooks_decnet))
- return ERR_PTR(-EINVAL);
- hook_head = rcu_dereference(net->nf.hooks_decnet[hook]);
- break;
-#endif
#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS)
case NFPROTO_NETDEV:
if (hook >= NF_NETDEV_NUMHOOKS)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 691ef4cffdd9..d97eb280cb2e 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -66,6 +66,7 @@ struct nfulnl_instance {
struct sk_buff *skb; /* pre-allocatd skb */
struct timer_list timer;
struct net *net;
+ netns_tracker ns_tracker;
struct user_namespace *peer_user_ns; /* User namespace of the peer process */
u32 peer_portid; /* PORTID of the peer process */
@@ -140,7 +141,7 @@ static void nfulnl_instance_free_rcu(struct rcu_head *head)
struct nfulnl_instance *inst =
container_of(head, struct nfulnl_instance, rcu);
- put_net(inst->net);
+ put_net_track(inst->net, &inst->ns_tracker);
kfree(inst);
module_put(THIS_MODULE);
}
@@ -187,7 +188,7 @@ instance_create(struct net *net, u_int16_t group_num,
timer_setup(&inst->timer, nfulnl_timer, 0);
- inst->net = get_net(net);
+ inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC);
inst->peer_user_ns = user_ns;
inst->peer_portid = portid;
inst->group_num = group_num;
@@ -459,6 +460,7 @@ __build_packet_message(struct nfnl_log_net *log,
sk_buff_data_t old_tail = inst->skb->tail;
struct sock *sk;
const unsigned char *hwhdrp;
+ ktime_t tstamp;
nlh = nfnl_msg_put(inst->skb, 0, 0,
nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
@@ -556,7 +558,8 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
if (indev && skb->dev &&
- skb->mac_header != skb->network_header) {
+ skb_mac_header_was_set(skb) &&
+ skb_mac_header_len(skb) != 0) {
struct nfulnl_msg_packet_hw phw;
int len;
@@ -586,9 +589,10 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- if (hooknum <= NF_INET_FORWARD && skb->tstamp) {
+ tstamp = skb_tstamp_cond(skb, false);
+ if (hooknum <= NF_INET_FORWARD && tstamp) {
struct nfulnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 0fa2e2030427..ee6840bd5933 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb,
struct nf_osf_hdr_ctx ctx;
const struct tcphdr *tcp;
struct tcphdr _tcph;
+ bool found = false;
memset(&ctx, 0, sizeof(ctx));
@@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb,
data->genre = f->genre;
data->version = f->version;
+ found = true;
break;
}
- return true;
+ return found;
}
EXPORT_SYMBOL_GPL(nf_osf_find);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 5837e8efc9c2..87a9009d5234 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -225,7 +225,7 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
int err;
if (verdict == NF_ACCEPT ||
@@ -388,10 +388,11 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct net_device *outdev;
struct nf_conn *ct = NULL;
enum ip_conntrack_info ctinfo = 0;
- struct nfnl_ct_hook *nfnl_ct;
+ const struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
char *secdata = NULL;
u32 seclen = 0;
+ ktime_t tstamp;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
@@ -402,11 +403,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
#endif
+ nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(u_int32_t)) /* priority */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
- if (entskb->tstamp)
+ tstamp = skb_tstamp_cond(entskb, false);
+ if (tstamp)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
size += nfqnl_get_bridge_size(entry);
@@ -559,8 +562,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
goto nla_put_failure;
+ if (entskb->priority &&
+ nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
+ goto nla_put_failure;
+
if (indev && entskb->dev &&
- skb_mac_header_was_set(entskb)) {
+ skb_mac_header_was_set(entskb) &&
+ skb_mac_header_len(entskb) != 0) {
struct nfqnl_msg_packet_hw phw;
int len;
@@ -576,9 +584,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure;
- if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) {
+ if (entry->state.hook <= NF_INET_FORWARD && tstamp) {
struct nfqnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -709,9 +717,15 @@ static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
- if (entry)
- nf_queue_entry_get_refs(entry);
- return entry;
+
+ if (!entry)
+ return NULL;
+
+ if (nf_queue_entry_get_refs(entry))
+ return entry;
+
+ kfree(entry);
+ return NULL;
}
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -829,11 +843,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
}
static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
struct sk_buff *nskb;
if (diff < 0) {
+ unsigned int min_len = skb_transport_offset(e->skb);
+
+ if (data_len < min_len)
+ return -EINVAL;
+
if (pskb_trim(e->skb, data_len))
return -ENOMEM;
} else if (diff > 0) {
@@ -1013,11 +1032,13 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
[NFQA_CT] = { .type = NLA_UNSPEC },
[NFQA_EXP] = { .type = NLA_UNSPEC },
[NFQA_VLAN] = { .type = NLA_NESTED },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
[NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
[NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static struct nfqnl_instance *
@@ -1098,12 +1119,15 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
}
return 0;
}
-static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
+static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
const struct nlmsghdr *nlh,
const struct nlattr * const nfqa[],
struct nf_queue_entry *entry,
@@ -1170,11 +1194,11 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
{
struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
u_int16_t queue_num = ntohs(info->nfmsg->res_id);
+ const struct nfnl_ct_hook *nfnl_ct;
struct nfqnl_msg_verdict_hdr *vhdr;
enum ip_conntrack_info ctinfo;
struct nfqnl_instance *queue;
struct nf_queue_entry *entry;
- struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
unsigned int verdict;
int err;
@@ -1224,6 +1248,9 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
return 0;
}
@@ -1527,15 +1554,9 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
WARN_ON_ONCE(!hlist_empty(&q->instance_table[i]));
}
-static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
-{
- synchronize_rcu();
-}
-
static struct pernet_operations nfnl_queue_net_ops = {
.init = nfnl_queue_net_init,
.exit = nfnl_queue_net_exit,
- .exit_batch = nfnl_queue_net_exit_batch,
.id = &nfnl_queue_net_id,
.size = sizeof(struct nfnl_queue_net),
};
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 47b0dba95054..e6e402b247d0 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -30,7 +30,7 @@ static void nft_bitwise_eval_bool(u32 *dst, const u32 *src,
{
unsigned int i;
- for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
+ for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++)
dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
}
@@ -93,7 +93,16 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
static int nft_bitwise_init_bool(struct nft_bitwise *priv,
const struct nlattr *const tb[])
{
- struct nft_data_desc mask, xor;
+ struct nft_data_desc mask = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->mask),
+ .len = priv->len,
+ };
+ struct nft_data_desc xor = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->xor),
+ .len = priv->len,
+ };
int err;
if (tb[NFTA_BITWISE_DATA])
@@ -103,36 +112,30 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv,
!tb[NFTA_BITWISE_XOR])
return -EINVAL;
- err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask,
- tb[NFTA_BITWISE_MASK]);
+ err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
- if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) {
- err = -EINVAL;
- goto err1;
- }
- err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor,
- tb[NFTA_BITWISE_XOR]);
+ err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]);
if (err < 0)
- goto err1;
- if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) {
- err = -EINVAL;
- goto err2;
- }
+ goto err_xor_err;
return 0;
-err2:
- nft_data_release(&priv->xor, xor.type);
-err1:
+
+err_xor_err:
nft_data_release(&priv->mask, mask.type);
+
return err;
}
static int nft_bitwise_init_shift(struct nft_bitwise *priv,
const struct nlattr *const tb[])
{
- struct nft_data_desc d;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ .len = sizeof(u32),
+ };
int err;
if (tb[NFTA_BITWISE_MASK] ||
@@ -142,13 +145,12 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv,
if (!tb[NFTA_BITWISE_DATA])
return -EINVAL;
- err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d,
- tb[NFTA_BITWISE_DATA]);
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]);
if (err < 0)
return err;
- if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) ||
- priv->data.data[0] >= BITS_PER_TYPE(u32)) {
- nft_data_release(&priv->data, d.type);
+
+ if (priv->data.data[0] >= BITS_PER_TYPE(u32)) {
+ nft_data_release(&priv->data, desc.type);
return -EINVAL;
}
@@ -278,34 +280,81 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
return 0;
}
+static bool nft_bitwise_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ const struct nft_bitwise *bitwise;
+ unsigned int regcount;
+ u8 dreg;
+ int i;
+
+ if (!track->regs[priv->sreg].selector)
+ return false;
+
+ bitwise = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector &&
+ track->regs[priv->sreg].num_reg == 0 &&
+ track->regs[priv->dreg].bitwise &&
+ track->regs[priv->dreg].bitwise->ops == expr->ops &&
+ priv->sreg == bitwise->sreg &&
+ priv->dreg == bitwise->dreg &&
+ priv->op == bitwise->op &&
+ priv->len == bitwise->len &&
+ !memcmp(&priv->mask, &bitwise->mask, sizeof(priv->mask)) &&
+ !memcmp(&priv->xor, &bitwise->xor, sizeof(priv->xor)) &&
+ !memcmp(&priv->data, &bitwise->data, sizeof(priv->data))) {
+ track->cur = expr;
+ return true;
+ }
+
+ if (track->regs[priv->sreg].bitwise ||
+ track->regs[priv->sreg].num_reg != 0) {
+ nft_reg_track_cancel(track, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (priv->sreg != priv->dreg) {
+ nft_reg_track_update(track, track->regs[priv->sreg].selector,
+ priv->dreg, priv->len);
+ }
+
+ dreg = priv->dreg;
+ regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ track->regs[priv->dreg].bitwise = expr;
+
+ return false;
+}
+
static const struct nft_expr_ops nft_bitwise_ops = {
.type = &nft_bitwise_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
.eval = nft_bitwise_eval,
.init = nft_bitwise_init,
.dump = nft_bitwise_dump,
+ .reduce = nft_bitwise_reduce,
.offload = nft_bitwise_offload,
};
static int
nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out)
{
- struct nft_data_desc desc;
struct nft_data data;
- int err = 0;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ .len = sizeof(u32),
+ };
+ int err;
- err = nft_data_init(NULL, &data, sizeof(data), &desc, tb);
+ err = nft_data_init(NULL, &data, &desc, tb);
if (err < 0)
return err;
- if (desc.type != NFT_DATA_VALUE || desc.len != sizeof(u32)) {
- err = -EINVAL;
- goto err;
- }
*out = data.data[0];
-err:
- nft_data_release(&data, desc.type);
- return err;
+
+ return 0;
}
static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
@@ -385,12 +434,48 @@ static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx,
return 0;
}
+static bool nft_bitwise_fast_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ const struct nft_bitwise_fast_expr *bitwise;
+
+ if (!track->regs[priv->sreg].selector)
+ return false;
+
+ bitwise = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector &&
+ track->regs[priv->dreg].bitwise &&
+ track->regs[priv->dreg].bitwise->ops == expr->ops &&
+ priv->sreg == bitwise->sreg &&
+ priv->dreg == bitwise->dreg &&
+ priv->mask == bitwise->mask &&
+ priv->xor == bitwise->xor) {
+ track->cur = expr;
+ return true;
+ }
+
+ if (track->regs[priv->sreg].bitwise) {
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+ return false;
+ }
+
+ if (priv->sreg != priv->dreg) {
+ track->regs[priv->dreg].selector =
+ track->regs[priv->sreg].selector;
+ }
+ track->regs[priv->dreg].bitwise = expr;
+
+ return false;
+}
+
const struct nft_expr_ops nft_bitwise_fast_ops = {
.type = &nft_bitwise_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)),
.eval = NULL, /* inlined */
.init = nft_bitwise_fast_init,
.dump = nft_bitwise_fast_dump,
+ .reduce = nft_bitwise_fast_reduce,
.offload = nft_bitwise_fast_offload,
};
@@ -427,3 +512,22 @@ struct nft_expr_type nft_bitwise_type __read_mostly = {
.maxattr = NFTA_BITWISE_MAX,
.owner = THIS_MODULE,
};
+
+bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_expr *last = track->last;
+ const struct nft_expr *next;
+
+ if (expr == last)
+ return false;
+
+ next = nft_expr_next(expr);
+ if (next->ops == &nft_bitwise_ops)
+ return nft_bitwise_reduce(track, next);
+ else if (next->ops == &nft_bitwise_fast_ops)
+ return nft_bitwise_fast_reduce(track, next);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nft_expr_reduce_bitwise);
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 9d5947ab8d4e..f952a80275a8 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -44,7 +44,8 @@ void nft_byteorder_eval(const struct nft_expr *expr,
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = nft_reg_load64(&src[i]);
- nft_reg_store64(&dst[i], be64_to_cpu(src64));
+ nft_reg_store64(&dst[i],
+ be64_to_cpu((__force __be64)src64));
}
break;
case NFT_BYTEORDER_HTON:
@@ -167,12 +168,23 @@ nla_put_failure:
return -1;
}
+static bool nft_byteorder_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_byteorder *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, priv->len);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_byteorder_ops = {
.type = &nft_byteorder_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
.eval = nft_byteorder_eval,
.init = nft_byteorder_init,
.dump = nft_byteorder_dump,
+ .reduce = nft_byteorder_reduce,
};
struct nft_expr_type nft_byteorder_type __read_mostly = {
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 47b6d05f1ae6..963cf831799c 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -73,20 +73,16 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ };
int err;
- err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
- if (desc.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- nft_data_release(&priv->data, desc.type);
- return err;
- }
-
err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
@@ -125,13 +121,13 @@ static void nft_payload_n2h(union nft_cmp_offload_data *data,
{
switch (len) {
case 2:
- data->val16 = ntohs(*((u16 *)val));
+ data->val16 = ntohs(*((__be16 *)val));
break;
case 4:
- data->val32 = ntohl(*((u32 *)val));
+ data->val32 = ntohl(*((__be32 *)val));
break;
case 8:
- data->val64 = be64_to_cpu(*((u64 *)val));
+ data->val64 = be64_to_cpu(*((__be64 *)val));
break;
default:
WARN_ON_ONCE(1);
@@ -193,20 +189,35 @@ static const struct nft_expr_ops nft_cmp_ops = {
.eval = nft_cmp_eval,
.init = nft_cmp_init,
.dump = nft_cmp_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_cmp_offload,
};
+/* Calculate the mask for the nft_cmp_fast expression. On big endian the
+ * mask needs to include the *upper* bytes when interpreting that data as
+ * something smaller than the full u32, therefore a cpu_to_le32 is done.
+ */
+static u32 nft_cmp_fast_mask(unsigned int len)
+{
+ __le32 mask = cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr,
+ data) * BITS_PER_BYTE - len));
+
+ return (__force u32)mask;
+}
+
static int nft_cmp_fast_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
struct nft_data data;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
int err;
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
@@ -269,15 +280,113 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
.eval = NULL, /* inlined */
.init = nft_cmp_fast_init,
.dump = nft_cmp_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_cmp_fast_offload,
};
+static u32 nft_cmp_mask(u32 bitlen)
+{
+ return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen));
+}
+
+static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen)
+{
+ int len = bitlen / BITS_PER_BYTE;
+ int i, words = len / sizeof(u32);
+
+ for (i = 0; i < words; i++) {
+ data->data[i] = 0xffffffff;
+ bitlen -= sizeof(u32) * BITS_PER_BYTE;
+ }
+
+ if (len % sizeof(u32))
+ data->data[i++] = nft_cmp_mask(bitlen);
+
+ for (; i < 4; i++)
+ data->data[i] = 0;
+}
+
+static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ };
+ int err;
+
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+ if (err < 0)
+ return err;
+
+ err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+ if (err < 0)
+ return err;
+
+ nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE);
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
+ priv->len = desc.len;
+
+ return 0;
+}
+
+static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_cmp_expr cmp = {
+ .data = priv->data,
+ .sreg = priv->sreg,
+ .len = priv->len,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
+ };
+
+ return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
+static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
+
+ if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+
+const struct nft_expr_ops nft_cmp16_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp16_fast_init,
+ .dump = nft_cmp16_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_cmp16_fast_offload,
+};
+
static const struct nft_expr_ops *
nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
- struct nft_data_desc desc;
struct nft_data data;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
enum nft_cmp_ops op;
+ u8 sreg;
int err;
if (tb[NFTA_CMP_SREG] == NULL ||
@@ -298,21 +407,21 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return ERR_PTR(-EINVAL);
}
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return ERR_PTR(err);
- if (desc.type != NFT_DATA_VALUE)
- goto err1;
-
- if (desc.len <= sizeof(u32) && (op == NFT_CMP_EQ || op == NFT_CMP_NEQ))
- return &nft_cmp_fast_ops;
+ sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+ if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) {
+ if (desc.len <= sizeof(u32))
+ return &nft_cmp_fast_ops;
+ else if (desc.len <= sizeof(data) &&
+ ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) ||
+ (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0)))
+ return &nft_cmp16_fast_ops;
+ }
return &nft_cmp_ops;
-err1:
- nft_data_release(&data, desc.type);
- return ERR_PTR(-EINVAL);
}
struct nft_expr_type nft_cmp_type __read_mostly = {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index f69cc73c5813..c16172427622 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -731,6 +731,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
static struct nft_expr_type nft_match_type;
+static bool nft_match_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct xt_match *match = expr->ops->data;
+
+ return strcmp(match->name, "comment") == 0;
+}
+
static const struct nft_expr_ops *
nft_match_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -773,6 +781,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
ops->dump = nft_match_dump;
ops->validate = nft_match_validate;
ops->data = match;
+ ops->reduce = nft_match_reduce;
matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
if (matchsize > NFT_MATCH_LARGE_THRESH) {
@@ -862,6 +871,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
ops->dump = nft_target_dump;
ops->validate = nft_target_validate;
ops->data = target;
+ ops->reduce = NFT_REDUCE_READONLY;
if (family == NFPROTO_BRIDGE)
ops->eval = nft_target_eval_bridge;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 7d0761fad37e..d657f999a11b 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,7 +14,7 @@
#include <net/netfilter/nf_conntrack_zones.h>
struct nft_connlimit {
- struct nf_conncount_list list;
+ struct nf_conncount_list *list;
u32 limit;
bool invert;
};
@@ -43,12 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
return;
}
- if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) {
+ if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) {
regs->verdict.code = NF_DROP;
return;
}
- count = priv->list.count;
+ count = priv->list->count;
if ((count > priv->limit) ^ priv->invert) {
regs->verdict.code = NFT_BREAK;
@@ -62,6 +62,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
{
bool invert = false;
u32 flags, limit;
+ int err;
if (!tb[NFTA_CONNLIMIT_COUNT])
return -EINVAL;
@@ -76,18 +77,31 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- nf_conncount_list_init(&priv->list);
+ priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
+ if (!priv->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv->list);
priv->limit = limit;
priv->invert = invert;
- return nf_ct_netns_get(ctx->net, ctx->family);
+ err = nf_ct_netns_get(ctx->net, ctx->family);
+ if (err < 0)
+ goto err_netns;
+
+ return 0;
+err_netns:
+ kfree(priv->list);
+
+ return err;
}
static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
struct nft_connlimit *priv)
{
nf_ct_netns_put(ctx->net, ctx->family);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -200,7 +214,11 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- nf_conncount_list_init(&priv_dst->list);
+ priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
+ if (!priv_dst->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv_dst->list);
priv_dst->limit = priv_src->limit;
priv_dst->invert = priv_src->invert;
@@ -212,7 +230,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
@@ -221,7 +240,7 @@ static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
bool ret;
local_bh_disable();
- ret = nf_conncount_gc_list(net, &priv->list);
+ ret = nf_conncount_gc_list(net, priv->list);
local_bh_enable();
return ret;
@@ -238,6 +257,7 @@ static const struct nft_expr_ops nft_connlimit_ops = {
.destroy_clone = nft_connlimit_destroy_clone,
.dump = nft_connlimit_dump,
.gc = nft_connlimit_gc,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_connlimit_type __read_mostly = {
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 8edd3b3c173d..f4d3573e8782 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -13,6 +13,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables_offload.h>
struct nft_counter {
@@ -61,7 +62,7 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- cpu_stats = alloc_percpu(struct nft_counter);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
if (cpu_stats == NULL)
return -ENOMEM;
@@ -174,7 +175,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
[NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
};
-static struct nft_object_type nft_counter_obj_type;
+struct nft_object_type nft_counter_obj_type;
static const struct nft_object_ops nft_counter_obj_ops = {
.type = &nft_counter_obj_type,
.size = sizeof(struct nft_counter_percpu_priv),
@@ -184,7 +185,7 @@ static const struct nft_object_ops nft_counter_obj_ops = {
.dump = nft_counter_obj_dump,
};
-static struct nft_object_type nft_counter_obj_type __read_mostly = {
+struct nft_object_type nft_counter_obj_type __read_mostly = {
.type = NFT_OBJECT_COUNTER,
.ops = &nft_counter_obj_ops,
.maxattr = NFTA_COUNTER_MAX,
@@ -192,9 +193,8 @@ static struct nft_object_type nft_counter_obj_type __read_mostly = {
.owner = THIS_MODULE,
};
-static void nft_counter_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
@@ -275,7 +275,15 @@ static void nft_counter_offload_stats(struct nft_expr *expr,
preempt_enable();
}
-static struct nft_expr_type nft_counter_type;
+void nft_counter_init_seqcount(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
+}
+
+struct nft_expr_type nft_counter_type;
static const struct nft_expr_ops nft_counter_ops = {
.type = &nft_counter_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)),
@@ -285,11 +293,12 @@ static const struct nft_expr_ops nft_counter_ops = {
.destroy_clone = nft_counter_destroy,
.dump = nft_counter_dump,
.clone = nft_counter_clone,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_counter_offload,
.offload_stats = nft_counter_offload_stats,
};
-static struct nft_expr_type nft_counter_type __read_mostly = {
+struct nft_expr_type nft_counter_type __read_mostly = {
.name = "counter",
.ops = &nft_counter_ops,
.policy = nft_counter_policy,
@@ -297,39 +306,3 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
.flags = NFT_EXPR_STATEFUL,
.owner = THIS_MODULE,
};
-
-static int __init nft_counter_module_init(void)
-{
- int cpu, err;
-
- for_each_possible_cpu(cpu)
- seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
-
- err = nft_register_obj(&nft_counter_obj_type);
- if (err < 0)
- return err;
-
- err = nft_register_expr(&nft_counter_type);
- if (err < 0)
- goto err1;
-
- return 0;
-err1:
- nft_unregister_obj(&nft_counter_obj_type);
- return err;
-}
-
-static void __exit nft_counter_module_exit(void)
-{
- nft_unregister_expr(&nft_counter_type);
- nft_unregister_obj(&nft_counter_obj_type);
-}
-
-module_init(nft_counter_module_init);
-module_exit(nft_counter_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("counter");
-MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
-MODULE_DESCRIPTION("nftables counter rule support");
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 99b1de14ff7e..a3f01f209a53 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -26,6 +26,7 @@
struct nft_ct {
enum nft_ct_keys key:8;
enum ip_conntrack_dir dir:8;
+ u8 len;
union {
u8 dreg;
u8 sreg;
@@ -203,12 +204,12 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
case NFT_CT_SRC_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->src.u3.ip;
+ *dest = (__force __u32)tuple->src.u3.ip;
return;
case NFT_CT_DST_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->dst.u3.ip;
+ *dest = (__force __u32)tuple->dst.u3.ip;
return;
case NFT_CT_SRC_IP6:
if (nf_ct_l3num(ct) != NFPROTO_IPV6)
@@ -259,10 +260,13 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
ct = this_cpu_read(nft_ct_pcpu_template);
- if (likely(atomic_read(&ct->ct_general.use) == 1)) {
+ if (likely(refcount_read(&ct->ct_general.use) == 1)) {
+ refcount_inc(&ct->ct_general.use);
nf_ct_zone_add(ct, &zone);
} else {
- /* previous skb got queued to userspace */
+ /* previous skb got queued to userspace, allocate temporary
+ * one until percpu template can be reused.
+ */
ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
if (!ct) {
regs->verdict.code = NF_DROP;
@@ -270,7 +274,6 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
}
}
- atomic_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
}
#endif
@@ -375,7 +378,6 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
return false;
}
- atomic_set(&tmp->ct_general.use, 1);
per_cpu(nft_ct_pcpu_template, cpu) = tmp;
}
@@ -499,6 +501,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
}
}
+ priv->len = len;
err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL,
NFT_DATA_VALUE, len);
if (err < 0)
@@ -607,6 +610,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
}
}
+ priv->len = len;
err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len);
if (err < 0)
goto err1;
@@ -676,6 +680,29 @@ nla_put_failure:
return -1;
}
+static bool nft_ct_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ const struct nft_ct *ct;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ ct = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != ct->key) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -709,8 +736,27 @@ static const struct nft_expr_ops nft_ct_get_ops = {
.init = nft_ct_get_init,
.destroy = nft_ct_get_destroy,
.dump = nft_ct_get_dump,
+ .reduce = nft_ct_get_reduce,
};
+static bool nft_ct_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_ct_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_ct_set_ops = {
.type = &nft_ct_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
@@ -718,6 +764,7 @@ static const struct nft_expr_ops nft_ct_set_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -728,6 +775,7 @@ static const struct nft_expr_ops nft_ct_set_zone_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#endif
@@ -784,6 +832,7 @@ static const struct nft_expr_ops nft_notrack_ops = {
.type = &nft_notrack_type,
.size = NFT_EXPR_SIZE(0),
.eval = nft_notrack_eval,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_notrack_type __read_mostly = {
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index bbf3fcba3df4..63507402716d 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -67,6 +67,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx,
return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif);
}
+static bool nft_dup_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
static struct nft_expr_type nft_dup_netdev_type;
static const struct nft_expr_ops nft_dup_netdev_ops = {
.type = &nft_dup_netdev_type,
@@ -74,7 +79,9 @@ static const struct nft_expr_ops nft_dup_netdev_ops = {
.eval = nft_dup_netdev_eval,
.init = nft_dup_netdev_init,
.dump = nft_dup_netdev_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_dup_netdev_offload,
+ .offload_action = nft_dup_netdev_offload_action,
};
static struct nft_expr_type nft_dup_netdev_type __read_mostly = {
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 87f3af4645d9..6983e6ddeef9 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -60,7 +60,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
&regs->data[priv->sreg_key], NULL,
&regs->data[priv->sreg_data],
timeout, 0, GFP_ATOMIC);
- if (elem == NULL)
+ if (IS_ERR(elem))
goto err1;
ext = nft_set_elem_ext(set, elem);
@@ -413,6 +413,7 @@ static const struct nft_expr_ops nft_dynset_ops = {
.activate = nft_dynset_activate,
.deactivate = nft_dynset_deactivate,
.dump = nft_dynset_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_dynset_type __read_mostly = {
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index dbe1f2e7dd9e..a67ea9c3ae57 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -167,7 +167,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
{
struct tcphdr *tcph;
- if (pkt->tprot != IPPROTO_TCP)
+ if (pkt->tprot != IPPROTO_TCP || pkt->fragoff)
return NULL;
tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer);
@@ -266,7 +266,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
switch (priv->len) {
case 2:
- old.v16 = get_unaligned((u16 *)(opt + offset));
+ old.v16 = (__force __be16)get_unaligned((u16 *)(opt + offset));
new.v16 = (__force __be16)nft_reg_load16(
&regs->data[priv->sreg]);
@@ -281,18 +281,18 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (old.v16 == new.v16)
return;
- put_unaligned(new.v16, (u16*)(opt + offset));
+ put_unaligned(new.v16, (__be16*)(opt + offset));
inet_proto_csum_replace2(&tcph->check, pkt->skb,
old.v16, new.v16, false);
break;
case 4:
- new.v32 = regs->data[priv->sreg];
- old.v32 = get_unaligned((u32 *)(opt + offset));
+ new.v32 = nft_reg_load_be32(&regs->data[priv->sreg]);
+ old.v32 = (__force __be32)get_unaligned((u32 *)(opt + offset));
if (old.v32 == new.v32)
return;
- put_unaligned(new.v32, (u32*)(opt + offset));
+ put_unaligned(new.v32, (__be32*)(opt + offset));
inet_proto_csum_replace4(&tcph->check, pkt->skb,
old.v32, new.v32, false);
break;
@@ -308,6 +308,63 @@ err:
regs->verdict.code = NFT_BREAK;
}
+static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int i, tcphdr_len, optl;
+ struct tcphdr *tcph;
+ u8 *opt;
+
+ tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!tcph)
+ goto err;
+
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto drop;
+
+ opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!opt)
+ goto err;
+ for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+ unsigned int j;
+
+ optl = optlen(opt, i);
+ if (priv->type != opt[i])
+ continue;
+
+ if (i + optl > tcphdr_len)
+ goto drop;
+
+ for (j = 0; j < optl; ++j) {
+ u16 n = TCPOPT_NOP;
+ u16 o = opt[i+j];
+
+ if ((i + j) % 2 == 0) {
+ o <<= 8;
+ n <<= 8;
+ }
+ inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o),
+ htons(n), false);
+ }
+ memset(opt + i, TCPOPT_NOP, optl);
+ return;
+ }
+
+ /* option not found, continue. This allows to do multiple
+ * option removals per rule.
+ */
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+ return;
+drop:
+ /* can't remove, no choice but to drop */
+ regs->verdict.code = NF_DROP;
+}
+
static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -457,6 +514,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
priv->len);
}
+static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_EXTHDR_SREG] ||
+ tb[NFTA_EXTHDR_DREG] ||
+ tb[NFTA_EXTHDR_FLAGS] ||
+ tb[NFTA_EXTHDR_OFFSET] ||
+ tb[NFTA_EXTHDR_LEN])
+ return -EINVAL;
+
+ if (!tb[NFTA_EXTHDR_TYPE])
+ return -EINVAL;
+
+ priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+ priv->op = NFT_EXTHDR_OP_TCPOPT;
+
+ return 0;
+}
+
static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -517,12 +596,47 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
+static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ return nft_exthdr_dump_common(skb, priv);
+}
+
+static bool nft_exthdr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+ const struct nft_exthdr *exthdr;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ exthdr = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->type != exthdr->type ||
+ priv->op != exthdr->op ||
+ priv->flags != exthdr->flags ||
+ priv->offset != exthdr->offset ||
+ priv->len != exthdr->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
.eval = nft_exthdr_ipv6_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
@@ -531,6 +645,7 @@ static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
.eval = nft_exthdr_ipv4_eval,
.init = nft_exthdr_ipv4_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_ops = {
@@ -539,6 +654,7 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = {
.eval = nft_exthdr_tcp_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
@@ -547,6 +663,16 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
.eval = nft_exthdr_tcp_set_eval,
.init = nft_exthdr_tcp_set_init,
.dump = nft_exthdr_dump_set,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_tcp_strip_eval,
+ .init = nft_exthdr_tcp_strip_init,
+ .dump = nft_exthdr_dump_strip,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_exthdr_sctp_ops = {
@@ -555,6 +681,7 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = {
.eval = nft_exthdr_sctp_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops *
@@ -576,7 +703,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return &nft_exthdr_tcp_set_ops;
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_tcp_ops;
- break;
+ return &nft_exthdr_tcp_strip_ops;
case NFT_EXTHDR_OP_IPV6:
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_ipv6_ops;
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index b10ce732b337..1f12d7ade606 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -35,6 +35,10 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
hooks = (1 << NF_INET_PRE_ROUTING);
+ if (priv->flags & NFTA_FIB_F_IIF) {
+ hooks |= (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
+ }
break;
case NFT_FIB_RESULT_ADDRTYPE:
if (priv->flags & NFTA_FIB_F_IIF)
@@ -156,5 +160,47 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv,
}
EXPORT_SYMBOL_GPL(nft_fib_store_result);
+bool nft_fib_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int len = NFT_REG32_SIZE;
+ const struct nft_fib *fib;
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ if (priv->flags & NFTA_FIB_F_PRESENT)
+ len = NFT_REG32_SIZE;
+ else
+ len = IFNAMSIZ;
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, len);
+ return false;
+ }
+
+ fib = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->result != fib->result ||
+ priv->flags != fib->flags) {
+ nft_reg_track_update(track, expr, priv->dreg, len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nft_fib_reduce);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c
index a88d44e163d1..666a3741d20b 100644
--- a/net/netfilter/nft_fib_inet.c
+++ b/net/netfilter/nft_fib_inet.c
@@ -49,6 +49,7 @@ static const struct nft_expr_ops nft_fib_inet_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_inet_type __read_mostly = {
diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c
index 3f3478abd845..9121ec64e918 100644
--- a/net/netfilter/nft_fib_netdev.c
+++ b/net/netfilter/nft_fib_netdev.c
@@ -58,6 +58,7 @@ static const struct nft_expr_ops nft_fib_netdev_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_netdev_type __read_mostly = {
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 0af34ad41479..a25c88bc8b75 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -36,6 +36,15 @@ static void nft_default_forward_path(struct nf_flow_route *route,
route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
}
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+ return false;
+
+ return true;
+}
+
static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
const struct dst_entry *dst_cache,
const struct nf_conn *ct,
@@ -47,6 +56,9 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
struct neighbour *n;
u8 nud_state;
+ if (!nft_is_valid_ether_device(dev))
+ goto out;
+
n = dst_neigh_lookup(dst_cache, daddr);
if (!n)
return -1;
@@ -60,6 +72,7 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
if (!(nud_state & NUD_VALID))
return -1;
+out:
return dev_fill_forward_path(dev, ha, stack);
}
@@ -78,15 +91,6 @@ struct nft_forward_info {
enum flow_offload_xmit_type xmit_type;
};
-static bool nft_is_valid_ether_device(const struct net_device *dev)
-{
- if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
- dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
- return false;
-
- return true;
-}
-
static void nft_dev_path_info(const struct net_device_path_stack *stack,
struct nft_forward_info *info,
unsigned char *ha, struct nf_flowtable *flowtable)
@@ -119,7 +123,8 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
info->indev = NULL;
break;
}
- info->outdev = path->dev;
+ if (!info->outdev)
+ info->outdev = path->dev;
info->encap[info->num_encaps].id = path->encap.id;
info->encap[info->num_encaps].proto = path->encap.proto;
info->num_encaps++;
@@ -227,11 +232,21 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
+ fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos);
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
break;
case NFPROTO_IPV6:
fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
break;
}
@@ -293,11 +308,25 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
case IPPROTO_TCP:
tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
sizeof(_tcph), &_tcph);
- if (unlikely(!tcph || tcph->fin || tcph->rst))
+ if (unlikely(!tcph || tcph->fin || tcph->rst ||
+ !nf_conntrack_tcp_established(ct)))
goto out;
break;
case IPPROTO_UDP:
break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE: {
+ struct nf_conntrack_tuple *tuple;
+
+ if (ct->status & IPS_NAT_MASK)
+ goto out;
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ /* No support for GRE v1 */
+ if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
+ goto out;
+ break;
+ }
+#endif
default:
goto out;
}
@@ -428,6 +457,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = {
.destroy = nft_flow_offload_destroy,
.validate = nft_flow_offload_validate,
.dump = nft_flow_offload_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_flow_offload_type __read_mostly = {
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index cd59afde5b2f..7c5876dc9ff2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -27,9 +27,11 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
+ struct sk_buff *skb = pkt->skb;
/* This is used by ifb only. */
- skb_set_redirected(pkt->skb, true);
+ skb->skb_iif = skb->dev->ifindex;
+ skb_set_redirected(skb, nft_hook(pkt) == NF_NETDEV_INGRESS);
nf_fwd_netdev_egress(pkt, oif);
regs->verdict.code = NF_STOLEN;
@@ -77,6 +79,11 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx,
return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif);
}
+static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
struct nft_fwd_neigh {
u8 sreg_dev;
u8 sreg_addr;
@@ -138,7 +145,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
return;
skb->dev = dev;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
neigh_xmit(neigh_table, dev, addr, skb);
out:
regs->verdict.code = verdict;
@@ -198,7 +205,8 @@ static int nft_fwd_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data)
{
- return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS));
}
static struct nft_expr_type nft_fwd_netdev_type;
@@ -209,6 +217,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.init = nft_fwd_neigh_init,
.dump = nft_fwd_neigh_dump,
.validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_fwd_netdev_ops = {
@@ -218,7 +227,9 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.init = nft_fwd_netdev_init,
.dump = nft_fwd_netdev_dump,
.validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_fwd_netdev_offload,
+ .offload_action = nft_fwd_netdev_offload_action,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index f829f5289e16..e5631e88b285 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -165,6 +165,16 @@ nla_put_failure:
return -1;
}
+static bool nft_jhash_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_jhash *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, sizeof(u32));
+
+ return false;
+}
+
static int nft_symhash_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
@@ -185,6 +195,30 @@ nla_put_failure:
return -1;
}
+static bool nft_symhash_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_symhash *priv = nft_expr_priv(expr);
+ struct nft_symhash *symhash;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, sizeof(u32));
+ return false;
+ }
+
+ symhash = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->offset != symhash->offset ||
+ priv->modulus != symhash->modulus) {
+ nft_reg_track_update(track, expr, priv->dreg, sizeof(u32));
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+
static struct nft_expr_type nft_hash_type;
static const struct nft_expr_ops nft_jhash_ops = {
.type = &nft_hash_type,
@@ -192,6 +226,7 @@ static const struct nft_expr_ops nft_jhash_ops = {
.eval = nft_jhash_eval,
.init = nft_jhash_init,
.dump = nft_jhash_dump,
+ .reduce = nft_jhash_reduce,
};
static const struct nft_expr_ops nft_symhash_ops = {
@@ -200,6 +235,7 @@ static const struct nft_expr_ops nft_symhash_ops = {
.eval = nft_symhash_eval,
.init = nft_symhash_init,
.dump = nft_symhash_dump,
+ .reduce = nft_symhash_reduce,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 90c64d27ae53..5f28b21abc7d 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -29,20 +29,36 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
[NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED },
};
+static enum nft_data_types nft_reg_to_type(const struct nlattr *nla)
+{
+ enum nft_data_types type;
+ u8 reg;
+
+ reg = ntohl(nla_get_be32(nla));
+ if (reg == NFT_REG_VERDICT)
+ type = NFT_DATA_VERDICT;
+ else
+ type = NFT_DATA_VALUE;
+
+ return type;
+}
+
static int nft_immediate_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_immediate_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .size = sizeof(priv->data),
+ };
int err;
if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
tb[NFTA_IMMEDIATE_DATA] == NULL)
return -EINVAL;
- err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_IMMEDIATE_DATA]);
+ desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]);
+ err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
if (err < 0)
return err;
@@ -213,6 +229,27 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx,
return 0;
}
+static bool nft_immediate_offload_action(const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg == NFT_REG_VERDICT)
+ return true;
+
+ return false;
+}
+
+static bool nft_immediate_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ nft_reg_track_cancel(track, priv->dreg, priv->dlen);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -223,8 +260,9 @@ static const struct nft_expr_ops nft_imm_ops = {
.destroy = nft_immediate_destroy,
.dump = nft_immediate_dump,
.validate = nft_immediate_validate,
+ .reduce = nft_immediate_reduce,
.offload = nft_immediate_offload,
- .offload_flags = NFT_OFFLOAD_F_ACTION,
+ .offload_action = nft_immediate_offload_action,
};
struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 304e33cbed9b..bb15a55dad5c 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -8,9 +8,13 @@
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+struct nft_last {
+ unsigned long jiffies;
+ unsigned int set;
+};
+
struct nft_last_priv {
- unsigned long last_jiffies;
- unsigned int last_set;
+ struct nft_last *last;
};
static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = {
@@ -22,47 +26,55 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last;
u64 last_jiffies;
- u32 last_set = 0;
int err;
- if (tb[NFTA_LAST_SET]) {
- last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
- if (last_set == 1)
- priv->last_set = 1;
- }
+ last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT);
+ if (!last)
+ return -ENOMEM;
+
+ if (tb[NFTA_LAST_SET])
+ last->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
- if (last_set && tb[NFTA_LAST_MSECS]) {
+ if (last->set && tb[NFTA_LAST_MSECS]) {
err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
if (err < 0)
- return err;
+ goto err;
- priv->last_jiffies = jiffies - (unsigned long)last_jiffies;
+ last->jiffies = jiffies - (unsigned long)last_jiffies;
}
+ priv->last = last;
return 0;
+err:
+ kfree(last);
+
+ return err;
}
static void nft_last_eval(const struct nft_expr *expr,
struct nft_regs *regs, const struct nft_pktinfo *pkt)
{
struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last = priv->last;
- if (READ_ONCE(priv->last_jiffies) != jiffies)
- WRITE_ONCE(priv->last_jiffies, jiffies);
- if (READ_ONCE(priv->last_set) == 0)
- WRITE_ONCE(priv->last_set, 1);
+ if (READ_ONCE(last->jiffies) != jiffies)
+ WRITE_ONCE(last->jiffies, jiffies);
+ if (READ_ONCE(last->set) == 0)
+ WRITE_ONCE(last->set, 1);
}
static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
struct nft_last_priv *priv = nft_expr_priv(expr);
- unsigned long last_jiffies = READ_ONCE(priv->last_jiffies);
- u32 last_set = READ_ONCE(priv->last_set);
+ struct nft_last *last = priv->last;
+ unsigned long last_jiffies = READ_ONCE(last->jiffies);
+ u32 last_set = READ_ONCE(last->set);
__be64 msecs;
if (time_before(jiffies, last_jiffies)) {
- WRITE_ONCE(priv->last_set, 0);
+ WRITE_ONCE(last->set, 0);
last_set = 0;
}
@@ -81,12 +93,34 @@ nla_put_failure:
return -1;
}
+static void nft_last_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+
+ kfree(priv->last);
+}
+
+static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_last_priv *priv_dst = nft_expr_priv(dst);
+
+ priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC);
+ if (!priv_dst->last)
+ return -ENOMEM;
+
+ return 0;
+}
+
static const struct nft_expr_ops nft_last_ops = {
.type = &nft_last_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)),
.eval = nft_last_eval,
.init = nft_last_init,
+ .destroy = nft_last_destroy,
+ .clone = nft_last_clone,
.dump = nft_last_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_last_type __read_mostly = {
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 82ec27bdf941..981addb2d051 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -18,6 +18,10 @@ struct nft_limit {
spinlock_t lock;
u64 last;
u64 tokens;
+};
+
+struct nft_limit_priv {
+ struct nft_limit *limit;
u64 tokens_max;
u64 rate;
u64 nsecs;
@@ -25,33 +29,33 @@ struct nft_limit {
bool invert;
};
-static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
+static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
{
u64 now, tokens;
s64 delta;
- spin_lock_bh(&limit->lock);
+ spin_lock_bh(&priv->limit->lock);
now = ktime_get_ns();
- tokens = limit->tokens + now - limit->last;
- if (tokens > limit->tokens_max)
- tokens = limit->tokens_max;
+ tokens = priv->limit->tokens + now - priv->limit->last;
+ if (tokens > priv->tokens_max)
+ tokens = priv->tokens_max;
- limit->last = now;
+ priv->limit->last = now;
delta = tokens - cost;
if (delta >= 0) {
- limit->tokens = delta;
- spin_unlock_bh(&limit->lock);
- return limit->invert;
+ priv->limit->tokens = delta;
+ spin_unlock_bh(&priv->limit->lock);
+ return priv->invert;
}
- limit->tokens = tokens;
- spin_unlock_bh(&limit->lock);
- return !limit->invert;
+ priv->limit->tokens = tokens;
+ spin_unlock_bh(&priv->limit->lock);
+ return !priv->invert;
}
/* Use same default as in iptables. */
#define NFT_LIMIT_PKT_BURST_DEFAULT 5
-static int nft_limit_init(struct nft_limit *limit,
+static int nft_limit_init(struct nft_limit_priv *priv,
const struct nlattr * const tb[], bool pkts)
{
u64 unit, tokens;
@@ -60,58 +64,62 @@ static int nft_limit_init(struct nft_limit *limit,
tb[NFTA_LIMIT_UNIT] == NULL)
return -EINVAL;
- limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
- limit->nsecs = unit * NSEC_PER_SEC;
- if (limit->rate == 0 || limit->nsecs < unit)
+ priv->nsecs = unit * NSEC_PER_SEC;
+ if (priv->rate == 0 || priv->nsecs < unit)
return -EOVERFLOW;
if (tb[NFTA_LIMIT_BURST])
- limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
+ priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
- if (pkts && limit->burst == 0)
- limit->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
+ if (pkts && priv->burst == 0)
+ priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
- if (limit->rate + limit->burst < limit->rate)
+ if (priv->rate + priv->burst < priv->rate)
return -EOVERFLOW;
if (pkts) {
- tokens = div64_u64(limit->nsecs, limit->rate) * limit->burst;
+ tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst;
} else {
/* The token bucket size limits the number of tokens can be
* accumulated. tokens_max specifies the bucket size.
* tokens_max = unit * (rate + burst) / rate.
*/
- tokens = div64_u64(limit->nsecs * (limit->rate + limit->burst),
- limit->rate);
+ tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst),
+ priv->rate);
}
- limit->tokens = tokens;
- limit->tokens_max = limit->tokens;
+ priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
+ if (!priv->limit)
+ return -ENOMEM;
+
+ priv->limit->tokens = tokens;
+ priv->tokens_max = priv->limit->tokens;
if (tb[NFTA_LIMIT_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
if (flags & NFT_LIMIT_F_INV)
- limit->invert = true;
+ priv->invert = true;
}
- limit->last = ktime_get_ns();
- spin_lock_init(&limit->lock);
+ priv->limit->last = ktime_get_ns();
+ spin_lock_init(&priv->limit->lock);
return 0;
}
-static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv,
enum nft_limit_type type)
{
- u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0;
- u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
+ u32 flags = priv->invert ? NFT_LIMIT_F_INV : 0;
+ u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC);
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate),
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate),
NFTA_LIMIT_PAD) ||
nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs),
NFTA_LIMIT_PAD) ||
- nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
+ nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) ||
nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) ||
nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags)))
goto nla_put_failure;
@@ -121,8 +129,34 @@ nla_put_failure:
return -1;
}
-struct nft_limit_pkts {
- struct nft_limit limit;
+static void nft_limit_destroy(const struct nft_ctx *ctx,
+ const struct nft_limit_priv *priv)
+{
+ kfree(priv->limit);
+}
+
+static int nft_limit_clone(struct nft_limit_priv *priv_dst,
+ const struct nft_limit_priv *priv_src)
+{
+ priv_dst->tokens_max = priv_src->tokens_max;
+ priv_dst->rate = priv_src->rate;
+ priv_dst->nsecs = priv_src->nsecs;
+ priv_dst->burst = priv_src->burst;
+ priv_dst->invert = priv_src->invert;
+
+ priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC);
+ if (!priv_dst->limit)
+ return -ENOMEM;
+
+ spin_lock_init(&priv_dst->limit->lock);
+ priv_dst->limit->tokens = priv_src->tokens_max;
+ priv_dst->limit->last = ktime_get_ns();
+
+ return 0;
+}
+
+struct nft_limit_priv_pkts {
+ struct nft_limit_priv limit;
u64 cost;
};
@@ -130,7 +164,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -148,7 +182,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -161,25 +195,46 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
- const struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+static void nft_limit_pkts_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
+
+ nft_limit_destroy(ctx, &priv->limit);
+}
+
+static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst);
+ struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src);
+
+ priv_dst->cost = priv_src->cost;
+
+ return nft_limit_clone(&priv_dst->limit, &priv_src->limit);
+}
+
static struct nft_expr_type nft_limit_type;
static const struct nft_expr_ops nft_limit_pkts_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.eval = nft_limit_pkts_eval,
.init = nft_limit_pkts_init,
+ .destroy = nft_limit_pkts_destroy,
+ .clone = nft_limit_pkts_clone,
.dump = nft_limit_pkts_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static void nft_limit_bytes_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -190,7 +245,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_init(priv, tb, false);
}
@@ -198,17 +253,36 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
static int nft_limit_bytes_dump(struct sk_buff *skb,
const struct nft_expr *expr)
{
- const struct nft_limit *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+static void nft_limit_bytes_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit_priv *priv = nft_expr_priv(expr);
+
+ nft_limit_destroy(ctx, priv);
+}
+
+static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_limit_priv *priv_dst = nft_expr_priv(dst);
+ struct nft_limit_priv *priv_src = nft_expr_priv(src);
+
+ return nft_limit_clone(priv_dst, priv_src);
+}
+
static const struct nft_expr_ops nft_limit_bytes_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)),
.eval = nft_limit_bytes_eval,
.init = nft_limit_bytes_init,
.dump = nft_limit_bytes_dump,
+ .clone = nft_limit_bytes_clone,
+ .destroy = nft_limit_bytes_destroy,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -240,7 +314,7 @@ static void nft_limit_obj_pkts_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -250,7 +324,7 @@ static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -265,16 +339,25 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit_pkts *priv = nft_obj_data(obj);
+ const struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
+
+ nft_limit_destroy(ctx, &priv->limit);
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_pkts_ops = {
.type = &nft_limit_obj_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.init = nft_limit_obj_pkts_init,
+ .destroy = nft_limit_obj_pkts_destroy,
.eval = nft_limit_obj_pkts_eval,
.dump = nft_limit_obj_pkts_dump,
};
@@ -283,7 +366,7 @@ static void nft_limit_obj_bytes_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -294,7 +377,7 @@ static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_init(priv, tb, false);
}
@@ -303,16 +386,25 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit *priv = nft_obj_data(obj);
+ const struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_limit_priv *priv = nft_obj_data(obj);
+
+ nft_limit_destroy(ctx, priv);
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_bytes_ops = {
.type = &nft_limit_obj_type,
- .size = sizeof(struct nft_limit),
+ .size = sizeof(struct nft_limit_priv),
.init = nft_limit_obj_bytes_init,
+ .destroy = nft_limit_obj_bytes_destroy,
.eval = nft_limit_obj_bytes_eval,
.dump = nft_limit_obj_bytes_dump,
};
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 54f6c2035e84..0e13c003f0c1 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -290,6 +290,7 @@ static const struct nft_expr_ops nft_log_ops = {
.init = nft_log_init,
.destroy = nft_log_destroy,
.dump = nft_log_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_log_type __read_mostly = {
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 90becbf5bff3..dfae12759c7c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -253,6 +253,17 @@ static int nft_lookup_validate(const struct nft_ctx *ctx,
return 0;
}
+static bool nft_lookup_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+
+ if (priv->set->flags & NFT_SET_MAP)
+ nft_reg_track_cancel(track, priv->dreg, priv->set->dlen);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -263,6 +274,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
.validate = nft_lookup_validate,
+ .reduce = nft_lookup_reduce,
};
struct nft_expr_type nft_lookup_type __read_mostly = {
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9953e8053753..2a0adc497bbb 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -129,6 +129,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = {
.destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
@@ -175,6 +176,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = {
.destroy = nft_masq_ipv6_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
@@ -230,6 +232,7 @@ static const struct nft_expr_ops nft_masq_inet_ops = {
.destroy = nft_masq_inet_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_inet_type __read_mostly = {
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index fe91ff5f8fbe..55d2d49c3425 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -14,6 +14,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/random.h>
#include <linux/smp.h>
#include <linux/static_key.h>
#include <net/dst.h>
@@ -32,8 +33,6 @@
#define NFT_META_SECS_PER_DAY 86400
#define NFT_META_DAYS_PER_WEEK 7
-static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-
static u8 nft_meta_weekday(void)
{
time64_t secs = ktime_get_real_seconds();
@@ -271,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
return true;
}
-static noinline u32 nft_prandom_u32(void)
-{
- struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
-
- return prandom_u32_state(state);
-}
-
#ifdef CONFIG_IP_ROUTE_CLASSID
static noinline bool
nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
@@ -389,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
#endif
case NFT_META_PRANDOM:
- *dest = nft_prandom_u32();
+ *dest = get_random_u32();
break;
#ifdef CONFIG_XFRM
case NFT_META_SECPATH:
@@ -518,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
len = IFNAMSIZ;
break;
case NFT_META_PRANDOM:
- prandom_init_once(&nft_prandom_state);
len = sizeof(u32);
break;
#ifdef CONFIG_XFRM
@@ -539,6 +530,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
+ priv->len = len;
return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
NULL, NFT_DATA_VALUE, len);
}
@@ -664,6 +656,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
+ priv->len = len;
err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len);
if (err < 0)
return err;
@@ -750,16 +743,60 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
return 0;
}
+bool nft_meta_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct nft_meta *meta;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ meta = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != meta->key ||
+ priv->dreg != meta->dreg) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_reduce);
+
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
.eval = nft_meta_get_eval,
.init = nft_meta_get_init,
.dump = nft_meta_get_dump,
+ .reduce = nft_meta_get_reduce,
.validate = nft_meta_get_validate,
.offload = nft_meta_get_offload,
};
+static bool nft_meta_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_meta_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_meta_set_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -767,6 +804,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .reduce = nft_meta_set_reduce,
.validate = nft_meta_set_validate,
};
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index be1595d6979d..e5fd6995e4bf 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -317,6 +317,7 @@ static const struct nft_expr_ops nft_nat_ops = {
.destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_nat_type __read_mostly = {
@@ -334,7 +335,8 @@ static void nft_nat_inet_eval(const struct nft_expr *expr,
{
const struct nft_nat *priv = nft_expr_priv(expr);
- if (priv->family == nft_pf(pkt))
+ if (priv->family == nft_pf(pkt) ||
+ priv->family == NFPROTO_INET)
nft_nat_eval(expr, regs, pkt);
}
@@ -346,6 +348,7 @@ static const struct nft_expr_ops nft_nat_inet_ops = {
.destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_inet_nat_type __read_mostly = {
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 722cac1e90e0..45d3dc9e96f2 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -9,16 +9,15 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/random.h>
#include <linux/static_key.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
-static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
-
struct nft_ng_inc {
u8 dreg;
u32 modulus;
- atomic_t counter;
+ atomic_t *counter;
u32 offset;
};
@@ -27,9 +26,9 @@ static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
u32 nval, oval;
do {
- oval = atomic_read(&priv->counter);
+ oval = atomic_read(priv->counter);
nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
- } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
+ } while (atomic_cmpxchg(priv->counter, oval, nval) != oval);
return nval + priv->offset;
}
@@ -55,6 +54,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_ng_inc *priv = nft_expr_priv(expr);
+ int err;
if (tb[NFTA_NG_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
@@ -66,10 +66,32 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- atomic_set(&priv->counter, priv->modulus - 1);
+ priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL);
+ if (!priv->counter)
+ return -ENOMEM;
- return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
- NULL, NFT_DATA_VALUE, sizeof(u32));
+ atomic_set(priv->counter, priv->modulus - 1);
+
+ err = nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
+ if (err < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(priv->counter);
+
+ return err;
+}
+
+static bool nft_ng_inc_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
}
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
@@ -98,18 +120,23 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static void nft_ng_inc_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ kfree(priv->counter);
+}
+
struct nft_ng_random {
u8 dreg;
u32 modulus;
u32 offset;
};
-static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+static u32 nft_ng_random_gen(const struct nft_ng_random *priv)
{
- struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
-
- return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
- priv->offset;
+ return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset;
}
static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -137,8 +164,6 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- prandom_init_once(&nft_numgen_prandom_state);
-
return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
NULL, NFT_DATA_VALUE, sizeof(u32));
}
@@ -151,13 +176,25 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static bool nft_ng_random_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
+}
+
static struct nft_expr_type nft_ng_type;
static const struct nft_expr_ops nft_ng_inc_ops = {
.type = &nft_ng_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
.eval = nft_ng_inc_eval,
.init = nft_ng_inc_init,
+ .destroy = nft_ng_inc_destroy,
.dump = nft_ng_inc_dump,
+ .reduce = nft_ng_inc_reduce,
};
static const struct nft_expr_ops nft_ng_random_ops = {
@@ -166,6 +203,7 @@ static const struct nft_expr_ops nft_ng_random_ops = {
.eval = nft_ng_random_eval,
.init = nft_ng_random_init,
.dump = nft_ng_random_dump,
+ .reduce = nft_ng_random_reduce,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 94b2327e71dc..5d8d91b3904d 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -91,6 +91,7 @@ static const struct nft_expr_ops nft_objref_ops = {
.activate = nft_objref_activate,
.deactivate = nft_objref_deactivate,
.dump = nft_objref_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_objref_map {
@@ -204,6 +205,7 @@ static const struct nft_expr_ops nft_objref_map_ops = {
.deactivate = nft_objref_map_deactivate,
.destroy = nft_objref_map_destroy,
.dump = nft_objref_map_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index d82677e83400..adacf95b6e2b 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -51,7 +51,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s",
data.genre, data.version);
else
- strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
+ strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
}
@@ -99,7 +99,7 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags)))
+ if (nla_put_u32(skb, NFTA_OSF_FLAGS, ntohl((__force __be32)priv->flags)))
goto nla_put_failure;
if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
@@ -115,9 +115,45 @@ static int nft_osf_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data)
{
- return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
- (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_FORWARD));
+ unsigned int hooks;
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_FORWARD);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static bool nft_osf_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_osf *priv = nft_expr_priv(expr);
+ struct nft_osf *osf;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ osf = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->flags != osf->flags ||
+ priv->ttl != osf->ttl) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
}
static struct nft_expr_type nft_osf_type;
@@ -128,6 +164,7 @@ static const struct nft_expr_ops nft_osf_op = {
.dump = nft_osf_dump,
.type = &nft_osf_type,
.validate = nft_osf_validate,
+ .reduce = nft_osf_reduce,
};
static struct nft_expr_type nft_osf_type __read_mostly = {
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index bd689938a2e0..4edd899aeb9b 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -83,7 +83,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
{
unsigned int thoff = nft_thoff(pkt);
- if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
return -1;
switch (pkt->tprot) {
@@ -147,7 +147,7 @@ void nft_payload_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
goto err;
offset = nft_thoff(pkt);
break;
@@ -157,7 +157,8 @@ void nft_payload_eval(const struct nft_expr *expr,
goto err;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
offset += priv->offset;
@@ -172,10 +173,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 },
};
@@ -209,6 +210,31 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct nft_payload *payload;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ payload = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->base != payload->base ||
+ priv->offset != payload->offset ||
+ priv->len != payload->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
static bool nft_payload_offload_mask(struct nft_offload_reg *reg,
u32 priv_len, u32 field_len)
{
@@ -512,6 +538,7 @@ static const struct nft_expr_ops nft_payload_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
.offload = nft_payload_offload,
};
@@ -521,6 +548,7 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
.offload = nft_payload_offload,
};
@@ -546,6 +574,9 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
struct sk_buff *skb,
unsigned int *l4csum_offset)
{
+ if (pkt->fragoff)
+ return -1;
+
switch (pkt->tprot) {
case IPPROTO_TCP:
*l4csum_offset = offsetof(struct tcphdr, check);
@@ -654,7 +685,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
goto err;
offset = nft_thoff(pkt);
break;
@@ -664,7 +695,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
goto err;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
csum_offset = offset + priv->csum_offset;
@@ -693,7 +725,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP &&
pkt->tprot == IPPROTO_SCTP &&
skb->ip_summed != CHECKSUM_PARTIAL) {
- if (nft_payload_csum_sctp(skb, nft_thoff(pkt)))
+ if (pkt->fragoff == 0 &&
+ nft_payload_csum_sctp(skb, nft_thoff(pkt)))
goto err;
}
@@ -707,17 +740,23 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_payload_set *priv = nft_expr_priv(expr);
+ u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
+ int err;
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
if (tb[NFTA_PAYLOAD_CSUM_TYPE])
- priv->csum_type =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
- if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
- priv->csum_offset =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+ csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
+ if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX,
+ &csum_offset);
+ if (err < 0)
+ return err;
+
+ priv->csum_offset = csum_offset;
+ }
if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
u32 flags;
@@ -728,7 +767,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
priv->csum_flags = flags;
}
- switch (priv->csum_type) {
+ switch (csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
case NFT_PAYLOAD_CSUM_INET:
break;
@@ -742,6 +781,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
default:
return -EOPNOTSUPP;
}
+ priv->csum_type = csum_type;
return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg,
priv->len);
@@ -766,12 +806,32 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_payload_ops &&
+ track->regs[i].selector->ops != &nft_payload_fast_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_payload_set_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)),
.eval = nft_payload_set_eval,
.init = nft_payload_set_init,
.dump = nft_payload_set_dump,
+ .reduce = nft_payload_set_reduce,
};
static const struct nft_expr_ops *
@@ -780,6 +840,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
{
enum nft_payload_bases base;
unsigned int offset, len;
+ int err;
if (tb[NFTA_PAYLOAD_BASE] == NULL ||
tb[NFTA_PAYLOAD_OFFSET] == NULL ||
@@ -806,8 +867,13 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_DREG] == NULL)
return ERR_PTR(-EINVAL);
- offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
- len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len);
+ if (err < 0)
+ return ERR_PTR(err);
if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) &&
base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER)
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 9ba1de51ac07..da29e92c03e2 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -68,6 +68,31 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr,
regs->verdict.code = ret;
}
+static int nft_queue_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ case NFPROTO_BRIDGE:
+ break;
+ case NFPROTO_NETDEV: /* lacks okfn */
+ fallthrough;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, supported_hooks);
+}
+
static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
[NFTA_QUEUE_NUM] = { .type = NLA_U16 },
[NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
@@ -164,6 +189,8 @@ static const struct nft_expr_ops nft_queue_ops = {
.eval = nft_queue_eval,
.init = nft_queue_init,
.dump = nft_queue_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_queue_sreg_ops = {
@@ -172,6 +199,8 @@ static const struct nft_expr_ops nft_queue_sreg_ops = {
.eval = nft_queue_sreg_eval,
.init = nft_queue_sreg_init,
.dump = nft_queue_sreg_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index c4d1389f7185..e6b0df68feea 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -15,13 +15,13 @@
struct nft_quota {
atomic64_t quota;
unsigned long flags;
- atomic64_t consumed;
+ atomic64_t *consumed;
};
static inline bool nft_overquota(struct nft_quota *priv,
const struct sk_buff *skb)
{
- return atomic64_add_return(skb->len, &priv->consumed) >=
+ return atomic64_add_return(skb->len, priv->consumed) >=
atomic64_read(&priv->quota);
}
@@ -90,13 +90,23 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return -EOPNOTSUPP;
}
+ priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT);
+ if (!priv->consumed)
+ return -ENOMEM;
+
atomic64_set(&priv->quota, quota);
priv->flags = flags;
- atomic64_set(&priv->consumed, consumed);
+ atomic64_set(priv->consumed, consumed);
return 0;
}
+static void nft_quota_do_destroy(const struct nft_ctx *ctx,
+ struct nft_quota *priv)
+{
+ kfree(priv->consumed);
+}
+
static int nft_quota_obj_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
@@ -128,7 +138,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
* that we see, don't go over the quota boundary in what we send to
* userspace.
*/
- consumed = atomic64_read(&priv->consumed);
+ consumed = atomic64_read(priv->consumed);
quota = atomic64_read(&priv->quota);
if (consumed >= quota) {
consumed_cap = quota;
@@ -145,7 +155,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
goto nla_put_failure;
if (reset) {
- atomic64_sub(consumed, &priv->consumed);
+ atomic64_sub(consumed, priv->consumed);
clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags);
}
return 0;
@@ -162,11 +172,20 @@ static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
return nft_quota_do_dump(skb, priv, reset);
}
+static void nft_quota_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_destroy(ctx, priv);
+}
+
static struct nft_object_type nft_quota_obj_type;
static const struct nft_object_ops nft_quota_obj_ops = {
.type = &nft_quota_obj_type,
.size = sizeof(struct nft_quota),
.init = nft_quota_obj_init,
+ .destroy = nft_quota_obj_destroy,
.eval = nft_quota_obj_eval,
.dump = nft_quota_obj_dump,
.update = nft_quota_obj_update,
@@ -205,13 +224,37 @@ static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
return nft_quota_do_dump(skb, priv, false);
}
+static void nft_quota_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_destroy(ctx, priv);
+}
+
+static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_quota *priv_dst = nft_expr_priv(dst);
+
+ priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC);
+ if (!priv_dst->consumed)
+ return -ENOMEM;
+
+ atomic64_set(priv_dst->consumed, 0);
+
+ return 0;
+}
+
static struct nft_expr_type nft_quota_type;
static const struct nft_expr_ops nft_quota_ops = {
.type = &nft_quota_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_quota)),
.eval = nft_quota_eval,
.init = nft_quota_init,
+ .destroy = nft_quota_destroy,
+ .clone = nft_quota_clone,
.dump = nft_quota_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_quota_type __read_mostly = {
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index e4a1c44d7f51..832f0d725a9e 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -51,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
const struct nlattr * const tb[])
{
struct nft_range_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc_from, desc_to;
+ struct nft_data_desc desc_from = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_from),
+ };
+ struct nft_data_desc desc_to = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_to),
+ };
int err;
u32 op;
@@ -61,26 +68,16 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
!tb[NFTA_RANGE_TO_DATA])
return -EINVAL;
- err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
- &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+ err = nft_data_init(NULL, &priv->data_from, &desc_from,
+ tb[NFTA_RANGE_FROM_DATA]);
if (err < 0)
return err;
- if (desc_from.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err1;
- }
-
- err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
- &desc_to, tb[NFTA_RANGE_TO_DATA]);
+ err = nft_data_init(NULL, &priv->data_to, &desc_to,
+ tb[NFTA_RANGE_TO_DATA]);
if (err < 0)
goto err1;
- if (desc_to.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err2;
- }
-
if (desc_from.len != desc_to.len) {
err = -EINVAL;
goto err2;
@@ -140,6 +137,7 @@ static const struct nft_expr_ops nft_range_ops = {
.eval = nft_range_eval,
.init = nft_range_init,
.dump = nft_range_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_range_type __read_mostly = {
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index ba09890dddb5..5086adfe731c 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -134,6 +134,7 @@ static const struct nft_expr_ops nft_redir_ipv4_ops = {
.destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
@@ -183,6 +184,7 @@ static const struct nft_expr_ops nft_redir_ipv6_ops = {
.destroy = nft_redir_ipv6_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
@@ -225,6 +227,7 @@ static const struct nft_expr_ops nft_redir_inet_ops = {
.destroy = nft_redir_inet_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_inet_type __read_mostly = {
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 554caf967baa..973fa31a9dd6 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -80,6 +80,7 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
.init = nft_reject_init,
.dump = nft_reject_dump,
.validate = nft_reject_inet_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_inet_type __read_mostly = {
diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c
index d89f68754f42..7865cd8b11bb 100644
--- a/net/netfilter/nft_reject_netdev.c
+++ b/net/netfilter/nft_reject_netdev.c
@@ -4,6 +4,7 @@
* Copyright (c) 2020 Jose M. Guisado <guigom@riseup.net>
*/
+#include <linux/etherdevice.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -158,6 +159,7 @@ static const struct nft_expr_ops nft_reject_netdev_ops = {
.init = nft_reject_init,
.dump = nft_reject_dump,
.validate = nft_reject_netdev_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_netdev_type __read_mostly = {
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index bcd01a63e38f..71931ec91721 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -191,6 +191,7 @@ static const struct nft_expr_ops nft_rt_get_ops = {
.init = nft_rt_get_init,
.dump = nft_rt_get_dump,
.validate = nft_rt_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_rt_type __read_mostly = {
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index e7ae5914971e..96081ac8d2b4 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -21,7 +21,7 @@ struct nft_bitmap_elem {
* the element state in the current and the future generation.
*
* An element can be in three states. The generation cursor is represented using
- * the ^ character, note that this cursor shifts on every succesful transaction.
+ * the ^ character, note that this cursor shifts on every successful transaction.
* If no transaction is going on, we observe all elements are in the following
* state:
*
@@ -39,7 +39,7 @@ struct nft_bitmap_elem {
* 10 = this element is active in the current generation and it becomes inactive
* ^ in the next one. This happens when the element is deactivated but commit
* path has not yet been executed yet, so removal is still pending. On
- * transation abortion, the next generation bit is reset to go back to
+ * transaction abortion, the next generation bit is reset to go back to
* restore its previous state.
*/
struct nft_bitmap {
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index df40314de21f..76de6c8d9865 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -143,6 +143,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
/* Another cpu may race to insert the element with the same key */
if (prev) {
nft_set_elem_destroy(set, he, true);
+ atomic_dec(&set->nelems);
he = prev;
}
@@ -152,6 +153,7 @@ out:
err2:
nft_set_elem_destroy(set, he, true);
+ atomic_dec(&set->nelems);
err1:
return false;
}
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index dce866d93fee..4f9299b9dcdd 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1290,6 +1290,11 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
if (!new->scratch_aligned)
goto out_scratch;
#endif
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(new->scratch, i) = NULL;
+
+ if (pipapo_realloc_scratch(new, old->bsize_max))
+ goto out_scratch_realloc;
rcu_head_init(&new->rcu);
@@ -1334,6 +1339,9 @@ out_lt:
kvfree(dst->lt);
dst--;
}
+out_scratch_realloc:
+ for_each_possible_cpu(i)
+ kfree(*per_cpu_ptr(new->scratch, i));
#ifdef NFT_PIPAPO_ALIGN
free_percpu(new->scratch_aligned);
#endif
@@ -2117,6 +2125,32 @@ out_scratch:
}
/**
+ * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array
+ * @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
+ */
+static void nft_set_pipapo_match_destroy(const struct nft_set *set,
+ struct nft_pipapo_match *m)
+{
+ struct nft_pipapo_field *f;
+ int i, r;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
+
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
+
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
+
+ e = f->mt[r].e;
+
+ nft_set_elem_destroy(set, e, true);
+ }
+}
+
+/**
* nft_pipapo_destroy() - Free private data for set and all committed elements
* @set: nftables API set representation
*/
@@ -2124,26 +2158,13 @@ static void nft_pipapo_destroy(const struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
- int i, r, cpu;
+ int cpu;
m = rcu_dereference_protected(priv->match, true);
if (m) {
rcu_barrier();
- for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
- ;
-
- for (r = 0; r < f->rules; r++) {
- struct nft_pipapo_elem *e;
-
- if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
- continue;
-
- e = f->mt[r].e;
-
- nft_set_elem_destroy(set, e, true);
- }
+ nft_set_pipapo_match_destroy(set, m);
#ifdef NFT_PIPAPO_ALIGN
free_percpu(m->scratch_aligned);
@@ -2157,6 +2178,11 @@ static void nft_pipapo_destroy(const struct nft_set *set)
}
if (priv->clone) {
+ m = priv->clone;
+
+ if (priv->dirty)
+ nft_set_pipapo_match_destroy(set, m);
+
#ifdef NFT_PIPAPO_ALIGN
free_percpu(priv->clone->scratch_aligned);
#endif
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index 6f4116e72958..52e0d026d30a 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1048,11 +1048,9 @@ static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
- unsigned long *lt = f->lt, bsize = f->bsize;
+ unsigned long bsize = f->bsize;
int i, ret = -1, b;
- lt += offset * NFT_PIPAPO_LONGS_PER_M256;
-
if (first)
memset(map, 0xff, bsize * sizeof(*map));
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index d600a566da32..7325bee7d144 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -349,7 +349,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
*ext = &rbe->ext;
return -EEXIST;
} else {
- p = &parent->rb_left;
+ overlap = false;
+ if (nft_rbtree_interval_end(rbe))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
}
}
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index d601974c9d2e..49a5348a6a14 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -10,6 +10,7 @@
struct nft_socket {
enum nft_socket_keys key:8;
u8 level;
+ u8 len;
union {
u8 dreg;
};
@@ -36,24 +37,50 @@ static void nft_socket_wildcard(const struct nft_pktinfo *pkt,
#ifdef CONFIG_SOCK_CGROUP_DATA
static noinline bool
-nft_sock_get_eval_cgroupv2(u32 *dest, const struct nft_pktinfo *pkt, u32 level)
+nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
{
- struct sock *sk = skb_to_full_sk(pkt->skb);
struct cgroup *cgrp;
+ u64 cgid;
- if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+ if (!sk_fullsock(sk))
return false;
- cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- if (level > cgrp->level)
+ cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level);
+ if (!cgrp)
return false;
- memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64));
-
+ cgid = cgroup_id(cgrp);
+ memcpy(dest, &cgid, sizeof(u64));
return true;
}
#endif
+static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
+{
+ const struct net_device *indev = nft_in(pkt);
+ const struct sk_buff *skb = pkt->skb;
+ struct sock *sk = NULL;
+
+ if (!indev)
+ return NULL;
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, indev);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, indev);
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return sk;
+}
+
static void nft_socket_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -67,20 +94,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
sk = NULL;
if (!sk)
- switch(nft_pf(pkt)) {
- case NFPROTO_IPV4:
- sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
- break;
-#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
- case NFPROTO_IPV6:
- sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
- break;
-#endif
- default:
- WARN_ON_ONCE(1);
- regs->verdict.code = NFT_BREAK;
- return;
- }
+ sk = nft_socket_do_lookup(pkt);
if (!sk) {
regs->verdict.code = NFT_BREAK;
@@ -108,7 +122,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
break;
#ifdef CONFIG_SOCK_CGROUP_DATA
case NFT_SOCKET_CGROUPV2:
- if (!nft_sock_get_eval_cgroupv2(dest, pkt, priv->level)) {
+ if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -150,7 +164,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_SOCKET_KEY]));
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
case NFT_SOCKET_WILDCARD:
@@ -166,7 +180,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
if (!tb[NFTA_SOCKET_LEVEL])
return -EINVAL;
- level = ntohl(nla_get_u32(tb[NFTA_SOCKET_LEVEL]));
+ level = ntohl(nla_get_be32(tb[NFTA_SOCKET_LEVEL]));
if (level > 255)
return -EOPNOTSUPP;
@@ -179,6 +193,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
+ priv->len = len;
return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg,
NULL, NFT_DATA_VALUE, len);
}
@@ -188,16 +203,51 @@ static int nft_socket_dump(struct sk_buff *skb,
{
const struct nft_socket *priv = nft_expr_priv(expr);
- if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+ if (nla_put_be32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
return -1;
if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
return -1;
if (priv->key == NFT_SOCKET_CGROUPV2 &&
- nla_put_u32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level)))
+ nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level)))
return -1;
return 0;
}
+static bool nft_socket_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_socket *priv = nft_expr_priv(expr);
+ const struct nft_socket *socket;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ socket = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != socket->key ||
+ priv->dreg != socket->dreg ||
+ priv->level != socket->level) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+static int nft_socket_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT));
+}
+
static struct nft_expr_type nft_socket_type;
static const struct nft_expr_ops nft_socket_ops = {
.type = &nft_socket_type,
@@ -205,6 +255,8 @@ static const struct nft_expr_ops nft_socket_ops = {
.eval = nft_socket_eval,
.init = nft_socket_init,
.dump = nft_socket_dump,
+ .validate = nft_socket_validate,
+ .reduce = nft_socket_reduce,
};
static struct nft_expr_type nft_socket_type __read_mostly = {
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index a0109fa1e92d..6cf9a04fbfe2 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -191,8 +191,10 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx,
if (err)
goto nf_ct_failure;
err = nf_synproxy_ipv6_init(snet, ctx->net);
- if (err)
+ if (err) {
+ nf_synproxy_ipv4_fini(snet, ctx->net);
goto nf_ct_failure;
+ }
break;
}
@@ -286,6 +288,7 @@ static const struct nft_expr_ops nft_synproxy_ops = {
.dump = nft_synproxy_dump,
.type = &nft_synproxy_type,
.validate = nft_synproxy_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_synproxy_type __read_mostly = {
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index b5b09a902c7a..62da25ad264b 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -52,11 +52,11 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
if (priv->sreg_addr)
- taddr = regs->data[priv->sreg_addr];
+ taddr = nft_reg_load_be32(&regs->data[priv->sreg_addr]);
taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
if (priv->sreg_port)
- tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -124,7 +124,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
if (priv->sreg_port)
- tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -312,6 +312,13 @@ static int nft_tproxy_dump(struct sk_buff *skb,
return 0;
}
+static int nft_tproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
+}
+
static struct nft_expr_type nft_tproxy_type;
static const struct nft_expr_ops nft_tproxy_ops = {
.type = &nft_tproxy_type,
@@ -320,6 +327,8 @@ static const struct nft_expr_ops nft_tproxy_ops = {
.init = nft_tproxy_init,
.destroy = nft_tproxy_destroy,
.dump = nft_tproxy_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .validate = nft_tproxy_validate,
};
static struct nft_expr_type nft_tproxy_type __read_mostly = {
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 3b27926d5382..983ade4be3b3 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -17,6 +17,7 @@ struct nft_tunnel {
enum nft_tunnel_keys key:8;
u8 dreg;
enum nft_tunnel_mode mode:8;
+ u8 len;
};
static void nft_tunnel_get_eval(const struct nft_expr *expr,
@@ -101,6 +102,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
priv->mode = NFT_TUNNEL_MODE_NONE;
}
+ priv->len = len;
return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg,
NULL, NFT_DATA_VALUE, len);
}
@@ -122,6 +124,31 @@ nla_put_failure:
return -1;
}
+static bool nft_tunnel_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+ const struct nft_tunnel *tunnel;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ tunnel = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != tunnel->key ||
+ priv->dreg != tunnel->dreg ||
+ priv->mode != tunnel->mode) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+
static struct nft_expr_type nft_tunnel_type;
static const struct nft_expr_ops nft_tunnel_get_ops = {
.type = &nft_tunnel_type,
@@ -129,10 +156,12 @@ static const struct nft_expr_ops nft_tunnel_get_ops = {
.eval = nft_tunnel_get_eval,
.init = nft_tunnel_get_init,
.dump = nft_tunnel_get_dump,
+ .reduce = nft_tunnel_get_reduce,
};
static struct nft_expr_type nft_tunnel_type __read_mostly = {
.name = "tunnel",
+ .family = NFPROTO_NETDEV,
.ops = &nft_tunnel_get_ops,
.policy = nft_tunnel_policy,
.maxattr = NFTA_TUNNEL_MAX,
@@ -355,8 +384,9 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
struct ip_tunnel_info *info,
struct nft_tunnel_opts *opts)
{
- int err, rem, type = 0;
struct nlattr *nla;
+ __be16 type = 0;
+ int err, rem;
err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
nft_tunnel_opts_policy, NULL);
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index cbbbc4ecad3a..1c5343c936a8 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -27,6 +27,7 @@ struct nft_xfrm {
u8 dreg;
u8 dir;
u8 spnum;
+ u8 len;
};
static int nft_xfrm_get_init(const struct nft_ctx *ctx,
@@ -50,7 +51,7 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY]));
switch (priv->key) {
case NFT_XFRM_KEY_REQID:
case NFT_XFRM_KEY_SPI:
@@ -86,6 +87,7 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
priv->spnum = spnum;
+ priv->len = len;
return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg,
NULL, NFT_DATA_VALUE, len);
}
@@ -132,13 +134,13 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
WARN_ON_ONCE(1);
break;
case NFT_XFRM_KEY_DADDR_IP4:
- *dest = state->id.daddr.a4;
+ *dest = (__force __u32)state->id.daddr.a4;
return;
case NFT_XFRM_KEY_DADDR_IP6:
memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr));
return;
case NFT_XFRM_KEY_SADDR_IP4:
- *dest = state->props.saddr.a4;
+ *dest = (__force __u32)state->props.saddr.a4;
return;
case NFT_XFRM_KEY_SADDR_IP6:
memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr));
@@ -147,7 +149,7 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
*dest = state->props.reqid;
return;
case NFT_XFRM_KEY_SPI:
- *dest = state->id.spi;
+ *dest = (__force __u32)state->id.spi;
return;
}
@@ -252,6 +254,31 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+static bool nft_xfrm_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+ const struct nft_xfrm *xfrm;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ xfrm = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != xfrm->key ||
+ priv->dreg != xfrm->dreg ||
+ priv->dir != xfrm->dir ||
+ priv->spnum != xfrm->spnum) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
static struct nft_expr_type nft_xfrm_type;
static const struct nft_expr_ops nft_xfrm_get_ops = {
@@ -261,6 +288,7 @@ static const struct nft_expr_ops nft_xfrm_get_ops = {
.init = nft_xfrm_get_init,
.dump = nft_xfrm_get_dump,
.validate = nft_xfrm_validate,
+ .reduce = nft_xfrm_reduce,
};
static struct nft_expr_type nft_xfrm_type __read_mostly = {
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 25524e393349..470282cf3fae 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -766,7 +766,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
msize += off;
m->u.user.match_size = msize;
- strlcpy(name, match->name, sizeof(name));
+ strscpy(name, match->name, sizeof(name));
module_put(match->me);
strncpy(m->u.user.name, name, sizeof(m->u.user.name));
@@ -1146,7 +1146,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
tsize += off;
t->u.user.target_size = tsize;
- strlcpy(name, target->name, sizeof(name));
+ strscpy(name, target->name, sizeof(name));
module_put(target->me);
strncpy(t->u.user.name, name, sizeof(t->u.user.name));
@@ -1517,7 +1517,7 @@ EXPORT_SYMBOL_GPL(xt_unregister_table);
#ifdef CONFIG_PROC_FS
static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
{
- u8 af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
struct xt_pernet *xt_net;
@@ -1529,7 +1529,7 @@ static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- u8 af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
struct xt_pernet *xt_net;
@@ -1540,7 +1540,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void xt_table_seq_stop(struct seq_file *seq, void *v)
{
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ u_int8_t af = (unsigned long)pde_data(file_inode(seq->file));
mutex_unlock(&xt[af].mutex);
}
@@ -1584,7 +1584,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
[MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
[MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
};
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
if (ppos != NULL)
@@ -1633,7 +1633,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
{
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
switch (trav->class) {
@@ -1827,7 +1827,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
root_uid = make_kuid(net->user_ns, 0);
root_gid = make_kgid(net->user_ns, 0);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops,
sizeof(struct seq_net_private),
@@ -1837,7 +1837,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_match_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1847,7 +1847,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_target_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1862,12 +1862,12 @@ int xt_proto_init(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
out_remove_matches:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out_remove_tables:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out:
@@ -1881,15 +1881,15 @@ void xt_proto_fini(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
char buf[XT_FUNCTION_MAXNAMELEN];
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
#endif /*CONFIG_PROC_FS*/
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 0a913ce07425..2be2f7a7b60f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -24,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
return XT_CONTINUE;
if (ct) {
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
} else {
nf_ct_set(skb, ct, IP_CT_UNTRACKED);
@@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
return -ENOMEM;
}
- help->helper = helper;
+ rcu_assign_pointer(help->helper, helper);
return 0;
}
@@ -136,6 +136,21 @@ static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
}
}
+static void xt_ct_put_helper(struct nf_conn_help *help)
+{
+ struct nf_conntrack_helper *helper;
+
+ if (!help)
+ return;
+
+ /* not yet exposed to other cpus, or ruleset
+ * already detached (post-replacement).
+ */
+ helper = rcu_dereference_raw(help->helper);
+ if (helper)
+ nf_conntrack_helper_put(helper);
+}
+
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
struct xt_ct_target_info_v1 *info)
{
@@ -201,15 +216,13 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err4;
}
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
out:
info->ct = ct;
return 0;
err4:
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
err3:
nf_ct_tmpl_free(ct);
err2:
@@ -271,8 +284,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (ct) {
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
nf_ct_netns_put(par->net, par->family);
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index eababc354ff1..cfa44515ab72 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -24,6 +24,8 @@ MODULE_ALIAS("ip6t_DSCP");
MODULE_ALIAS("ipt_TOS");
MODULE_ALIAS("ip6t_TOS");
+#define XT_DSCP_ECN_MASK 3u
+
static unsigned int
dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -34,8 +36,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
- ipv4_change_dsfield(ip_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
@@ -52,8 +53,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return NF_DROP;
- ipv6_change_dsfield(ipv6_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
return XT_CONTINUE;
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 8aec1b529364..80f6624e2355 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -144,7 +144,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
goto err1;
gnet_stats_basic_sync_init(&est->bstats);
- strlcpy(est->name, info->name, sizeof(est->name));
+ strscpy(est->name, info->name, sizeof(est->name));
spin_lock_init(&est->lock);
est->refcnt = 1;
est->params.interval = info->interval;
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 122db9fbb9f4..116a885adb3c 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -239,8 +239,8 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
oldlen = ipv6h->payload_len;
newlen = htons(ntohs(oldlen) + ret);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(csum_sub(skb->csum, oldlen),
- newlen);
+ skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen),
+ (__force __wsum)newlen);
ipv6h->payload_len = newlen;
}
return XT_CONTINUE;
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 459d0696c91a..e4bea1d346cf 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -74,18 +74,10 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
-
- pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->daddr, ntohs(hp->dest),
- &laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
return NF_DROP;
}
@@ -122,16 +114,12 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
int tproto;
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
- pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ if (tproto < 0)
return NF_DROP;
- }
hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ if (!hp)
return NF_DROP;
- }
/* check if there's an ongoing connection on the packet
* addresses, this happens if the redirect already happened
@@ -168,19 +156,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
-
- pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
-
return NF_DROP;
}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 46fcac75f726..5d04ef80a61d 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -62,10 +62,10 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
key[4] = zone->id;
} else {
const struct iphdr *iph = ip_hdr(skb);
- key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
- iph->daddr : iph->saddr;
- key[0] &= info->mask.ip;
+ key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
+ (__force __u32)iph->daddr : (__force __u32)iph->saddr;
+ key[0] &= (__force __u32)info->mask.ip;
key[1] = zone->id;
}
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9c5cfd74a0ee..0859b8f76764 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1052,7 +1052,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
static void *dl_seq_start(struct seq_file *s, loff_t *pos)
__acquires(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket;
spin_lock_bh(&htable->lock);
@@ -1069,7 +1069,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
*pos = ++(*bucket);
@@ -1083,7 +1083,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void dl_seq_stop(struct seq_file *s, void *v)
__releases(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
if (!IS_ERR(bucket))
@@ -1125,7 +1125,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1140,7 +1140,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1155,7 +1155,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1169,7 +1169,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_show_v2(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = (unsigned int *)v;
struct dsthash_ent *ent;
@@ -1183,7 +1183,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
static int dl_seq_show_v1(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
@@ -1197,7 +1197,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
static int dl_seq_show(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 0446307516cd..7ddb9a78e3fc 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -551,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file)
if (st == NULL)
return -ENOMEM;
- st->table = PDE_DATA(inode);
+ st->table = pde_data(inode);
return 0;
}
@@ -559,7 +559,7 @@ static ssize_t
recent_mt_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *loff)
{
- struct recent_table *t = PDE_DATA(file_inode(file));
+ struct recent_table *t = pde_data(file_inode(file));
struct recent_entry *e;
char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
const char *c = buf;
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 5e6459e11605..7013f55f05d1 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -220,8 +220,10 @@ static void socket_mt_destroy(const struct xt_mtdtor_param *par)
{
if (par->family == NFPROTO_IPV4)
nf_defrag_ipv4_disable(par->net);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
else if (par->family == NFPROTO_IPV6)
- nf_defrag_ipv4_disable(par->net);
+ nf_defrag_ipv6_disable(par->net);
+#endif
}
static struct xt_match socket_mt_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 203e24ae472c..b26c1dcfc27b 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -34,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
switch (info->mode) {
case XT_STATISTIC_MODE_RANDOM:
- if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)
+ if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability)
ret = !ret;
break;
case XT_STATISTIC_MODE_NTH: