aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/nf_conntrack-sysctl.rst6
-rw-r--r--include/net/netfilter/nf_conntrack_core.h1
-rw-r--r--include/net/netns/conntrack.h1
-rw-r--r--include/uapi/linux/netfilter/nfnetlink_log.h2
-rw-r--r--include/uapi/linux/netfilter/nfnetlink_queue.h4
-rw-r--r--net/netfilter/nf_conntrack_core.c11
-rw-r--r--net/netfilter/nf_conntrack_netlink.c3
-rw-r--r--net/netfilter/nf_conntrack_proto.c7
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c13
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c69
-rw-r--r--net/netfilter/nf_conntrack_standalone.c10
-rw-r--r--net/netfilter/nf_tables_api.c3
-rw-r--r--net/netfilter/nft_last.c12
-rw-r--r--tools/testing/selftests/netfilter/Makefile2
-rwxr-xr-xtools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh167
15 files changed, 262 insertions, 49 deletions
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 0467b30e4abe..d31ed6c1cb0d 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -110,6 +110,12 @@ nf_conntrack_tcp_be_liberal - BOOLEAN
Be conservative in what you do, be liberal in what you accept from others.
If it's non-zero, we mark only out of window RST segments as INVALID.
+nf_conntrack_tcp_ignore_invalid_rst - BOOLEAN
+ - 0 - disabled (default)
+ - 1 - enabled
+
+ If it's 1, we don't mark out of window RST segments as INVALID.
+
nf_conntrack_tcp_loose - BOOLEAN
- 0 - disabled
- not 0 - enabled (default)
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 09f2efea0b97..13807ea94cd2 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -30,7 +30,6 @@ void nf_conntrack_cleanup_net(struct net *net);
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);
void nf_conntrack_proto_pernet_init(struct net *net);
-void nf_conntrack_proto_pernet_fini(struct net *net);
int nf_conntrack_proto_init(void);
void nf_conntrack_proto_fini(void);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index c3094b83a525..37e5300c7e5a 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -27,6 +27,7 @@ struct nf_tcp_net {
u8 tcp_loose;
u8 tcp_be_liberal;
u8 tcp_max_retrans;
+ u8 tcp_ignore_invalid_rst;
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
unsigned int offload_timeout;
unsigned int offload_pickup;
diff --git a/include/uapi/linux/netfilter/nfnetlink_log.h b/include/uapi/linux/netfilter/nfnetlink_log.h
index 45c8d3b027e0..0af9c113d665 100644
--- a/include/uapi/linux/netfilter/nfnetlink_log.h
+++ b/include/uapi/linux/netfilter/nfnetlink_log.h
@@ -61,7 +61,7 @@ enum nfulnl_attr_type {
NFULA_HWTYPE, /* hardware type */
NFULA_HWHEADER, /* hardware header */
NFULA_HWLEN, /* hardware header length */
- NFULA_CT, /* nf_conntrack_netlink.h */
+ NFULA_CT, /* nfnetlink_conntrack.h */
NFULA_CT_INFO, /* enum ip_conntrack_info */
NFULA_VLAN, /* nested attribute: packet vlan info */
NFULA_L2HDR, /* full L2 header */
diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index bcb2cb5d40b9..aed90c4df0c8 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -51,11 +51,11 @@ enum nfqnl_attr_type {
NFQA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */
NFQA_HWADDR, /* nfqnl_msg_packet_hw */
NFQA_PAYLOAD, /* opaque data payload */
- NFQA_CT, /* nf_conntrack_netlink.h */
+ NFQA_CT, /* nfnetlink_conntrack.h */
NFQA_CT_INFO, /* enum ip_conntrack_info */
NFQA_CAP_LEN, /* __u32 length of captured packet */
NFQA_SKB_INFO, /* __u32 skb meta information */
- NFQA_EXP, /* nf_conntrack_netlink.h */
+ NFQA_EXP, /* nfnetlink_conntrack.h */
NFQA_UID, /* __u32 sk uid */
NFQA_GID, /* __u32 sk gid */
NFQA_SECCTX, /* security context string */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 96ba19fc8155..83c52df85870 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -149,7 +149,15 @@ static void nf_conntrack_all_lock(void)
spin_lock(&nf_conntrack_locks_all_lock);
- nf_conntrack_locks_all = true;
+ /* For nf_contrack_locks_all, only the latest time when another
+ * CPU will see an update is controlled, by the "release" of the
+ * spin_lock below.
+ * The earliest time is not controlled, an thus KCSAN could detect
+ * a race when nf_conntract_lock() reads the variable.
+ * WRITE_ONCE() is used to ensure the compiler will not
+ * optimize the write.
+ */
+ WRITE_ONCE(nf_conntrack_locks_all, true);
for (i = 0; i < CONNTRACK_LOCKS; i++) {
spin_lock(&nf_conntrack_locks[i]);
@@ -2457,7 +2465,6 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_conntrack_proto_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4e1a9dba7077..e81af33b233b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -218,6 +218,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
if (!help)
return 0;
+ rcu_read_lock();
helper = rcu_dereference(help->helper);
if (!helper)
goto out;
@@ -233,9 +234,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
nla_nest_end(skb, nest_helper);
out:
+ rcu_read_unlock();
return 0;
nla_put_failure:
+ rcu_read_unlock();
return -1;
}
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 55647409a9be..8f7a9837349c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -697,13 +697,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
#endif
}
-void nf_conntrack_proto_pernet_fini(struct net *net)
-{
-#ifdef CONFIG_NF_CT_PROTO_GRE
- nf_ct_gre_keymap_flush(net);
-#endif
-}
-
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index db11e403d818..728eeb0aea87 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
return &net->ct.nf_ct_proto.gre;
}
-void nf_ct_gre_keymap_flush(struct net *net)
-{
- struct nf_gre_net *net_gre = gre_pernet(net);
- struct nf_ct_gre_keymap *km, *tmp;
-
- spin_lock_bh(&keymap_lock);
- list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
- list_del_rcu(&km->list);
- kfree_rcu(km, rcu);
- }
- spin_unlock_bh(&keymap_lock);
-}
-
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
const struct nf_conntrack_tuple *t)
{
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index f7e8baf59b51..3259416f2ea4 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -823,6 +823,22 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
return true;
}
+static bool tcp_can_early_drop(const struct nf_conn *ct)
+{
+ switch (ct->proto.tcp.state) {
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ case TCP_CONNTRACK_CLOSE:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
/* Returns verdict for packet, or -1 for invalid. */
int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -1030,10 +1046,30 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
if (index != TCP_RST_SET)
break;
- if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
+ /* If we are closing, tuple might have been re-used already.
+ * last_index, last_ack, and all other ct fields used for
+ * sequence/window validation are outdated in that case.
+ *
+ * As the conntrack can already be expired by GC under pressure,
+ * just skip validation checks.
+ */
+ if (tcp_can_early_drop(ct))
+ goto in_window;
+
+ /* td_maxack might be outdated if we let a SYN through earlier */
+ if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
+ ct->proto.tcp.last_index != TCP_SYN_SET) {
u32 seq = ntohl(th->seq);
- if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
+ /* If we are not in established state and SEQ=0 this is most
+ * likely an answer to a SYN we let go through above (last_index
+ * can be updated due to out-of-order ACKs).
+ */
+ if (seq == 0 && !nf_conntrack_tcp_established(ct))
+ break;
+
+ if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
+ !tn->tcp_ignore_invalid_rst) {
/* Invalid RST */
spin_unlock_bh(&ct->lock);
nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
@@ -1134,6 +1170,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
+
+ if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
+ /* do not renew timeout on SYN retransmit.
+ *
+ * Else port reuse by client or NAT middlebox can keep
+ * entry alive indefinitely (including nat info).
+ */
+ return NF_ACCEPT;
+ }
+
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
* pickup with loose=1. Avoid large ESTABLISHED timeout.
*/
@@ -1155,22 +1201,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-static bool tcp_can_early_drop(const struct nf_conn *ct)
-{
- switch (ct->proto.tcp.state) {
- case TCP_CONNTRACK_FIN_WAIT:
- case TCP_CONNTRACK_LAST_ACK:
- case TCP_CONNTRACK_TIME_WAIT:
- case TCP_CONNTRACK_CLOSE:
- case TCP_CONNTRACK_CLOSE_WAIT:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -1437,6 +1467,9 @@ void nf_conntrack_tcp_init_net(struct net *net)
*/
tn->tcp_be_liberal = 0;
+ /* If it's non-zero, we turn off RST sequence number check */
+ tn->tcp_ignore_invalid_rst = 0;
+
/* Max number of the retransmitted packets without receiving an (acceptable)
* ACK from the destination. If this number is reached, a shorter timer
* will be started.
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index f57a951c9b5e..214d9f9e499b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -579,6 +579,7 @@ enum nf_ct_sysctl_index {
#endif
NF_SYSCTL_CT_PROTO_TCP_LOOSE,
NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+ NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
@@ -798,6 +799,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
+ .procname = "nf_conntrack_tcp_ignore_invalid_rst",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
.procname = "nf_conntrack_tcp_max_retrans",
.maxlen = sizeof(u8),
@@ -1004,6 +1013,7 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
XASSIGN(LOOSE, &tn->tcp_loose);
XASSIGN(LIBERAL, &tn->tcp_be_liberal);
XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
+ XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
#undef XASSIGN
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 390d4466567f..de182d1f7c4e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3446,7 +3446,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
return 0;
err_destroy_flow_rule:
- nft_flow_rule_destroy(flow);
+ if (flow)
+ nft_flow_rule_destroy(flow);
err_release_rule:
nf_tables_rule_release(&ctx, rule);
err_release_expr:
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 913ac45167f2..8088b99f2ee3 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -23,15 +23,21 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
{
struct nft_last_priv *priv = nft_expr_priv(expr);
u64 last_jiffies;
+ u32 last_set = 0;
int err;
- if (tb[NFTA_LAST_MSECS]) {
+ if (tb[NFTA_LAST_SET]) {
+ last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
+ if (last_set == 1)
+ priv->last_set = 1;
+ }
+
+ if (last_set && tb[NFTA_LAST_MSECS]) {
err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
if (err < 0)
return err;
- priv->last_jiffies = jiffies + (unsigned long)last_jiffies;
- priv->last_set = 1;
+ priv->last_jiffies = jiffies - (unsigned long)last_jiffies;
}
return 0;
diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile
index cd6430b39982..8748199ac109 100644
--- a/tools/testing/selftests/netfilter/Makefile
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -5,7 +5,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
nft_concat_range.sh nft_conntrack_helper.sh \
nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
- ipip-conntrack-mtu.sh
+ ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh
LDLIBS = -lmnl
TEST_GEN_FILES = nf-queue
diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh
new file mode 100755
index 000000000000..e7d7bf13cff5
--- /dev/null
+++ b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check that UNREPLIED tcp conntrack will eventually timeout.
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+waittime=20
+sfx=$(mktemp -u "XXXXXXXX")
+ns1="ns1-$sfx"
+ns2="ns2-$sfx"
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without nft tool"
+ exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+cleanup() {
+ ip netns pids $ns1 | xargs kill 2>/dev/null
+ ip netns pids $ns2 | xargs kill 2>/dev/null
+
+ ip netns del $ns1
+ ip netns del $ns2
+}
+
+ipv4() {
+ echo -n 192.168.$1.2
+}
+
+check_counter()
+{
+ ns=$1
+ name=$2
+ expect=$3
+ local lret=0
+
+ cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2
+ ip netns exec $ns2 nft list counter inet filter "$name" 1>&2
+ lret=1
+ fi
+
+ return $lret
+}
+
+# Create test namespaces
+ip netns add $ns1 || exit 1
+
+trap cleanup EXIT
+
+ip netns add $ns2 || exit 1
+
+# Connect the namespace to the host using a veth pair
+ip -net $ns1 link add name veth1 type veth peer name veth2
+ip -net $ns1 link set netns $ns2 dev veth2
+
+ip -net $ns1 link set up dev lo
+ip -net $ns2 link set up dev lo
+ip -net $ns1 link set up dev veth1
+ip -net $ns2 link set up dev veth2
+
+ip -net $ns2 addr add 10.11.11.2/24 dev veth2
+ip -net $ns2 route add default via 10.11.11.1
+
+ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1
+
+# add a rule inside NS so we enable conntrack
+ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT
+
+ip -net $ns1 addr add 10.11.11.1/24 dev veth1
+ip -net $ns1 route add 10.99.99.99 via 10.11.11.2
+
+# Check connectivity works
+ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1
+
+ip netns exec $ns2 nc -l -p 8080 < /dev/null &
+
+# however, conntrack entries are there
+
+ip netns exec $ns2 nft -f - <<EOF
+table inet filter {
+ counter connreq { }
+ counter redir { }
+ chain input {
+ type filter hook input priority 0; policy accept;
+ ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
+ ct state new ct status dnat tcp dport 8080 counter name "redir" accept
+ }
+}
+EOF
+if [ $? -ne 0 ]; then
+ echo "ERROR: Could not load nft rules"
+ exit 1
+fi
+
+ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10
+
+echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
+ip netns exec $ns1 bash -c 'while true ; do
+ nc -p 60000 10.99.99.99 80
+ sleep 1
+ done' &
+
+sleep 1
+
+ip netns exec $ns2 nft -f - <<EOF
+table inet nat {
+ chain prerouting {
+ type nat hook prerouting priority 0; policy accept;
+ ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
+ }
+}
+EOF
+if [ $? -ne 0 ]; then
+ echo "ERROR: Could not load nat redirect"
+ exit 1
+fi
+
+count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
+if [ $count -eq 0 ]; then
+ echo "ERROR: $ns2 did not pick up tcp connection from peer"
+ exit 1
+fi
+
+echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect"
+for i in $(seq 1 $waittime); do
+ echo -n "."
+
+ sleep 1
+
+ count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
+ if [ $count -gt 0 ]; then
+ echo
+ echo "PASS: redirection took effect after $i seconds"
+ break
+ fi
+
+ m=$((i%20))
+ if [ $m -eq 0 ]; then
+ echo " waited for $i seconds"
+ fi
+done
+
+expect="packets 1 bytes 60"
+check_counter "$ns2" "redir" "$expect"
+if [ $? -ne 0 ]; then
+ ret=1
+fi
+
+if [ $ret -eq 0 ];then
+ echo "PASS: redirection counter has expected values"
+else
+ echo "ERROR: no tcp connection was redirected"
+fi
+
+exit $ret