aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2022-04-27 10:45:55 +0100
committerDavid S. Miller <davem@davemloft.net>2022-04-27 10:45:55 +0100
commit124de27101ffec33a1eb6ccf0eff89a1a9b999a8 (patch)
treef1266f4174403249ce05925e3404f45e0dea8b4d
parentnet: stmmac: dwmac-imx: comment spelling fix (diff)
parentselftests: mptcp: print extra msg in chk_csum_nr (diff)
downloadlinux-dev-124de27101ffec33a1eb6ccf0eff89a1a9b999a8.tar.xz
linux-dev-124de27101ffec33a1eb6ccf0eff89a1a9b999a8.zip
Merge branch 'mptcp-MP_FAIL-timeout'
Mat Martineau says: ==================== mptcp: Timeout for MP_FAIL response When one peer sends an infinite mapping to coordinate fallback from MPTCP to regular TCP, the other peer is expected to send a packet with the MPTCP MP_FAIL option to acknowledge the infinite mapping. Rather than leave the connection in some half-fallback state, this series adds a timeout after which the infinite mapping sender will reset the connection. Patch 1 adds a fallback self test. Patches 2-5 make use of the MPTCP socket's retransmit timer to reset the MPTCP connection if no MP_FAIL was received. Patches 6 and 7 extends the self test to check MP_FAIL-related MIBs. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/mptcp/pm.c18
-rw-r--r--net/mptcp/protocol.c64
-rw-r--r--net/mptcp/protocol.h2
-rw-r--r--net/mptcp/subflow.c13
-rw-r--r--tools/testing/selftests/net/mptcp/config8
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_join.sh119
6 files changed, 216 insertions, 8 deletions
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 5c36870d3420..14f448d82bb2 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -287,11 +287,27 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct sock *s = (struct sock *)msk;
pr_debug("fail_seq=%llu", fail_seq);
- if (!mptcp_has_another_subflow(sk) && READ_ONCE(msk->allow_infinite_fallback))
+ if (mptcp_has_another_subflow(sk) || !READ_ONCE(msk->allow_infinite_fallback))
+ return;
+
+ if (!READ_ONCE(subflow->mp_fail_response_expect)) {
+ pr_debug("send MP_FAIL response and infinite map");
+
+ subflow->send_mp_fail = 1;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
subflow->send_infinite_map = 1;
+ } else if (s && inet_sk_state_load(s) != TCP_CLOSE) {
+ pr_debug("MP_FAIL response received");
+
+ mptcp_data_lock(s);
+ if (inet_sk_state_load(s) != TCP_CLOSE)
+ sk_stop_timer(s, &s->sk_timer);
+ mptcp_data_unlock(s);
+ }
}
/* path manager helpers */
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4581c570ef68..a5d466e6b538 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1605,8 +1605,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
out:
/* ensure the rtx timer is running */
+ mptcp_data_lock(sk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
+ mptcp_data_unlock(sk);
if (copied)
__mptcp_check_send_data_fin(sk);
}
@@ -2167,10 +2169,38 @@ static void mptcp_retransmit_timer(struct timer_list *t)
sock_put(sk);
}
+static struct mptcp_subflow_context *
+mp_fail_response_expect_subflow(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow, *ret = NULL;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->mp_fail_response_expect)) {
+ ret = subflow;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void mptcp_check_mp_fail_response(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+
+ bh_lock_sock(sk);
+ subflow = mp_fail_response_expect_subflow(msk);
+ if (subflow)
+ __set_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags);
+ bh_unlock_sock(sk);
+}
+
static void mptcp_timeout_timer(struct timer_list *t)
{
struct sock *sk = from_timer(sk, t, sk_timer);
+ mptcp_check_mp_fail_response(mptcp_sk(sk));
mptcp_schedule_work(sk);
sock_put(sk);
}
@@ -2491,8 +2521,27 @@ static void __mptcp_retrans(struct sock *sk)
reset_timer:
mptcp_check_and_set_pending(sk);
+ mptcp_data_lock(sk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
+ mptcp_data_unlock(sk);
+}
+
+static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *ssk;
+ bool slow;
+
+ subflow = mp_fail_response_expect_subflow(msk);
+ if (subflow) {
+ pr_debug("MP_FAIL doesn't respond, reset the subflow");
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
+ mptcp_subflow_reset(ssk);
+ unlock_sock_fast(ssk, slow);
+ }
}
static void mptcp_worker(struct work_struct *work)
@@ -2535,6 +2584,9 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
+ if (test_and_clear_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags))
+ mptcp_mp_fail_no_response(msk);
+
unlock:
release_sock(sk);
sock_put(sk);
@@ -2651,8 +2703,10 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
tcp_send_ack(ssk);
+ mptcp_data_lock(sk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
+ mptcp_data_unlock(sk);
}
break;
}
@@ -2753,8 +2807,10 @@ static void __mptcp_destroy_sock(struct sock *sk)
/* join list will be eventually flushed (with rst) at sock lock release time*/
list_splice_init(&msk->conn_list, &conn_list);
- sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ mptcp_data_lock(sk);
+ mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
+ mptcp_data_unlock(sk);
msk->pm.status = 0;
/* clears msk->subflow, allowing the following loop to close
@@ -2816,7 +2872,9 @@ cleanup:
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
+ mptcp_data_lock(sk);
sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
+ mptcp_data_unlock(sk);
}
release_sock(sk);
if (do_cancel_work)
@@ -2861,8 +2919,10 @@ static int mptcp_disconnect(struct sock *sk, int flags)
__mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);
}
- sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ mptcp_data_lock(sk);
+ mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
+ mptcp_data_unlock(sk);
if (mptcp_sk(sk)->token)
mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 61d600693ffd..3a8740fef918 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -116,6 +116,7 @@
#define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4
#define MPTCP_WORK_CLOSE_SUBFLOW 5
+#define MPTCP_FAIL_NO_RESPONSE 6
/* MPTCP socket release cb flags */
#define MPTCP_PUSH_PENDING 1
@@ -448,6 +449,7 @@ struct mptcp_subflow_context {
stale : 1, /* unable to snd/rcv data, do not use for xmit */
local_id_valid : 1; /* local_id is correctly initialized */
enum mptcp_data_avail data_avail;
+ bool mp_fail_response_expect;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 30ffb00661bb..75c824b67ca9 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -968,6 +968,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
bool csum_reqd = READ_ONCE(msk->csum_enabled);
+ struct sock *sk = (struct sock *)msk;
struct mptcp_ext *mpext;
struct sk_buff *skb;
u16 data_len;
@@ -1009,6 +1010,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
pr_debug("infinite mapping received");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
subflow->map_data_len = 0;
+ if (sk && inet_sk_state_load(sk) != TCP_CLOSE) {
+ mptcp_data_lock(sk);
+ if (inet_sk_state_load(sk) != TCP_CLOSE)
+ sk_stop_timer(sk, &sk->sk_timer);
+ mptcp_data_unlock(sk);
+ }
return MAPPING_INVALID;
}
@@ -1217,6 +1224,12 @@ fallback:
tcp_send_active_reset(ssk, GFP_ATOMIC);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
+ } else {
+ WRITE_ONCE(subflow->mp_fail_response_expect, true);
+ /* The data lock is acquired in __mptcp_move_skbs() */
+ sk_reset_timer((struct sock *)msk,
+ &((struct sock *)msk)->sk_timer,
+ jiffies + TCP_RTO_MAX);
}
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return true;
diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
index d36b7da5082a..38021a0dd527 100644
--- a/tools/testing/selftests/net/mptcp/config
+++ b/tools/testing/selftests/net/mptcp/config
@@ -12,6 +12,9 @@ CONFIG_NF_TABLES=m
CONFIG_NFT_COMPAT=m
CONFIG_NETFILTER_XTABLES=m
CONFIG_NETFILTER_XT_MATCH_BPF=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
CONFIG_NF_TABLES_INET=y
CONFIG_NFT_TPROXY=m
CONFIG_NFT_SOCKET=m
@@ -19,3 +22,8 @@ CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_NF_TARGET_REJECT=m
CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_SCH_INGRESS=m
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 9eb4d889a24a..e5c8fc2816fb 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -266,6 +266,58 @@ reset_with_allow_join_id0()
ip netns exec $ns2 sysctl -q net.mptcp.allow_join_initial_addr_port=$ns2_enable
}
+# Modify TCP payload without corrupting the TCP packet
+#
+# This rule inverts a 8-bit word at byte offset 148 for the 2nd TCP ACK packets
+# carrying enough data.
+# Once it is done, the TCP Checksum field is updated so the packet is still
+# considered as valid at the TCP level.
+# Because the MPTCP checksum, covering the TCP options and data, has not been
+# updated, the modification will be detected and an MP_FAIL will be emitted:
+# what we want to validate here without corrupting "random" MPTCP options.
+#
+# To avoid having tc producing this pr_info() message for each TCP ACK packets
+# not carrying enough data:
+#
+# tc action pedit offset 162 out of bounds
+#
+# Netfilter is used to mark packets with enough data.
+reset_with_fail()
+{
+ reset "${1}" || return 1
+
+ ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=1
+ ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=1
+
+ check_invert=1
+ validate_checksum=1
+ local i="$2"
+ local ip="${3:-4}"
+ local tables
+
+ tables="iptables"
+ if [ $ip -eq 6 ]; then
+ tables="ip6tables"
+ fi
+
+ ip netns exec $ns2 $tables \
+ -t mangle \
+ -A OUTPUT \
+ -o ns2eth$i \
+ -p tcp \
+ -m length --length 150:9999 \
+ -m statistic --mode nth --packet 1 --every 99999 \
+ -j MARK --set-mark 42 || exit 1
+
+ tc -n $ns2 qdisc add dev ns2eth$i clsact || exit 1
+ tc -n $ns2 filter add dev ns2eth$i egress \
+ protocol ip prio 1000 \
+ handle 42 fw \
+ action pedit munge offset 148 u8 invert \
+ pipe csum tcp \
+ index 100 || exit 1
+}
+
fail_test()
{
ret=1
@@ -961,6 +1013,7 @@ chk_csum_nr()
local csum_ns2=${2:-0}
local count
local dump_stats
+ local extra_msg=""
local allow_multi_errors_ns1=0
local allow_multi_errors_ns2=0
@@ -976,6 +1029,9 @@ chk_csum_nr()
printf "%-${nr_blank}s %s" " " "sum"
count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
[ -z "$count" ] && count=0
+ if [ "$count" != "$csum_ns1" ]; then
+ extra_msg="$extra_msg ns1=$count"
+ fi
if { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } ||
{ [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then
echo "[fail] got $count data checksum error[s] expected $csum_ns1"
@@ -987,28 +1043,58 @@ chk_csum_nr()
echo -n " - csum "
count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}')
[ -z "$count" ] && count=0
+ if [ "$count" != "$csum_ns2" ]; then
+ extra_msg="$extra_msg ns2=$count"
+ fi
if { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } ||
{ [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then
echo "[fail] got $count data checksum error[s] expected $csum_ns2"
fail_test
dump_stats=1
else
- echo "[ ok ]"
+ echo -n "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
+
+ echo "$extra_msg"
}
chk_fail_nr()
{
local fail_tx=$1
local fail_rx=$2
+ local ns_invert=${3:-""}
local count
local dump_stats
+ local ns_tx=$ns1
+ local ns_rx=$ns2
+ local extra_msg=""
+ local allow_tx_lost=0
+ local allow_rx_lost=0
+
+ if [[ $ns_invert = "invert" ]]; then
+ ns_tx=$ns2
+ ns_rx=$ns1
+ extra_msg=" invert"
+ fi
+
+ if [[ "${fail_tx}" = "-"* ]]; then
+ allow_tx_lost=1
+ fail_tx=${fail_tx:1}
+ fi
+ if [[ "${fail_rx}" = "-"* ]]; then
+ allow_rx_lost=1
+ fail_rx=${fail_rx:1}
+ fi
printf "%-${nr_blank}s %s" " " "ftx"
- count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}')
+ count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$fail_tx" ]; then
+ extra_msg="$extra_msg,tx=$count"
+ fi
+ if { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } ||
+ { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then
echo "[fail] got $count MP_FAIL[s] TX expected $fail_tx"
fail_test
dump_stats=1
@@ -1017,17 +1103,23 @@ chk_fail_nr()
fi
echo -n " - failrx"
- count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}')
+ count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}')
[ -z "$count" ] && count=0
if [ "$count" != "$fail_rx" ]; then
+ extra_msg="$extra_msg,rx=$count"
+ fi
+ if { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } ||
+ { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then
echo "[fail] got $count MP_FAIL[s] RX expected $fail_rx"
fail_test
dump_stats=1
else
- echo "[ ok ]"
+ echo -n "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
+
+ echo "$extra_msg"
}
chk_fclose_nr()
@@ -1199,7 +1291,7 @@ chk_join_nr()
echo "[ ok ]"
fi
[ "${dump_stats}" = 1 ] && dump_stats
- if [ $checksum -eq 1 ]; then
+ if [ $validate_checksum -eq 1 ]; then
chk_csum_nr $csum_ns1 $csum_ns2
chk_fail_nr $fail_nr $fail_nr
chk_rst_nr $rst_nr $rst_nr
@@ -2590,6 +2682,22 @@ fastclose_tests()
fi
}
+pedit_action_pkts()
+{
+ tc -n $ns2 -j -s action show action pedit index 100 | \
+ sed 's/.*"packets":\([0-9]\+\),.*/\1/'
+}
+
+fail_tests()
+{
+ # single subflow
+ if reset_with_fail "Infinite map" 1; then
+ run_tests $ns1 $ns2 10.0.1.1 128
+ chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)"
+ chk_fail_nr 1 -1 invert
+ fi
+}
+
implicit_tests()
{
# userspace pm type prevents add_addr
@@ -2658,6 +2766,7 @@ all_tests_sorted=(
d@deny_join_id0_tests
m@fullmesh_tests
z@fastclose_tests
+ F@fail_tests
I@implicit_tests
)