diff options
author | David S. Miller <davem@davemloft.net> | 2022-04-27 10:45:55 +0100 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2022-04-27 10:45:55 +0100 |
commit | 124de27101ffec33a1eb6ccf0eff89a1a9b999a8 (patch) | |
tree | f1266f4174403249ce05925e3404f45e0dea8b4d | |
parent | net: stmmac: dwmac-imx: comment spelling fix (diff) | |
parent | selftests: mptcp: print extra msg in chk_csum_nr (diff) | |
download | linux-dev-124de27101ffec33a1eb6ccf0eff89a1a9b999a8.tar.xz linux-dev-124de27101ffec33a1eb6ccf0eff89a1a9b999a8.zip |
Merge branch 'mptcp-MP_FAIL-timeout'
Mat Martineau says:
====================
mptcp: Timeout for MP_FAIL response
When one peer sends an infinite mapping to coordinate fallback from
MPTCP to regular TCP, the other peer is expected to send a packet with
the MPTCP MP_FAIL option to acknowledge the infinite mapping. Rather
than leave the connection in some half-fallback state, this series adds
a timeout after which the infinite mapping sender will reset the
connection.
Patch 1 adds a fallback self test.
Patches 2-5 make use of the MPTCP socket's retransmit timer to reset the
MPTCP connection if no MP_FAIL was received.
Patches 6 and 7 extends the self test to check MP_FAIL-related MIBs.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/mptcp/pm.c | 18 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 64 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 2 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 13 | ||||
-rw-r--r-- | tools/testing/selftests/net/mptcp/config | 8 | ||||
-rwxr-xr-x | tools/testing/selftests/net/mptcp/mptcp_join.sh | 119 |
6 files changed, 216 insertions, 8 deletions
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 5c36870d3420..14f448d82bb2 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -287,11 +287,27 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct sock *s = (struct sock *)msk; pr_debug("fail_seq=%llu", fail_seq); - if (!mptcp_has_another_subflow(sk) && READ_ONCE(msk->allow_infinite_fallback)) + if (mptcp_has_another_subflow(sk) || !READ_ONCE(msk->allow_infinite_fallback)) + return; + + if (!READ_ONCE(subflow->mp_fail_response_expect)) { + pr_debug("send MP_FAIL response and infinite map"); + + subflow->send_mp_fail = 1; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); subflow->send_infinite_map = 1; + } else if (s && inet_sk_state_load(s) != TCP_CLOSE) { + pr_debug("MP_FAIL response received"); + + mptcp_data_lock(s); + if (inet_sk_state_load(s) != TCP_CLOSE) + sk_stop_timer(s, &s->sk_timer); + mptcp_data_unlock(s); + } } /* path manager helpers */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4581c570ef68..a5d466e6b538 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1605,8 +1605,10 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) out: /* ensure the rtx timer is running */ + mptcp_data_lock(sk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); + mptcp_data_unlock(sk); if (copied) __mptcp_check_send_data_fin(sk); } @@ -2167,10 +2169,38 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } +static struct mptcp_subflow_context * +mp_fail_response_expect_subflow(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *ret = NULL; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->mp_fail_response_expect)) { + ret = subflow; + break; + } + } + + return ret; +} + +static void mptcp_check_mp_fail_response(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + bh_lock_sock(sk); + subflow = mp_fail_response_expect_subflow(msk); + if (subflow) + __set_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags); + bh_unlock_sock(sk); +} + static void mptcp_timeout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); + mptcp_check_mp_fail_response(mptcp_sk(sk)); mptcp_schedule_work(sk); sock_put(sk); } @@ -2491,8 +2521,27 @@ static void __mptcp_retrans(struct sock *sk) reset_timer: mptcp_check_and_set_pending(sk); + mptcp_data_lock(sk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); + mptcp_data_unlock(sk); +} + +static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *ssk; + bool slow; + + subflow = mp_fail_response_expect_subflow(msk); + if (subflow) { + pr_debug("MP_FAIL doesn't respond, reset the subflow"); + + ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); + mptcp_subflow_reset(ssk); + unlock_sock_fast(ssk, slow); + } } static void mptcp_worker(struct work_struct *work) @@ -2535,6 +2584,9 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); + if (test_and_clear_bit(MPTCP_FAIL_NO_RESPONSE, &msk->flags)) + mptcp_mp_fail_no_response(msk); + unlock: release_sock(sk); sock_put(sk); @@ -2651,8 +2703,10 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); tcp_send_ack(ssk); + mptcp_data_lock(sk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); + mptcp_data_unlock(sk); } break; } @@ -2753,8 +2807,10 @@ static void __mptcp_destroy_sock(struct sock *sk) /* join list will be eventually flushed (with rst) at sock lock release time*/ list_splice_init(&msk->conn_list, &conn_list); - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + mptcp_data_lock(sk); + mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); + mptcp_data_unlock(sk); msk->pm.status = 0; /* clears msk->subflow, allowing the following loop to close @@ -2816,7 +2872,9 @@ cleanup: __mptcp_destroy_sock(sk); do_cancel_work = true; } else { + mptcp_data_lock(sk); sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + mptcp_data_unlock(sk); } release_sock(sk); if (do_cancel_work) @@ -2861,8 +2919,10 @@ static int mptcp_disconnect(struct sock *sk, int flags) __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE); } - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + mptcp_data_lock(sk); + mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); + mptcp_data_unlock(sk); if (mptcp_sk(sk)->token) mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 61d600693ffd..3a8740fef918 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -116,6 +116,7 @@ #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 +#define MPTCP_FAIL_NO_RESPONSE 6 /* MPTCP socket release cb flags */ #define MPTCP_PUSH_PENDING 1 @@ -448,6 +449,7 @@ struct mptcp_subflow_context { stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1; /* local_id is correctly initialized */ enum mptcp_data_avail data_avail; + bool mp_fail_response_expect; u32 remote_nonce; u64 thmac; u32 local_nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 30ffb00661bb..75c824b67ca9 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -968,6 +968,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); + struct sock *sk = (struct sock *)msk; struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -1009,6 +1010,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk, pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; + if (sk && inet_sk_state_load(sk) != TCP_CLOSE) { + mptcp_data_lock(sk); + if (inet_sk_state_load(sk) != TCP_CLOSE) + sk_stop_timer(sk, &sk->sk_timer); + mptcp_data_unlock(sk); + } return MAPPING_INVALID; } @@ -1217,6 +1224,12 @@ fallback: tcp_send_active_reset(ssk, GFP_ATOMIC); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); + } else { + WRITE_ONCE(subflow->mp_fail_response_expect, true); + /* The data lock is acquired in __mptcp_move_skbs() */ + sk_reset_timer((struct sock *)msk, + &((struct sock *)msk)->sk_timer, + jiffies + TCP_RTO_MAX); } WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return true; diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config index d36b7da5082a..38021a0dd527 100644 --- a/tools/testing/selftests/net/mptcp/config +++ b/tools/testing/selftests/net/mptcp/config @@ -12,6 +12,9 @@ CONFIG_NF_TABLES=m CONFIG_NFT_COMPAT=m CONFIG_NETFILTER_XTABLES=m CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_TARGET_MARK=m CONFIG_NF_TABLES_INET=y CONFIG_NFT_TPROXY=m CONFIG_NFT_SOCKET=m @@ -19,3 +22,8 @@ CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_NF_TARGET_REJECT=m CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_SCH_INGRESS=m diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 9eb4d889a24a..e5c8fc2816fb 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -266,6 +266,58 @@ reset_with_allow_join_id0() ip netns exec $ns2 sysctl -q net.mptcp.allow_join_initial_addr_port=$ns2_enable } +# Modify TCP payload without corrupting the TCP packet +# +# This rule inverts a 8-bit word at byte offset 148 for the 2nd TCP ACK packets +# carrying enough data. +# Once it is done, the TCP Checksum field is updated so the packet is still +# considered as valid at the TCP level. +# Because the MPTCP checksum, covering the TCP options and data, has not been +# updated, the modification will be detected and an MP_FAIL will be emitted: +# what we want to validate here without corrupting "random" MPTCP options. +# +# To avoid having tc producing this pr_info() message for each TCP ACK packets +# not carrying enough data: +# +# tc action pedit offset 162 out of bounds +# +# Netfilter is used to mark packets with enough data. +reset_with_fail() +{ + reset "${1}" || return 1 + + ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=1 + ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=1 + + check_invert=1 + validate_checksum=1 + local i="$2" + local ip="${3:-4}" + local tables + + tables="iptables" + if [ $ip -eq 6 ]; then + tables="ip6tables" + fi + + ip netns exec $ns2 $tables \ + -t mangle \ + -A OUTPUT \ + -o ns2eth$i \ + -p tcp \ + -m length --length 150:9999 \ + -m statistic --mode nth --packet 1 --every 99999 \ + -j MARK --set-mark 42 || exit 1 + + tc -n $ns2 qdisc add dev ns2eth$i clsact || exit 1 + tc -n $ns2 filter add dev ns2eth$i egress \ + protocol ip prio 1000 \ + handle 42 fw \ + action pedit munge offset 148 u8 invert \ + pipe csum tcp \ + index 100 || exit 1 +} + fail_test() { ret=1 @@ -961,6 +1013,7 @@ chk_csum_nr() local csum_ns2=${2:-0} local count local dump_stats + local extra_msg="" local allow_multi_errors_ns1=0 local allow_multi_errors_ns2=0 @@ -976,6 +1029,9 @@ chk_csum_nr() printf "%-${nr_blank}s %s" " " "sum" count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}') [ -z "$count" ] && count=0 + if [ "$count" != "$csum_ns1" ]; then + extra_msg="$extra_msg ns1=$count" + fi if { [ "$count" != $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 0 ]; } || { [ "$count" -lt $csum_ns1 ] && [ $allow_multi_errors_ns1 -eq 1 ]; }; then echo "[fail] got $count data checksum error[s] expected $csum_ns1" @@ -987,28 +1043,58 @@ chk_csum_nr() echo -n " - csum " count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtDataCsumErr | awk '{print $2}') [ -z "$count" ] && count=0 + if [ "$count" != "$csum_ns2" ]; then + extra_msg="$extra_msg ns2=$count" + fi if { [ "$count" != $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 0 ]; } || { [ "$count" -lt $csum_ns2 ] && [ $allow_multi_errors_ns2 -eq 1 ]; }; then echo "[fail] got $count data checksum error[s] expected $csum_ns2" fail_test dump_stats=1 else - echo "[ ok ]" + echo -n "[ ok ]" fi [ "${dump_stats}" = 1 ] && dump_stats + + echo "$extra_msg" } chk_fail_nr() { local fail_tx=$1 local fail_rx=$2 + local ns_invert=${3:-""} local count local dump_stats + local ns_tx=$ns1 + local ns_rx=$ns2 + local extra_msg="" + local allow_tx_lost=0 + local allow_rx_lost=0 + + if [[ $ns_invert = "invert" ]]; then + ns_tx=$ns2 + ns_rx=$ns1 + extra_msg=" invert" + fi + + if [[ "${fail_tx}" = "-"* ]]; then + allow_tx_lost=1 + fail_tx=${fail_tx:1} + fi + if [[ "${fail_rx}" = "-"* ]]; then + allow_rx_lost=1 + fail_rx=${fail_rx:1} + fi printf "%-${nr_blank}s %s" " " "ftx" - count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}') + count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPFailTx | awk '{print $2}') [ -z "$count" ] && count=0 if [ "$count" != "$fail_tx" ]; then + extra_msg="$extra_msg,tx=$count" + fi + if { [ "$count" != "$fail_tx" ] && [ $allow_tx_lost -eq 0 ]; } || + { [ "$count" -gt "$fail_tx" ] && [ $allow_tx_lost -eq 1 ]; }; then echo "[fail] got $count MP_FAIL[s] TX expected $fail_tx" fail_test dump_stats=1 @@ -1017,17 +1103,23 @@ chk_fail_nr() fi echo -n " - failrx" - count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}') + count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPFailRx | awk '{print $2}') [ -z "$count" ] && count=0 if [ "$count" != "$fail_rx" ]; then + extra_msg="$extra_msg,rx=$count" + fi + if { [ "$count" != "$fail_rx" ] && [ $allow_rx_lost -eq 0 ]; } || + { [ "$count" -gt "$fail_rx" ] && [ $allow_rx_lost -eq 1 ]; }; then echo "[fail] got $count MP_FAIL[s] RX expected $fail_rx" fail_test dump_stats=1 else - echo "[ ok ]" + echo -n "[ ok ]" fi [ "${dump_stats}" = 1 ] && dump_stats + + echo "$extra_msg" } chk_fclose_nr() @@ -1199,7 +1291,7 @@ chk_join_nr() echo "[ ok ]" fi [ "${dump_stats}" = 1 ] && dump_stats - if [ $checksum -eq 1 ]; then + if [ $validate_checksum -eq 1 ]; then chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr chk_rst_nr $rst_nr $rst_nr @@ -2590,6 +2682,22 @@ fastclose_tests() fi } +pedit_action_pkts() +{ + tc -n $ns2 -j -s action show action pedit index 100 | \ + sed 's/.*"packets":\([0-9]\+\),.*/\1/' +} + +fail_tests() +{ + # single subflow + if reset_with_fail "Infinite map" 1; then + run_tests $ns1 $ns2 10.0.1.1 128 + chk_join_nr 0 0 0 +1 +0 1 0 1 "$(pedit_action_pkts)" + chk_fail_nr 1 -1 invert + fi +} + implicit_tests() { # userspace pm type prevents add_addr @@ -2658,6 +2766,7 @@ all_tests_sorted=( d@deny_join_id0_tests m@fullmesh_tests z@fastclose_tests + F@fail_tests I@implicit_tests ) |