diff options
Diffstat (limited to '')
-rw-r--r-- | net/mptcp/Makefile | 4 | ||||
-rw-r--r-- | net/mptcp/bpf.c | 21 | ||||
-rw-r--r-- | net/mptcp/ctrl.c | 21 | ||||
-rw-r--r-- | net/mptcp/mib.c | 11 | ||||
-rw-r--r-- | net/mptcp/mib.h | 13 | ||||
-rw-r--r-- | net/mptcp/mptcp_diag.c | 108 | ||||
-rw-r--r-- | net/mptcp/options.c | 298 | ||||
-rw-r--r-- | net/mptcp/pm.c | 152 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 816 | ||||
-rw-r--r-- | net/mptcp/pm_userspace.c | 454 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 1045 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 271 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 305 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 363 | ||||
-rw-r--r-- | net/mptcp/token.c | 1 |
15 files changed, 2913 insertions, 970 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index e54daceac58b..6e7df47c9584 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o + mib.o pm_netlink.o sockopt.o pm_userspace.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o @@ -10,3 +10,5 @@ obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o + +obj-$(CONFIG_BPF_SYSCALL) += bpf.o diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c new file mode 100644 index 000000000000..5a0a84ad94af --- /dev/null +++ b/net/mptcp/bpf.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2020, Tessares SA. + * Copyright (c) 2022, SUSE. + * + * Author: Nicolas Rybowski <nicolas.rybowski@tessares.net> + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/bpf.h> +#include "protocol.h" + +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) +{ + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + return mptcp_sk(mptcp_subflow_ctx(sk)->conn); + + return NULL; +} diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 8b235468c88f..ae20b7d92e28 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -16,6 +16,11 @@ #define MPTCP_SYSCTL_PATH "net/mptcp" static int mptcp_pernet_id; + +#ifdef CONFIG_SYSCTL +static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX; +#endif + struct mptcp_pernet { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; @@ -26,6 +31,7 @@ struct mptcp_pernet { u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; + u8 pm_type; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -58,6 +64,11 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net) return mptcp_get_pernet(net)->stale_loss_cnt; } +int mptcp_get_pm_type(const struct net *net) +{ + return mptcp_get_pernet(net)->pm_type; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; @@ -65,6 +76,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->checksum_enabled = 0; pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; + pernet->pm_type = MPTCP_PM_TYPE_KERNEL; } #ifdef CONFIG_SYSCTL @@ -108,6 +120,14 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_douintvec_minmax, }, + { + .procname = "pm_type", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &mptcp_pm_type_max + }, {} }; @@ -128,6 +148,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[2].data = &pernet->checksum_enabled; table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; + table[5].data = &pernet->pm_type; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 3240b72271a7..0dac2863c6e1 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -24,6 +24,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), @@ -35,20 +36,30 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), + SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), + SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), + SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX), + SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), + SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), + SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), + SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), + SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), + SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), + SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index ecd3d8b117e0..2be3596374f4 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -17,6 +17,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ @@ -28,20 +29,32 @@ enum linux_mptcp_mib_field { MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */ MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */ + MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */ MPTCP_MIB_RMADDR, /* Received RM_ADDR */ + MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */ MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */ MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */ + MPTCP_MIB_MPFASTCLOSETX, /* Transmit a MP_FASTCLOSE */ + MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ + MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ + MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ + MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to + * conflict with another subflow while updating msk rcv wnd + */ + MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f44125dd6697..8df1bdb647e2 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -66,20 +66,106 @@ out_nosk: return err; } +struct mptcp_diag_ctx { + long s_slot; + long s_num; + unsigned int l_slot; + unsigned int l_num; +}; + +static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + bool net_admin) +{ + struct inet_diag_dump_data *cb_data = cb->data; + struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; + struct nlattr *bc = cb_data->inet_diag_nla_bc; + struct net *net = sock_net(skb->sk); + struct inet_hashinfo *hinfo; + int i; + + hinfo = net->ipv4.tcp_death_row.hashinfo; + + for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + struct sock *sk; + int num = 0; + + ilb = &hinfo->lhash2[i]; + + rcu_read_lock(); + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + const struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + struct inet_sock *inet = inet_sk(sk); + int ret; + + if (num < diag_ctx->l_num) + goto next_listen; + + if (!ctx || strcmp(inet_csk(sk)->icsk_ulp_ops->name, "mptcp")) + goto next_listen; + + sk = ctx->conn; + if (!sk || !net_eq(sock_net(sk), net)) + goto next_listen; + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_listen; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_listen; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + + sock_put(sk); + + if (ret < 0) { + spin_unlock(&ilb->lock); + rcu_read_unlock(); + diag_ctx->l_slot = i; + diag_ctx->l_num = num; + return; + } + diag_ctx->l_num = num + 1; + num = 0; +next_listen: + ++num; + } + spin_unlock(&ilb->lock); + rcu_read_unlock(); + + cond_resched(); + diag_ctx->l_num = 0; + } + + diag_ctx->l_num = 0; + diag_ctx->l_slot = i; +} + static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; struct mptcp_sock *msk; struct nlattr *bc; + BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx)); + cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; - while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != - NULL) { + while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot, + &diag_ctx->s_num)) != NULL) { struct inet_sock *inet = (struct inet_sock *)msk; struct sock *sk = (struct sock *)msk; int ret = 0; @@ -101,11 +187,14 @@ next: sock_put(sk); if (ret < 0) { /* will retry on the same position */ - cb->args[1]--; + diag_ctx->s_num--; break; } cond_resched(); } + + if ((r->idiag_states & TCPF_LISTEN) && r->id.idiag_dport == 0) + mptcp_diag_dump_listeners(skb, cb, r, net_admin); } static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -116,6 +205,19 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); + + if (inet_sk_state_load(sk) == TCP_LISTEN) { + struct sock *lsk = READ_ONCE(msk->first); + + if (lsk) { + /* override with settings from tcp listener, + * so Send-Q will show accept queue. + */ + r->idiag_rqueue = READ_ONCE(lsk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(lsk->sk_max_ack_backlog); + } + } + if (!info) return; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index fe98e4f475ba..30d289044e71 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -107,7 +107,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { - mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } @@ -221,7 +221,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; - mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } @@ -323,6 +323,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; + pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key); break; case MPTCPOPT_RST: @@ -336,6 +337,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; + pr_debug("MP_RST: transient=%u reason=%u", + mp_opt->reset_transient, mp_opt->reset_reason); break; case MPTCPOPT_MP_FAIL: @@ -353,8 +356,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, } } -void mptcp_get_options(const struct sock *sk, - const struct sk_buff *skb, +void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); @@ -651,7 +653,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * bool drop_other_suboptions = false; unsigned int opt_size = *size; bool echo; - bool port; int len; /* add addr will strip the existing options, be sure to avoid breaking @@ -660,12 +661,12 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, - &echo, &port, &drop_other_suboptions)) + &echo, &drop_other_suboptions)) return false; if (drop_other_suboptions) remaining += opt_size; - len = mptcp_add_addr_len(opts->addr.family, echo, port); + len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; @@ -764,10 +765,34 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } +static bool mptcp_established_options_fastclose(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + if (likely(!subflow->send_fastclose)) + return false; + + if (remaining < TCPOLEN_MPTCP_FASTCLOSE) + return false; + + *size = TCPOLEN_MPTCP_FASTCLOSE; + opts->suboptions |= OPTION_MPTCP_FASTCLOSE; + opts->rcvr_key = msk->remote_key; + + pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); + return true; +} + static bool mptcp_established_options_mp_fail(struct sock *sk, unsigned int *size, unsigned int remaining, @@ -786,6 +811,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk, opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } @@ -802,14 +828,16 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; - if (unlikely(__mptcp_check_fallback(msk))) + if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { - if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { + if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || + mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } + /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; @@ -821,10 +849,13 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) { + unsigned int mp_fail_size; + ret = true; - if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { - *size += opt_size; - remaining -= opt_size; + if (mptcp_established_options_mp_fail(sk, &mp_fail_size, + remaining - opt_size, opts)) { + *size += opt_size + mp_fail_size; + remaining -= opt_size - mp_fail_size; return true; } } @@ -901,7 +932,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && - READ_ONCE(msk->pm.server_side)) + !subflow->request_join) tcp_send_ack(ssk); goto fully_established; } @@ -936,7 +967,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto reset; subflow->mp_capable = 0; pr_fallback(msk); - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); return false; } @@ -1057,8 +1088,7 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", - msk, (unsigned long long)hmac, - (unsigned long long)mp_opt->ahmac); + msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } @@ -1085,7 +1115,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return true; } - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. @@ -1098,12 +1128,13 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) msk->local_key == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { - mptcp_pm_add_addr_received(msk, &mp_opt.addr); + mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); @@ -1132,6 +1163,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) @@ -1193,23 +1225,65 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return true; } -static void mptcp_set_rwin(const struct tcp_sock *tp) +static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; - const struct mptcp_subflow_context *subflow; + struct mptcp_subflow_context *subflow; + u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; - u64 ack_seq; + u32 new_win; + u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); - ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + ack_seq = READ_ONCE(msk->ack_seq); + rcv_wnd_new = ack_seq + tp->rcv_wnd; - if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); + if (after64(rcv_wnd_new, rcv_wnd_old)) { + u64 rcv_wnd; + + for (;;) { + rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); + + if (rcv_wnd == rcv_wnd_old) + break; + if (before64(rcv_wnd_new, rcv_wnd)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); + goto raise_win; + } + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); + rcv_wnd_old = rcv_wnd; + } + return; + } + + if (rcv_wnd_new != rcv_wnd_old) { +raise_win: + win = rcv_wnd_old - ack_seq; + tp->rcv_wnd = min_t(u64, win, U32_MAX); + new_win = tp->rcv_wnd; + + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (unlikely(th->syn)) + new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; + if (!tp->rx_opt.rcv_wscale && + READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + /* RFC1323 scaling applied */ + new_win >>= tp->rx_opt.rcv_wscale; + th->window = htons(new_win); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); + } } -static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum16 sum) +__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; @@ -1224,44 +1298,53 @@ static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum1 header.data_len = htons(data_len); header.csum = 0; - csum = csum_partial(&header, sizeof(header), ~csum_unfold(sum)); - return (__force u16)csum_fold(csum); + csum = csum_partial(&header, sizeof(header), sum); + return csum_fold(csum); } -static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, - mpext->csum); + ~csum_unfold(mpext->csum)); } -void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, - struct mptcp_out_options *opts) +static void put_len_csum(u16 len, __sum16 csum, void *data) { - if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { - const struct sock *ssk = (const struct sock *)tp; - struct mptcp_subflow_context *subflow; + __sum16 *sumptr = data + 2; + __be16 *ptr = data; - subflow = mptcp_subflow_ctx(ssk); - subflow->send_mp_fail = 0; + put_unaligned_be16(len, ptr); - *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, - TCPOLEN_MPTCP_FAIL, - 0, 0); - put_unaligned_be64(opts->fail_seq, ptr); - ptr += 2; - } - - /* RST is mutually exclusive with everything else */ - if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { - *ptr++ = mptcp_option(MPTCPOPT_RST, - TCPOLEN_MPTCP_RST, - opts->reset_transient, - opts->reset_reason); - return; - } + put_unaligned(csum, sumptr); +} - /* DSS, MPC, MPJ and ADD_ADDR are mutually exclusive, see - * mptcp_established_options*() +void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, + struct mptcp_out_options *opts) +{ + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + /* Which options can be used together? + * + * X: mutually exclusive + * O: often used together + * C: can be used together in some cases + * P: could be used together but we prefer not to (optimisations) + * + * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC | + * ------|------|------|------|------|------|------|------|------| + * MPC |------|------|------|------|------|------|------|------| + * MPJ | X |------|------|------|------|------|------|------| + * DSS | X | X |------|------|------|------|------|------| + * ADD | X | X | P |------|------|------|------|------| + * RM | C | C | C | P |------|------|------|------| + * PRIO | X | C | C | C | C |------|------|------| + * FAIL | X | X | C | X | X | X |------|------| + * FC | X | X | X | X | X | X | X |------| + * RST | X | X | X | X | X | X | O | O | + * ------|------|------|------|------|------|------|------|------| + * + * The same applies in mptcp_established_options() function. */ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { struct mptcp_ext *mpext = &opts->ext_copy; @@ -1310,13 +1393,22 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { - put_unaligned_be32(mpext->data_len << 16 | - mptcp_make_csum(mpext), ptr); + /* data_len == 0 is reserved for the infinite mapping, + * the checksum will also be set to 0. + */ + put_len_csum(mpext->data_len, + (mpext->data_len ? mptcp_make_csum(mpext) : 0), + ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } + ptr += 1; } + + /* We might need to add MP_FAIL options in rare cases */ + if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) + goto mp_fail; } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; @@ -1357,11 +1449,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, goto mp_capable_done; if (opts->csum_reqd) { - put_unaligned_be32(opts->data_len << 16 | - __mptcp_make_csum(opts->data_seq, - opts->subflow_seq, - opts->data_len, - opts->csum), ptr); + put_len_csum(opts->data_len, + __mptcp_make_csum(opts->data_seq, + opts->subflow_seq, + opts->data_len, + ~csum_unfold(opts->csum)), + ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); @@ -1370,27 +1463,29 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, /* MPC is additionally mutually exclusive with MP_PRIO */ goto mp_capable_done; - } else if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_SYN, - opts->backup, opts->join_id); - put_unaligned_be32(opts->token, ptr); - ptr += 1; - put_unaligned_be32(opts->nonce, ptr); - ptr += 1; - } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_SYNACK, - opts->backup, opts->join_id); - put_unaligned_be64(opts->thmac, ptr); - ptr += 2; - put_unaligned_be32(opts->nonce, ptr); - ptr += 1; - } else if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_ACK, 0, 0); - memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); - ptr += 5; + } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) { + if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYN, + opts->backup, opts->join_id); + put_unaligned_be32(opts->token, ptr); + ptr += 1; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYNACK, + opts->backup, opts->join_id); + put_unaligned_be64(opts->thmac, ptr); + ptr += 2; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } else { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_ACK, 0, 0); + memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); + ptr += 5; + } } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; @@ -1447,18 +1542,51 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, ptr += 1; } } + } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) { + /* FASTCLOSE is mutually exclusive with others except RST */ + *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE, + TCPOLEN_MPTCP_FASTCLOSE, + 0, 0); + put_unaligned_be64(opts->rcvr_key, ptr); + ptr += 2; + + if (OPTION_MPTCP_RST & opts->suboptions) + goto mp_rst; + return; + } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { +mp_fail: + /* MP_FAIL is mutually exclusive with others except RST */ + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_fail = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, + TCPOLEN_MPTCP_FAIL, + 0, 0); + put_unaligned_be64(opts->fail_seq, ptr); + ptr += 2; + + if (OPTION_MPTCP_RST & opts->suboptions) + goto mp_rst; + return; + } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { +mp_rst: + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { - const struct sock *ssk = (const struct sock *)tp; - struct mptcp_subflow_context *subflow; - subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_prio = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); + + MPTCP_INC_STATS(sock_net((const struct sock *)tp), + MPTCP_MIB_MPPRIOTX); } mp_capable_done: @@ -1483,7 +1611,7 @@ mp_capable_done: } if (tp) - mptcp_set_rwin(tp); + mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 6ab386ff3294..45e2a48397b9 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -87,6 +87,9 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) unsigned int subflows_max; int ret = 0; + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_active(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, @@ -172,35 +175,65 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk) spin_unlock_bh(&pm->lock); } -void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) +void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, + const struct mptcp_subflow_context *subflow) { - pr_debug("msk=%p", msk); + struct mptcp_pm_data *pm = &msk->pm; + bool update_subflows; + + update_subflows = (subflow->request_join || subflow->mp_join) && + mptcp_pm_is_kernel(msk); + if (!READ_ONCE(pm->work_pending) && !update_subflows) + return; + + spin_lock_bh(&pm->lock); + if (update_subflows) + __mptcp_pm_close_subflow(msk); + + /* Even if this subflow is not really established, tell the PM to try + * to pick the next ones, if possible. + */ + if (mptcp_pm_nl_check_work_pending(msk)) + mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); + + spin_unlock_bh(&pm->lock); } -void mptcp_pm_add_addr_received(struct mptcp_sock *msk, +void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, READ_ONCE(pm->accept_addr)); - mptcp_event_addr_announced(msk, addr); + mptcp_event_addr_announced(ssk, addr); spin_lock_bh(&pm->lock); - if (!READ_ONCE(pm->accept_addr)) { + if (mptcp_pm_is_userspace(msk)) { + if (mptcp_userspace_pm_active(msk)) { + mptcp_pm_announce_addr(msk, addr, true); + mptcp_pm_add_addr_send_ack(msk); + } else { + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); + } + } else if (!READ_ONCE(pm->accept_addr)) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } else { + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) + const struct mptcp_addr_info *addr) { struct mptcp_pm_data *pm = &msk->pm; @@ -234,36 +267,67 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); - mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); - pm->rm_list_rx = *rm_list; + if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) + pm->rm_list_rx = *rm_list; + else + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } -void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup) +void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + struct mptcp_sock *msk; pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); - subflow->backup = bkup; + msk = mptcp_sk(sk); + if (subflow->backup != bkup) { + subflow->backup = bkup; + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + msk->last_snd = NULL; + else + __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags); + mptcp_data_unlock(sk); + } - mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC); + mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + pr_debug("fail_seq=%llu", fail_seq); + + if (!READ_ONCE(msk->allow_infinite_fallback)) + return; + + if (!subflow->fail_tout) { + pr_debug("send MP_FAIL response and infinite map"); + + subflow->send_mp_fail = 1; + subflow->send_infinite_map = 1; + tcp_send_ack(sk); + } else { + pr_debug("MP_FAIL response received"); + WRITE_ONCE(subflow->fail_tout, 0); + } } /* path manager helpers */ -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, - bool *port, bool *drop_other_suboptions) + bool *drop_other_suboptions) { int ret = false; u8 add_addr; u8 family; + bool port; spin_lock_bh(&msk->pm.lock); @@ -281,10 +345,10 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, } *echo = mptcp_pm_should_add_signal_echo(msk); - *port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); + port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); family = *echo ? msk->pm.remote.family : msk->pm.local.family; - if (remaining < mptcp_add_addr_len(family, *echo, *port)) + if (remaining < mptcp_add_addr_len(family, *echo, port)) goto out_unlock; if (*echo) { @@ -356,25 +420,51 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) } } -void mptcp_pm_data_init(struct mptcp_sock *msk) +void mptcp_pm_data_reset(struct mptcp_sock *msk) { - msk->pm.add_addr_signaled = 0; - msk->pm.add_addr_accepted = 0; - msk->pm.local_addr_used = 0; - msk->pm.subflows = 0; - msk->pm.rm_list_tx.nr = 0; - msk->pm.rm_list_rx.nr = 0; - WRITE_ONCE(msk->pm.work_pending, false); - WRITE_ONCE(msk->pm.addr_signal, 0); - WRITE_ONCE(msk->pm.accept_addr, false); - WRITE_ONCE(msk->pm.accept_subflow, false); - WRITE_ONCE(msk->pm.remote_deny_join_id0, false); - msk->pm.status = 0; + u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); + struct mptcp_pm_data *pm = &msk->pm; + pm->add_addr_signaled = 0; + pm->add_addr_accepted = 0; + pm->local_addr_used = 0; + pm->subflows = 0; + pm->rm_list_tx.nr = 0; + pm->rm_list_rx.nr = 0; + WRITE_ONCE(pm->pm_type, pm_type); + + if (pm_type == MPTCP_PM_TYPE_KERNEL) { + bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + + /* pm->work_pending must be only be set to 'true' when + * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL + */ + WRITE_ONCE(pm->work_pending, + (!!mptcp_pm_get_local_addr_max(msk) && + subflows_allowed) || + !!mptcp_pm_get_add_addr_signal_max(msk)); + WRITE_ONCE(pm->accept_addr, + !!mptcp_pm_get_add_addr_accept_max(msk) && + subflows_allowed); + WRITE_ONCE(pm->accept_subflow, subflows_allowed); + } else { + WRITE_ONCE(pm->work_pending, 0); + WRITE_ONCE(pm->accept_addr, 0); + WRITE_ONCE(pm->accept_subflow, 0); + } + + WRITE_ONCE(pm->addr_signal, 0); + WRITE_ONCE(pm->remote_deny_join_id0, false); + pm->status = 0; + bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); +} + +void mptcp_pm_data_init(struct mptcp_sock *msk) +{ spin_lock_init(&msk->pm.lock); INIT_LIST_HEAD(&msk->pm.anno_list); - - mptcp_pm_nl_data_init(msk); + INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list); + mptcp_pm_data_reset(msk); } void __init mptcp_pm_init(void) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7b96be1e9f14..9813ed0fde9b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -22,14 +22,6 @@ static struct genl_family mptcp_genl_family; static int pm_nl_pernet_id; -struct mptcp_pm_addr_entry { - struct list_head list; - struct mptcp_addr_info addr; - u8 flags; - int ifindex; - struct socket *lsk; -}; - struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; @@ -38,9 +30,6 @@ struct mptcp_pm_add_entry { u8 retrans_times; }; -#define MAX_ADDR_ID 255 -#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG) - struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; @@ -52,14 +41,25 @@ struct pm_nl_pernet { unsigned int local_addr_max; unsigned int subflows_max; unsigned int next_id; - unsigned long id_bitmap[BITMAP_SZ]; + DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 -static bool addresses_equal(const struct mptcp_addr_info *a, - struct mptcp_addr_info *b, bool use_port) +static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) +{ + return net_generic(net, pm_nl_pernet_id); +} + +static struct pm_nl_pernet * +pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) +{ + return pm_nl_get_pernet(sock_net((struct sock *)msk)); +} + +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; @@ -86,16 +86,6 @@ static bool addresses_equal(const struct mptcp_addr_info *a, return a->port == b->port; } -static bool address_zero(const struct mptcp_addr_info *addr) -{ - struct mptcp_addr_info zero; - - memset(&zero, 0, sizeof(zero)); - zero.family = addr->family; - - return addresses_equal(addr, &zero, true); -} - static void local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { @@ -123,7 +113,7 @@ static void remote_address(const struct sock_common *skc, } static bool lookup_subflow_by_saddr(const struct list_head *list, - struct mptcp_addr_info *saddr) + const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; @@ -133,7 +123,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); local_address(skc, &cur); - if (addresses_equal(&cur, saddr, saddr->port)) + if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } @@ -141,7 +131,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, } static bool lookup_subflow_by_daddr(const struct list_head *list, - struct mptcp_addr_info *daddr) + const struct mptcp_addr_info *daddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; @@ -151,7 +141,7 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); remote_address(skc, &cur); - if (addresses_equal(&cur, daddr, daddr->port)) + if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } @@ -160,19 +150,21 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, static struct mptcp_pm_addr_entry * select_local_address(const struct pm_nl_pernet *pernet, - struct mptcp_sock *msk) + const struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; struct mptcp_pm_addr_entry *entry, *ret = NULL; - struct sock *sk = (struct sock *)msk; msk_owned_by_me(msk); rcu_read_lock(); - __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + if (entry->addr.family != sk->sk_family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if ((entry->addr.family == AF_INET && @@ -183,23 +175,17 @@ select_local_address(const struct pm_nl_pernet *pernet, continue; } - /* avoid any address already in use by subflows and - * pending join - */ - if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { - ret = entry; - break; - } + ret = entry; + break; } rcu_read_unlock(); return ret; } static struct mptcp_pm_addr_entry * -select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) +select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *ret = NULL; - int i = 0; rcu_read_lock(); /* do not keep any additional per socket state, just signal @@ -208,71 +194,74 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) * can lead to additional addresses not being announced. */ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; - if (i++ == pos) { - ret = entry; - break; - } + + ret = entry; + break; } rcu_read_unlock(); return ret; } -unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->add_addr_signal_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->add_addr_accept_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); -unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->subflows_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); -unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk) +unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->local_addr_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); -static void check_work_pending(struct mptcp_sock *msk) +bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { - if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) && - (msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) || - msk->pm.subflows == mptcp_pm_get_subflows_max(msk))) + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); + return false; + } + return true; } struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) +mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; lockdep_assert_held(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (addresses_equal(&entry->addr, addr, true)) + if (mptcp_addresses_equal(&entry->addr, addr, true)) return entry; } @@ -289,7 +278,7 @@ bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (addresses_equal(&entry->addr, &saddr, true)) { + if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { ret = true; goto out; } @@ -346,7 +335,7 @@ out: struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, bool check_id) + const struct mptcp_addr_info *addr, bool check_id) { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; @@ -363,8 +352,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, return entry; } -static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *entry) +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; @@ -372,8 +361,16 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); - if (mptcp_lookup_anno_list_by_saddr(msk, &entry->addr)) - return false; + add_entry = mptcp_lookup_anno_list_by_saddr(msk, &entry->addr); + + if (add_entry) { + if (mptcp_pm_is_kernel(msk)) + return false; + + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + return true; + } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); if (!add_entry) @@ -410,13 +407,13 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) } } -static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr, - struct mptcp_addr_info *addr) +static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr, + const struct mptcp_addr_info *addr) { int i; for (i = 0; i < nr; i++) { - if (addresses_equal(&addrs[i], addr, addr->port)) + if (addrs[i].id == addr->id) return true; } @@ -429,6 +426,7 @@ static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, struct mptcp_addr_info *addrs) { + bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; struct mptcp_addr_info remote = { 0 }; @@ -436,22 +434,29 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm int i = 0; subflows_max = mptcp_pm_get_subflows_max(msk); + remote_address((struct sock_common *)sk, &remote); /* Non-fullmesh endpoint, fill in the single entry * corresponding to the primary MPC subflow remote address */ if (!fullmesh) { - remote_address((struct sock_common *)sk, &remote); + if (deny_id0) + return 0; + msk->pm.subflows++; addrs[i++] = remote; } else { mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); - remote_address((struct sock_common *)ssk, &remote); - if (!lookup_address_in_vec(addrs, i, &remote) && + remote_address((struct sock_common *)ssk, &addrs[i]); + addrs[i].id = subflow->remote_id; + if (deny_id0 && !addrs[i].id) + continue; + + if (!lookup_address_in_vec(addrs, i, &addrs[i]) && msk->pm.subflows < subflows_max) { msk->pm.subflows++; - addrs[i++] = remote; + i++; } } } @@ -459,6 +464,64 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm return i; } +static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + pr_debug("send ack for %s", + prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); + + slow = lock_sock_fast(ssk); + if (prio) { + if (subflow->backup != backup) + msk->last_snd = NULL; + + subflow->send_mp_prio = 1; + subflow->backup = backup; + subflow->request_bkup = backup; + } + + __mptcp_subflow_send_ack(ssk); + unlock_sock_fast(ssk, slow); +} + +static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + spin_unlock_bh(&msk->pm.lock); + __mptcp_pm_send_ack(msk, subflow, prio, backup); + spin_lock_bh(&msk->pm.lock); +} + +static struct mptcp_pm_addr_entry * +__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, + bool lookup_by_id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if ((!lookup_by_id && + mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) || + (lookup_by_id && entry->addr.id == info->id)) + return entry; + } + return NULL; +} + static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; @@ -468,12 +531,35 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct pm_nl_pernet *pernet; unsigned int subflows_max; - pernet = net_generic(sock_net(sk), pm_nl_pernet_id); + pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); + /* do lazy endpoint usage accounting for the MPC subflows */ + if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + bool backup = false; + + local_address((struct sock_common *)msk->first, &mpc_addr); + rcu_read_lock(); + entry = __lookup_addr(pernet, &mpc_addr, false); + if (entry) { + __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); + msk->mpc_endpoint_id = entry->addr.id; + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + } + rcu_read_unlock(); + + if (backup) + mptcp_pm_send_ack(msk, subflow, true, backup); + + msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); + } + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, @@ -481,47 +567,51 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { - local = select_signal_address(pernet, - msk->pm.add_addr_signaled); + local = select_signal_address(pernet, msk); + + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. + */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &local->addr, false); mptcp_pm_nl_addr_send_ack(msk); } - } else { - /* pick failed, avoid fourther attempts later */ - msk->pm.local_addr_used = add_addr_signal_max; } - - check_work_pending(msk); } /* check if should create a new subflow */ - if (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max && - !READ_ONCE(msk->pm.remote_deny_join_id0)) { + while (msk->pm.local_addr_used < local_addr_max && + msk->pm.subflows < subflows_max) { + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; + bool fullmesh; + int i, nr; + local = select_local_address(pernet, msk); - if (local) { - bool fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); - struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; - int i, nr; + if (!local) + break; - msk->pm.local_addr_used++; - check_work_pending(msk); - nr = fill_remote_addresses_vec(msk, fullmesh, addrs); - spin_unlock_bh(&msk->pm.lock); - for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); - spin_lock_bh(&msk->pm.lock); - return; - } + fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); - /* lookup failed, avoid fourther attempts later */ - msk->pm.local_addr_used = local_addr_max; - check_work_pending(msk); + msk->pm.local_addr_used++; + nr = fill_remote_addresses_vec(msk, fullmesh, addrs); + if (nr) + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); + spin_lock_bh(&msk->pm.lock); } + mptcp_pm_nl_check_work_pending(msk); } static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) @@ -547,11 +637,10 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, unsigned int subflows_max; int i = 0; - pernet = net_generic(sock_net(sk), pm_nl_pernet_id); + pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); rcu_read_lock(); - __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; @@ -603,15 +692,20 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); - if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) - goto add_addr_echo; + remote = msk->pm.remote; + mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_nl_addr_send_ack(msk); + + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) + return; + + /* pick id 0 port, if none is provided the remote address */ + if (!remote.port) + remote.port = sk->sk_dport; /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ - remote = msk->pm.remote; - if (!remote.port) - remote.port = sk->sk_dport; nr = fill_local_addresses_vec(msk, addrs); msk->pm.add_addr_accepted++; @@ -623,10 +717,6 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) for (i = 0; i < nr; i++) __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); - -add_addr_echo: - mptcp_pm_announce_addr(msk, &msk->pm.remote, true); - mptcp_pm_nl_addr_send_ack(msk); } void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) @@ -640,23 +730,15 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) !mptcp_pm_should_rm_signal(msk)) return; - __mptcp_flush_join_list(msk); subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); - if (subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - spin_unlock_bh(&msk->pm.lock); - pr_debug("send ack for %s", - mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"); - - mptcp_subflow_send_ack(ssk); - spin_lock_bh(&msk->pm.lock); - } + if (subflow) + mptcp_pm_send_ack(msk, subflow, false, false); } -static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup) +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup) { struct mptcp_subflow_context *subflow; @@ -664,29 +746,30 @@ static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info local; + struct mptcp_addr_info local, remote; local_address((struct sock_common *)ssk, &local); - if (!addresses_equal(&local, addr, addr->port)) + if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; - subflow->backup = bkup; - subflow->send_mp_prio = 1; - subflow->request_bkup = bkup; - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIOTX); - - spin_unlock_bh(&msk->pm.lock); - pr_debug("send ack for mp_prio"); - mptcp_subflow_send_ack(ssk); - spin_lock_bh(&msk->pm.lock); + if (rem && rem->family != AF_UNSPEC) { + remote_address((struct sock_common *)ssk, &remote); + if (!mptcp_addresses_equal(&remote, rem, rem->port)) + continue; + } + __mptcp_pm_send_ack(msk, subflow, true, bkup); return 0; } return -EINVAL; } +static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id) +{ + return local_id == id || (!local_id && msk->mpc_endpoint_id == id); +} + static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list, enum linux_mptcp_mib_field rm_type) @@ -700,6 +783,9 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, msk_owned_by_me(msk); + if (sk->sk_state == TCP_LISTEN) + return; + if (!rm_list->nr) return; @@ -707,34 +793,47 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, return; for (i = 0; i < rm_list->nr; i++) { - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + u8 rm_id = rm_list->ids[i]; + bool removed = false; + + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow->local_id; - if (rm_type == MPTCP_MIB_RMADDR) - id = subflow->remote_id; - - if (rm_list->ids[i] != id) + if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id) + continue; + if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id)) continue; - pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u", + pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", - i, rm_list->ids[i], subflow->local_id, subflow->remote_id); + i, rm_id, subflow->local_id, subflow->remote_id, + msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); + + /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); - if (rm_type == MPTCP_MIB_RMADDR) { - msk->pm.add_addr_accepted--; - WRITE_ONCE(msk->pm.accept_addr, true); - } else if (rm_type == MPTCP_MIB_RMSUBFLOW) { - msk->pm.local_addr_used--; - } - msk->pm.subflows--; + removed = true; __MPTCP_INC_STATS(sock_net(sk), rm_type); } + if (rm_type == MPTCP_MIB_RMSUBFLOW) + __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap); + if (!removed) + continue; + + if (!mptcp_pm_is_kernel(msk)) + continue; + + if (rm_type == MPTCP_MIB_RMADDR) { + msk->pm.add_addr_accepted--; + WRITE_ONCE(msk->pm.accept_addr, true); + } else if (rm_type == MPTCP_MIB_RMSUBFLOW) { + msk->pm.local_addr_used--; + } } } @@ -755,6 +854,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) msk_owned_by_me(msk); + if (!(pm->status & MPTCP_PM_WORK_MASK)) + return; + spin_lock_bh(&msk->pm.lock); pr_debug("msk=%p status=%x", msk, pm->status); @@ -789,10 +891,18 @@ static bool address_use_port(struct mptcp_pm_addr_entry *entry) MPTCP_PM_ADDR_FLAG_SIGNAL; } +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); +} + static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry) { - struct mptcp_pm_addr_entry *cur; + struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; int ret = -EINVAL; @@ -800,7 +910,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, /* to keep the code simple, don't do IDR-like allocation for address ID, * just bail when we exceed limits */ - if (pernet->next_id == MAX_ADDR_ID) + if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; if (pernet->addrs >= MPTCP_PM_ADDR_MAX) goto out; @@ -810,26 +920,40 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, /* do not insert duplicate address, differentiate on port only * singled addresses */ + if (!address_use_port(entry)) + entry->addr.port = 0; list_for_each_entry(cur, &pernet->local_addr_list, list) { - if (addresses_equal(&cur->addr, &entry->addr, - address_use_port(entry) && - address_use_port(cur))) - goto out; + if (mptcp_addresses_equal(&cur->addr, &entry->addr, + cur->addr.port || entry->addr.port)) { + /* allow replacing the exiting endpoint only if such + * endpoint is an implicit one and the user-space + * did not provide an endpoint id + */ + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) + goto out; + if (entry->addr.id) + goto out; + + pernet->addrs--; + entry->addr.id = cur->addr.id; + list_del_rcu(&cur->list); + del_entry = cur; + break; + } } if (!entry->addr.id) { find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, - MAX_ADDR_ID + 1, + MPTCP_PM_MAX_ADDR_ID + 1, pernet->next_id); - if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) && - pernet->next_id != 1) { + if (!entry->addr.id && pernet->next_id != 1) { pernet->next_id = 1; goto find_next; } } - if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID) + if (!entry->addr.id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); @@ -846,17 +970,27 @@ find_next: } pernet->addrs++; - list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + if (!entry->addr.port) + list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + else + list_add_rcu(&entry->list, &pernet->local_addr_list); ret = entry->addr.id; out: spin_unlock_bh(&pernet->lock); + + /* just replaced an existing entry, free it */ + if (del_entry) { + synchronize_rcu(); + __mptcp_pm_release_addr_entry(del_entry); + } return ret; } static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { + int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct mptcp_sock *msk; struct socket *ssock; @@ -881,8 +1015,11 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, } mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); - err = kernel_bind(ssock, (struct sockaddr *)&addr, - sizeof(struct sockaddr_in)); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + addrlen = sizeof(struct sockaddr_in6); +#endif + err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); if (err) { pr_warn("kernel_bind error, err=%d", err); goto out; @@ -917,17 +1054,17 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) */ local_address((struct sock_common *)msk, &msk_local); local_address((struct sock_common *)skc, &skc_local); - if (addresses_equal(&msk_local, &skc_local, false)) + if (mptcp_addresses_equal(&msk_local, &skc_local, false)) return 0; - if (address_zero(&skc_local)) - return 0; + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_local_id(msk, &skc_local); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { + if (mptcp_addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { ret = entry->addr.id; break; } @@ -945,7 +1082,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; - entry->flags = 0; + entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) @@ -954,18 +1091,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return ret; } -void mptcp_pm_nl_data_init(struct mptcp_sock *msk) -{ - struct mptcp_pm_data *pm = &msk->pm; - bool subflows; - - subflows = !!mptcp_pm_get_subflows_max(msk); - WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) || - !!mptcp_pm_get_add_addr_signal_max(msk)); - WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows); - WRITE_ONCE(pm->accept_subflow, subflows); -} - #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 @@ -993,6 +1118,10 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { NLA_POLICY_NESTED(mptcp_pm_addr_policy), [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ATTR_ADDR_REMOTE] = + NLA_POLICY_NESTED(mptcp_pm_addr_policy), }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -1022,7 +1151,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss } unlock_sock_fast(ssk, slow); - /* always try to push the pending data regarless of re-injections: + /* always try to push the pending data regardless of re-injections: * we can possibly use backup subflows now, and subflow selection * is cheap under the msk socket lock */ @@ -1041,11 +1170,12 @@ static int mptcp_pm_family_to_addr(int family) return MPTCP_PM_ADDR_ATTR_ADDR4; } -static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, - bool require_family, - struct mptcp_pm_addr_entry *entry) +static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], + const struct nlattr *attr, + struct genl_info *info, + struct mptcp_addr_info *addr, + bool require_family) { - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err, addr_addr; if (!attr) { @@ -1059,27 +1189,29 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, if (err) return err; - memset(entry, 0, sizeof(*entry)); + if (tb[MPTCP_PM_ADDR_ATTR_ID]) + addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); + if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) - goto skip_family; + return err; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } - entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); - if (entry->addr.family != AF_INET + addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); + if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) - && entry->addr.family != AF_INET6 + && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } - addr_addr = mptcp_pm_family_to_addr(entry->addr.family); + addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); @@ -1087,40 +1219,59 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (entry->addr.family == AF_INET6) - entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]); + if (addr->family == AF_INET6) + addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif - entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]); + addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); + + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + + return err; +} + +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + memset(addr, 0, sizeof(*addr)); + + return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); +} + +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + int err; + + memset(entry, 0, sizeof(*entry)); + + err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); + if (err) + return err; -skip_family: if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } - if (tb[MPTCP_PM_ADDR_ATTR_ID]) - entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); - if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); - if (tb[MPTCP_PM_ADDR_ATTR_PORT]) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - NL_SET_ERR_MSG_ATTR(info->extack, attr, - "flags must have signal when using port"); - return -EINVAL; - } + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); - } return 0; } static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) { - return net_generic(genl_info_net(info), pm_nl_pernet_id); + return pm_nl_get_pernet(genl_info_net(info)); } static int mptcp_nl_add_subflow_or_signal_addr(struct net *net) @@ -1131,7 +1282,8 @@ static int mptcp_nl_add_subflow_or_signal_addr(struct net *net) while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; - if (!READ_ONCE(msk->fully_established)) + if (!READ_ONCE(msk->fully_established) || + mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); @@ -1155,11 +1307,27 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) struct mptcp_pm_addr_entry addr, *entry; int ret; - ret = mptcp_pm_parse_addr(attr, info, true, &addr); + ret = mptcp_pm_parse_entry(attr, info, true, &addr); if (ret < 0) return ret; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + GENL_SET_ERR_MSG(info, "flags must have signal when using port"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); + return -EINVAL; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); return -ENOMEM; @@ -1188,29 +1356,25 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct mptcp_pm_addr_entry * -__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) -{ - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry(entry, &pernet->local_addr_list, list) { - if (entry->addr.id == id) - return entry; - } - return NULL; -} - -int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, +int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *entry; + struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); *flags = 0; *ifindex = 0; if (id) { + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, + id, + flags, + ifindex); + rcu_read_lock(); - entry = __lookup_addr_by_id(net_generic(net, pm_nl_pernet_id), id); + entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id); if (entry) { *flags = entry->flags; *ifindex = entry->ifindex; @@ -1222,7 +1386,7 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, } static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) + const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; @@ -1237,7 +1401,7 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, } static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, + const struct mptcp_addr_info *addr, bool force) { struct mptcp_rm_list list = { .nr = 0 }; @@ -1255,11 +1419,12 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, } static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, - struct mptcp_addr_info *addr) + const struct mptcp_pm_addr_entry *entry) { - struct mptcp_sock *msk; - long s_slot = 0, s_num = 0; + const struct mptcp_addr_info *addr = &entry->addr; struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; pr_debug("remove_id=%d", addr->id); @@ -1269,6 +1434,9 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, struct sock *sk = (struct sock *)msk; bool remove_subflow; + if (mptcp_pm_is_userspace(msk)) + goto next; + if (list_empty(&msk->conn_list)) { mptcp_pm_remove_anno_addr(msk, addr, false); goto next; @@ -1276,7 +1444,8 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, lock_sock(sk); remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); - mptcp_pm_remove_anno_addr(msk, addr, remove_subflow); + mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && + !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); if (remove_subflow) mptcp_pm_remove_subflow(msk, &list); release_sock(sk); @@ -1289,14 +1458,6 @@ next: return 0; } -/* caller must ensure the RCU grace period is already elapsed */ -static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); -} - static int mptcp_nl_remove_id_zero_address(struct net *net, struct mptcp_addr_info *addr) { @@ -1310,11 +1471,11 @@ static int mptcp_nl_remove_id_zero_address(struct net *net, struct sock *sk = (struct sock *)msk; struct mptcp_addr_info msk_local; - if (list_empty(&msk->conn_list)) + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; local_address((struct sock_common *)msk, &msk_local); - if (!addresses_equal(&msk_local, addr, addr->port)) + if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) goto next; lock_sock(sk); @@ -1340,7 +1501,7 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) unsigned int addr_max; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1373,29 +1534,27 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); - mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr); + mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); synchronize_rcu(); __mptcp_pm_release_addr_entry(entry); return ret; } -static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX && - slist.nr < MPTCP_RM_IDS_MAX) { - alist.ids[alist.nr++] = entry->addr.id; + slist.nr < MPTCP_RM_IDS_MAX) slist.ids[slist.nr++] = entry->addr.id; - } else if (remove_anno_list_by_saddr(msk, &entry->addr) && - alist.nr < MPTCP_RM_IDS_MAX) { + + if (remove_anno_list_by_saddr(msk, &entry->addr) && + alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; - } } if (alist.nr) { @@ -1419,9 +1578,11 @@ static void mptcp_nl_remove_addrs_list(struct net *net, while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; - lock_sock(sk); - mptcp_pm_remove_addrs_and_subflows(msk, rm_list); - release_sock(sk); + if (!mptcp_pm_is_userspace(msk)) { + lock_sock(sk); + mptcp_pm_remove_addrs_and_subflows(msk, rm_list); + release_sock(sk); + } sock_put(sk); cond_resched(); @@ -1458,7 +1619,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); + bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); synchronize_rcu(); @@ -1514,7 +1675,7 @@ static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) void *reply; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1565,10 +1726,10 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, void *hdr; int i; - pernet = net_generic(net, pm_nl_pernet_id); + pernet = pm_nl_get_pernet(net); spin_lock_bh(&pernet->lock); - for (i = id; i < MAX_ADDR_ID + 1; i++) { + for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { if (test_bit(i, pernet->id_bitmap)) { entry = __lookup_addr_by_id(pernet, i); if (!entry) @@ -1672,9 +1833,22 @@ fail: return -EMSGSIZE; } -static int mptcp_nl_addr_backup(struct net *net, - struct mptcp_addr_info *addr, - u8 bkup) +static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + + list.ids[list.nr++] = addr->id; + + spin_lock_bh(&msk->pm.lock); + mptcp_pm_nl_rm_subflow_received(msk, &list); + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); +} + +static int mptcp_nl_set_flags(struct net *net, + struct mptcp_addr_info *addr, + u8 bkup, u8 changed) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; @@ -1683,13 +1857,14 @@ static int mptcp_nl_addr_backup(struct net *net, while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; - if (list_empty(&msk->conn_list)) + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup); - spin_unlock_bh(&msk->pm.lock); + if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) + ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); + if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) + mptcp_pm_nl_fullmesh(msk, addr); release_sock(sk); next: @@ -1702,31 +1877,58 @@ next: static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) { + struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry; + struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; + u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | + MPTCP_PM_ADDR_FLAG_FULLMESH; struct net *net = sock_net(skb->sk); - u8 bkup = 0; + u8 bkup = 0, lookup_by_id = 0; int ret; - ret = mptcp_pm_parse_addr(attr, info, true, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; + if (attr_rem) { + ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote); + if (ret < 0) + return ret; + } + if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; + if (addr.addr.family == AF_UNSPEC) { + lookup_by_id = 1; + if (!addr.addr.id) + return -EOPNOTSUPP; + } - list_for_each_entry(entry, &pernet->local_addr_list, list) { - if (addresses_equal(&entry->addr, &addr.addr, true)) { - mptcp_nl_addr_backup(net, &entry->addr, bkup); + if (token) + return mptcp_userspace_pm_set_flags(sock_net(skb->sk), + token, &addr, &remote, bkup); - if (bkup) - entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; - else - entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; - } + spin_lock_bh(&pernet->lock); + entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); + if (!entry) { + spin_unlock_bh(&pernet->lock); + return -EINVAL; + } + if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && + (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + spin_unlock_bh(&pernet->lock); + return -EINVAL; } + changed = (addr.flags ^ entry->flags) & mask; + entry->flags = (entry->flags & ~mask) | (addr.flags & mask); + addr = *entry; + spin_unlock_bh(&pernet->lock); + + mptcp_nl_set_flags(net, &addr.addr, bkup, changed); return 0; } @@ -1736,6 +1938,13 @@ static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gf nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); } +bool mptcp_userspace_pm_active(const struct mptcp_sock *msk) +{ + return genl_has_listeners(&mptcp_genl_family, + sock_net((const struct sock *)msk), + MPTCP_PM_EV_GRP_OFFSET); +} + static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) { const struct inet_sock *issk = inet_sk(ssk); @@ -1856,6 +2065,9 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } @@ -1890,10 +2102,12 @@ nla_put_failure: kfree_skb(skb); } -void mptcp_event_addr_announced(const struct mptcp_sock *msk, +void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info) { - struct net *net = sock_net((const struct sock *)msk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; @@ -1915,7 +2129,10 @@ void mptcp_event_addr_announced(const struct mptcp_sock *msk, if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) goto nla_put_failure; - if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port)) + if (nla_put_be16(skb, MPTCP_ATTR_DPORT, + info->port == 0 ? + inet_sk(ssk)->inet_dport : + info->port)) goto nla_put_failure; switch (info->family) { @@ -2001,17 +2218,17 @@ static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_ADD_ADDR, .doit = mptcp_nl_cmd_add_addr, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_DEL_ADDR, .doit = mptcp_nl_cmd_del_addr, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, .doit = mptcp_nl_cmd_flush_addrs, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_GET_ADDR, @@ -2021,7 +2238,7 @@ static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_SET_LIMITS, .doit = mptcp_nl_cmd_set_limits, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_GET_LIMITS, @@ -2030,7 +2247,27 @@ static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_SET_FLAGS, .doit = mptcp_nl_cmd_set_flags, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_ANNOUNCE, + .doit = mptcp_nl_cmd_announce, + .flags = GENL_UNS_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_REMOVE, + .doit = mptcp_nl_cmd_remove, + .flags = GENL_UNS_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, + .doit = mptcp_nl_cmd_sf_create, + .flags = GENL_UNS_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, + .doit = mptcp_nl_cmd_sf_destroy, + .flags = GENL_UNS_ADMIN_PERM, }, }; @@ -2043,13 +2280,14 @@ static struct genl_family mptcp_genl_family __ro_after_init = { .module = THIS_MODULE, .small_ops = mptcp_pm_ops, .n_small_ops = ARRAY_SIZE(mptcp_pm_ops), + .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1, .mcgrps = mptcp_pm_mcgrps, .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; static int __net_init pm_nl_init_net(struct net *net) { - struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); @@ -2071,7 +2309,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { - struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); /* net is removed from namespace list, can't race with * other modifiers, also netns core already waited for a diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c new file mode 100644 index 000000000000..9e82250cbb70 --- /dev/null +++ b/net/mptcp/pm_userspace.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2022, Intel Corporation. + */ + +#include "protocol.h" +#include "mib.h" + +void mptcp_free_local_addr_list(struct mptcp_sock *msk) +{ + struct mptcp_pm_addr_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; + LIST_HEAD(free_list); + + if (!mptcp_pm_is_userspace(msk)) + return; + + spin_lock_bh(&msk->pm.lock); + list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); + spin_unlock_bh(&msk->pm.lock); + + list_for_each_entry_safe(entry, tmp, &free_list, list) { + sock_kfree_s(sk, entry, sizeof(*entry)); + } +} + +int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) +{ + DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_pm_addr_entry *match = NULL; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *e; + bool addr_match = false; + bool id_match = false; + int ret = -EINVAL; + + bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { + addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); + if (addr_match && entry->addr.id == 0) + entry->addr.id = e->addr.id; + id_match = (e->addr.id == entry->addr.id); + if (addr_match && id_match) { + match = e; + break; + } else if (addr_match || id_match) { + break; + } + __set_bit(e->addr.id, id_bitmap); + } + + if (!match && !addr_match && !id_match) { + /* Memory for the entry is allocated from the + * sock option buffer. + */ + e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); + if (!e) { + spin_unlock_bh(&msk->pm.lock); + return -ENOMEM; + } + + *e = *entry; + if (!e->addr.id) + e->addr.id = find_next_zero_bit(id_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, + 1); + list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); + ret = e->addr.id; + } else if (match) { + ret = entry->addr.id; + } + + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, + u8 *flags, int *ifindex) +{ + struct mptcp_pm_addr_entry *entry, *match = NULL; + + *flags = 0; + *ifindex = 0; + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (id == entry->addr.id) { + match = entry; + break; + } + } + spin_unlock_bh(&msk->pm.lock); + if (match) { + *flags = match->flags; + *ifindex = match->ifindex; + } + + return 0; +} + +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, + struct mptcp_addr_info *skc) +{ + struct mptcp_pm_addr_entry new_entry; + __be16 msk_sport = ((struct inet_sock *) + inet_sk((struct sock *)msk))->inet_sport; + + memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); + new_entry.addr = *skc; + new_entry.addr.id = 0; + new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; + + if (new_entry.addr.port == msk_sport) + new_entry.addr.port = 0; + + return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry); +} + +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_pm_addr_entry addr_val; + struct mptcp_sock *msk; + int err = -EINVAL; + u32 token_val; + + if (!addr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto announce_err; + } + + err = mptcp_pm_parse_entry(addr, info, true, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "error parsing local address"); + goto announce_err; + } + + if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + goto announce_err; + } + + err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "did not match address and id"); + goto announce_err; + } + + lock_sock((struct sock *)msk); + spin_lock_bh(&msk->pm.lock); + + if (mptcp_pm_alloc_anno_list(msk, &addr_val)) { + mptcp_pm_announce_addr(msk, &addr_val.addr, false); + mptcp_pm_nl_addr_send_ack(msk); + } + + spin_unlock_bh(&msk->pm.lock); + release_sock((struct sock *)msk); + + err = 0; + announce_err: + sock_put((struct sock *)msk); + return err; +} + +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; + struct mptcp_pm_addr_entry *match = NULL; + struct mptcp_pm_addr_entry *entry; + struct mptcp_sock *msk; + LIST_HEAD(free_list); + int err = -EINVAL; + u32 token_val; + u8 id_val; + + if (!id || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + id_val = nla_get_u8(id); + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto remove_err; + } + + lock_sock((struct sock *)msk); + + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (entry->addr.id == id_val) { + match = entry; + break; + } + } + + if (!match) { + GENL_SET_ERR_MSG(info, "address with specified id not found"); + release_sock((struct sock *)msk); + goto remove_err; + } + + list_move(&match->list, &free_list); + + mptcp_pm_remove_addrs_and_subflows(msk, &free_list); + + release_sock((struct sock *)msk); + + list_for_each_entry_safe(match, entry, &free_list, list) { + sock_kfree_s((struct sock *)msk, match, sizeof(*match)); + } + + err = 0; + remove_err: + sock_put((struct sock *)msk); + return err; +} + +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_r; + struct mptcp_addr_info addr_l; + struct mptcp_sock *msk; + int err = -EINVAL; + struct sock *sk; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto create_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + goto create_err; + } + + if (addr_l.id == 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); + goto create_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto create_err; + } + + sk = &msk->sk.icsk_inet.sk; + lock_sock(sk); + + err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); + + release_sock(sk); + + create_err: + sock_put((struct sock *)msk); + return err; +} + +static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, + const struct mptcp_addr_info *local, + const struct mptcp_addr_info *remote) +{ + struct mptcp_subflow_context *subflow; + + if (local->family != remote->family) + return NULL; + + mptcp_for_each_subflow(msk, subflow) { + const struct inet_sock *issk; + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + + if (local->family != ssk->sk_family) + continue; + + issk = inet_sk(ssk); + + switch (ssk->sk_family) { + case AF_INET: + if (issk->inet_saddr != local->addr.s_addr || + issk->inet_daddr != remote->addr.s_addr) + continue; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *pinfo = inet6_sk(ssk); + + if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) + continue; + break; + } +#endif + default: + continue; + } + + if (issk->inet_sport == local->port && + issk->inet_dport == remote->port) + return ssk; + } + + return NULL; +} + +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_l; + struct mptcp_addr_info addr_r; + struct mptcp_sock *msk; + struct sock *sk, *ssk; + int err = -EINVAL; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto destroy_err; + } + + if (addr_l.family != addr_r.family) { + GENL_SET_ERR_MSG(info, "address families do not match"); + goto destroy_err; + } + + if (!addr_l.port || !addr_r.port) { + GENL_SET_ERR_MSG(info, "missing local or remote port"); + goto destroy_err; + } + + sk = &msk->sk.icsk_inet.sk; + lock_sock(sk); + ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); + if (ssk) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); + mptcp_close_ssk(sk, ssk, subflow); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); + err = 0; + } else { + err = -ESRCH; + } + release_sock(sk); + +destroy_err: + sock_put((struct sock *)msk); + return err; +} + +int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, + struct mptcp_pm_addr_entry *loc, + struct mptcp_pm_addr_entry *rem, u8 bkup) +{ + struct mptcp_sock *msk; + int ret = -EINVAL; + u32 token_val; + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(net, token_val); + if (!msk) + return ret; + + if (!mptcp_pm_is_userspace(msk)) + goto set_flags_err; + + if (loc->addr.family == AF_UNSPEC || + rem->addr.family == AF_UNSPEC) + goto set_flags_err; + + lock_sock((struct sock *)msk); + ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup); + release_sock((struct sock *)msk); + +set_flags_err: + sock_put((struct sock *)msk); + return ret; +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c82a76d2d0bf..b6dc6e260334 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -22,6 +22,7 @@ #endif #include <net/mptcp.h> #include <net/xfrm.h> +#include <asm/ioctls.h> #include "protocol.h" #include "mib.h" @@ -46,9 +47,10 @@ struct mptcp_skb_cb { enum { MPTCP_CMSG_TS = BIT(0), + MPTCP_CMSG_INQ = BIT(1), }; -static struct percpu_counter mptcp_sockets_allocated; +static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); static void __mptcp_check_send_data_fin(struct sock *sk); @@ -115,6 +117,9 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; + + /* This is the first subflow, always with id 0 */ + subflow->local_id_valid = 1; mptcp_sock_graft(msk->first, sk->sk_socket); return 0; @@ -145,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; - kfree_skb_partial(from, fragstolen); + + /* note the fwd memory can reach a negative value after accounting + * for the delta, but the later skb free will restore a non + * negative one + */ atomic_add(delta, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, delta); + kfree_skb_partial(from, fragstolen); + return true; } @@ -162,8 +173,8 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, static void __mptcp_rmem_reclaim(struct sock *sk, int amount) { - amount >>= SK_MEM_QUANTUM_SHIFT; - mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + amount >>= PAGE_SHIFT; + mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT; __sk_mem_reduce_allocated(sk, amount); } @@ -176,8 +187,8 @@ static void mptcp_rmem_uncharge(struct sock *sk, int size) reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); /* see sk_mem_uncharge() for the rationale behind the following schema */ - if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD)) - __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK); + if (unlikely(reclaimable >= PAGE_SIZE)) + __mptcp_rmem_reclaim(sk, reclaimable); } static void mptcp_rfree(struct sk_buff *skb) @@ -211,7 +222,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - max_seq = READ_ONCE(msk->rcv_wnd_sent); + max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); @@ -220,7 +231,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, - (unsigned long long)msk->rcv_wnd_sent); + (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -318,20 +329,16 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) struct mptcp_sock *msk = mptcp_sk(sk); int amt, amount; - if (size < msk->rmem_fwd_alloc) + if (size <= msk->rmem_fwd_alloc) return true; + size -= msk->rmem_fwd_alloc; amt = sk_mem_pages(size); - amount = amt << SK_MEM_QUANTUM_SHIFT; - msk->rmem_fwd_alloc += amount; - if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) { - if (ssk->sk_forward_alloc < amount) { - msk->rmem_fwd_alloc -= amount; - return false; - } + amount = amt << PAGE_SHIFT; + if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) + return false; - ssk->sk_forward_alloc -= amount; - } + msk->rmem_fwd_alloc += amount; return true; } @@ -464,9 +471,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) static void mptcp_set_datafin_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + u32 retransmits; + + retransmits = min_t(u32, icsk->icsk_retransmits, + ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); - mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, - TCP_RTO_MIN << icsk->icsk_retransmits); + mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) @@ -492,19 +502,24 @@ static void mptcp_set_timeout(struct sock *sk) __mptcp_set_timeout(sk, tout); } -static bool tcp_can_send_ack(const struct sock *ssk) +static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } -void mptcp_subflow_send_ack(struct sock *ssk) +void __mptcp_subflow_send_ack(struct sock *ssk) +{ + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); +} + +static void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); - if (tcp_can_send_ack(ssk)) - tcp_send_ack(ssk); + __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } @@ -647,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { - /* if no data is found, a racing workqueue/recvmsg - * already processed the new data, stop here or we - * can enter an infinite loop + /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), + * a different CPU can have already processed the pending + * data, stop here or we can enter an infinite loop */ if (!moved) done = true; @@ -657,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } if (__mptcp_check_fallback(msk)) { - /* if we are running under the workqueue, TCP could have - * collapsed skbs between dummy map creation and now - * be sure to adjust the size + /* Under fallback skbs have no MPTCP extension and TCP could + * collapse them between the dummy map creation and the + * current dequeue. Be sure to adjust the map size. */ map_remaining = skb->len; subflow->map_data_len = skb->len; @@ -738,6 +753,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; + MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->ack_seq = end_seq; @@ -760,7 +776,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else - set_bit(MPTCP_ERROR_REPORT, &msk->flags); + __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); } /* If the moves have caught up with the DATA_FIN sequence number @@ -805,47 +821,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) mptcp_data_unlock(sk); } -static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) +static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { - struct mptcp_subflow_context *subflow; - bool ret = false; + struct sock *sk = (struct sock *)msk; - if (likely(list_empty(&msk->join_list))) + if (sk->sk_state != TCP_ESTABLISHED) return false; - spin_lock_bh(&msk->join_list_lock); - list_for_each_entry(subflow, &msk->join_list, node) { - u32 sseq = READ_ONCE(subflow->setsockopt_seq); - - mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); - if (READ_ONCE(msk->setsockopt_seq) != sseq) - ret = true; - } - list_splice_tail_init(&msk->join_list, &msk->conn_list); - spin_unlock_bh(&msk->join_list_lock); - - return ret; -} - -void __mptcp_flush_join_list(struct mptcp_sock *msk) -{ - if (likely(!mptcp_do_flush_join_list(msk))) - return; + /* attach to msk socket only after we are sure we will deal with it + * at close time + */ + if (sk->sk_socket && !ssk->sk_socket) + mptcp_sock_graft(ssk, sk->sk_socket); - if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) - mptcp_schedule_work((struct sock *)msk); + mptcp_propagate_sndbuf((struct sock *)msk, ssk); + mptcp_sockopt_sync_locked(msk, ssk); + return true; } -static void mptcp_flush_join_list(struct mptcp_sock *msk) +static void __mptcp_flush_join_list(struct sock *sk) { - bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); - - might_sleep(); + struct mptcp_subflow_context *tmp, *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); - if (!mptcp_do_flush_join_list(msk) && !sync_needed) - return; + list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); - mptcp_sockopt_sync_all(msk); + list_move_tail(&subflow->node, &msk->conn_list); + if (!__mptcp_finish_join(msk, ssk)) + mptcp_subflow_reset(ssk); + unlock_sock_fast(ssk, slow); + } } static bool mptcp_timer_pending(struct sock *sk) @@ -966,23 +973,6 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static void __mptcp_mem_reclaim_partial(struct sock *sk) -{ - int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk); - - lockdep_assert_held_once(&sk->sk_lock.slock); - - __mptcp_rmem_reclaim(sk, reclaimable - 1); - sk_mem_reclaim_partial(sk); -} - -static void mptcp_mem_reclaim_partial(struct sock *sk) -{ - mptcp_data_lock(sk); - __mptcp_mem_reclaim_partial(sk); - mptcp_data_unlock(sk); -} - static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -1002,7 +992,6 @@ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - bool cleaned = false; u64 snd_una; /* on fallback we just need to ignore snd_una, as this is really @@ -1025,7 +1014,6 @@ static void __mptcp_clean_una(struct sock *sk) } dfrag_clear(sk, dfrag); - cleaned = true; } dfrag = mptcp_rtx_head(sk); @@ -1047,7 +1035,6 @@ static void __mptcp_clean_una(struct sock *sk) dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); - cleaned = true; } /* all retransmitted data acked, recovery completed */ @@ -1055,9 +1042,6 @@ static void __mptcp_clean_una(struct sock *sk) msk->recovery = false; out: - if (cleaned && tcp_under_memory_pressure(sk)) - __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt) && snd_una == READ_ONCE(msk->write_seq)) { if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) @@ -1139,18 +1123,21 @@ struct mptcp_sendmsg_info { bool data_lock_held; }; -static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, - int avail_size) +static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, + u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); + u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; - if (!before64(data_seq + avail_size, window_end)) { - u64 allowed_size = window_end - data_seq; + mptcp_snd_wnd = window_end - data_seq; + avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); - return min_t(unsigned int, allowed_size, avail_size); + if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { + tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; @@ -1197,6 +1184,7 @@ static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, g tcp_skb_entail(ssk, skb); return skb; } + tcp_skb_tsorted_anchor_cleanup(skb); kfree_skb(skb); return NULL; } @@ -1205,12 +1193,6 @@ static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, boo { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; - if (unlikely(tcp_under_memory_pressure(sk))) { - if (data_lock_held) - __mptcp_mem_reclaim_partial(sk); - else - mptcp_mem_reclaim_partial(sk); - } return __mptcp_alloc_tx_skb(sk, ssk, gfp); } @@ -1226,6 +1208,22 @@ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } +static void mptcp_update_infinite_map(struct mptcp_sock *msk, + struct sock *ssk, + struct mptcp_ext *mpext) +{ + if (!mpext) + return; + + mpext->infinite_map = 1; + mpext->data_len = 0; + + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); + mptcp_subflow_ctx(ssk)->send_infinite_map = 0; + pr_fallback(msk); + mptcp_do_fallback(ssk); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1248,6 +1246,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, info->limit > dfrag->data_len)) return 0; + if (unlikely(!__tcp_can_send(ssk))) + return -EAGAIN; + /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; @@ -1268,7 +1269,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } @@ -1286,7 +1287,7 @@ alloc_skb: } /* Zero window and all data acked? Probe. */ - copy = mptcp_check_allowed_size(msk, data_seq, copy); + copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); @@ -1357,6 +1358,9 @@ alloc_skb: out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); + if (mptcp_subflow_ctx(ssk)->send_infinite_map) + mptcp_update_infinite_map(msk, ssk, mpext); + trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } @@ -1369,7 +1373,7 @@ out: struct subflow_send_info { struct sock *ssk; - u64 ratio; + u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) @@ -1394,27 +1398,32 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) return __mptcp_subflow_active(subflow); } +#define SSK_MODE_ACTIVE 0 +#define SSK_MODE_BACKUP 1 +#define SSK_MODE_MAX 2 + /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { - struct subflow_send_info send_info[2]; + struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; + u64 linger_time; long tout = 0; - u64 ratio; - u32 pace; sock_owned_by_me(sk); if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; - return sk_stream_memory_free(msk->first) ? msk->first : NULL; + return __tcp_can_send(msk->first) && + sk_stream_memory_free(msk->first) ? msk->first : NULL; } /* re-use last subflow, if the burst allow that */ @@ -1426,10 +1435,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } /* pick the subflow with the lower wmem/wspace ratio */ - for (i = 0; i < 2; ++i) { + for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; - send_info[i].ratio = -1; + send_info[i].linger_time = -1; } + mptcp_for_each_subflow(msk, subflow) { trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); @@ -1438,34 +1448,56 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !subflow->backup; - if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) - continue; - - pace = READ_ONCE(ssk->sk_pacing_rate); - if (!pace) - continue; + pace = subflow->avg_pacing_rate; + if (unlikely(!pace)) { + /* init pacing rate from socket */ + subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); + pace = subflow->avg_pacing_rate; + if (!pace) + continue; + } - ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, - pace); - if (ratio < send_info[subflow->backup].ratio) { + linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); + if (linger_time < send_info[subflow->backup].linger_time) { send_info[subflow->backup].ssk = ssk; - send_info[subflow->backup].ratio = ratio; + send_info[subflow->backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) - send_info[0].ssk = send_info[1].ssk; + send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; + + /* According to the blest algorithm, to avoid HoL blocking for the + * faster flow, we need to: + * - estimate the faster flow linger time + * - use the above to estimate the amount of byte transferred + * by the faster flow + * - check that the amount of queued data is greter than the above, + * otherwise do not use the picked, slower, subflow + * We select the subflow with the shorter estimated time to flush + * the queued mem, which basically ensure the above. We just need + * to check that subflow has a non empty cwin. + */ + ssk = send_info[SSK_MODE_ACTIVE].ssk; + if (!ssk || !sk_stream_memory_free(ssk)) + return NULL; - if (send_info[0].ssk) { - msk->last_snd = send_info[0].ssk; - msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, - tcp_sk(msk->last_snd)->snd_wnd); - return msk->last_snd; + burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); + wmem = READ_ONCE(ssk->sk_wmem_queued); + if (!burst) { + msk->last_snd = NULL; + return ssk; } - return NULL; + subflow = mptcp_subflow_ctx(ssk); + subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + + READ_ONCE(ssk->sk_pacing_rate) * burst, + burst + wmem); + msk->last_snd = ssk; + msk->snd_burst = burst; + return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) @@ -1499,11 +1531,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, msk->snd_nxt = snd_nxt_new; } -static void mptcp_check_and_set_pending(struct sock *sk) +void mptcp_check_and_set_pending(struct sock *sk) { - if (mptcp_send_head(sk) && - !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) - set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + if (mptcp_send_head(sk)) + mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); } void __mptcp_push_pending(struct sock *sk, unsigned int flags) @@ -1513,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) struct mptcp_sendmsg_info info = { .flags = flags, }; + bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len, copied = 0; + int len; while ((dfrag = mptcp_send_head(sk))) { info.sent = dfrag->already_sent; @@ -1524,7 +1556,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) int ret = 0; prev_ssk = ssk; - mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); /* First check. If the ssk has changed since @@ -1544,12 +1575,14 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { + if (ret == -EAGAIN) + continue; mptcp_push_release(ssk, &info); goto out; } + do_check_data_fin = true; info.sent += ret; - copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); @@ -1565,7 +1598,7 @@ out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - if (copied) + if (do_check_data_fin) __mptcp_check_send_data_fin(sk); } @@ -1640,10 +1673,42 @@ static void mptcp_set_nospace(struct sock *sk) set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } +static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, + size_t len, int *copied_syn) +{ + unsigned int saved_flags = msg->msg_flags; + struct mptcp_sock *msk = mptcp_sk(sk); + int ret; + + lock_sock(ssk); + msg->msg_flags |= MSG_DONTWAIT; + msk->connect_flags = O_NONBLOCK; + msk->is_sendmsg = 1; + ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); + msk->is_sendmsg = 0; + msg->msg_flags = saved_flags; + release_sock(ssk); + + /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ + if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { + ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, msg->msg_flags, 1); + + /* Keep the same behaviour of plain TCP: zero the copied bytes in + * case of any error, except timeout or signal + */ + if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) + *copied_syn = 0; + } + + return ret; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; + struct socket *ssock; size_t copied = 0; int ret = 0; long timeo; @@ -1657,14 +1722,30 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) { + int copied_syn = 0; + + ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); + copied += copied_syn; + if (ret == -EINPROGRESS && copied_syn > 0) + goto out; + else if (ret) + goto do_error; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) - goto out; + goto do_error; } + ret = -EPIPE; + if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) + goto do_error; + pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { @@ -1673,11 +1754,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) bool dfrag_collapsed; size_t psize, offset; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ @@ -1709,7 +1785,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { ret = -EFAULT; - goto out; + goto do_error; } /* data successfully copied into the write queue */ @@ -1741,7 +1817,7 @@ wait_for_memory: __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) - goto out; + goto do_error; } if (copied) @@ -1749,7 +1825,14 @@ wait_for_memory: out: release_sock(sk); - return copied ? : ret; + return copied; + +do_error: + if (copied) + goto out; + + copied = sk_stream_error(sk, msg->msg_flags, ret); + goto out; } static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, @@ -1784,8 +1867,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, copied += count; if (count < data_len) { - if (!(flags & MSG_PEEK)) + if (!(flags & MSG_PEEK)) { MPTCP_SKB_CB(skb)->offset += count; + MPTCP_SKB_CB(skb)->map_seq += count; + } break; } @@ -1851,7 +1936,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvmem, rcvbuf; u64 rcvwin, grow; @@ -1869,7 +1954,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(rcvwin, advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; @@ -1927,7 +2012,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) unsigned int moved = 0; bool ret, done; - mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; @@ -1965,8 +2049,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) return !skb_queue_empty(&msk->receive_queue); } +static unsigned int mptcp_inq_hint(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + const struct sk_buff *skb; + + skb = skb_peek(&msk->receive_queue); + if (skb) { + u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + + if (hint_val >= INT_MAX) + return INT_MAX; + + return (unsigned int)hint_val; + } + + if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) + return 1; + + return 0; +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) + int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; @@ -1984,11 +2089,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out_err; } - timeo = sock_rcvtimeo(sk, nonblock); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + if (unlikely(msk->recvmsg_inq)) + cmsg_flags = MPTCP_CMSG_INQ; + while (copied < len) { int bytes_read; @@ -2062,6 +2170,12 @@ out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); + + if (cmsg_flags & MPTCP_CMSG_INQ) { + unsigned int inq = mptcp_inq_hint(sk); + + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + } } pr_debug("msk=%p rx queue empty=%d:%d copied=%d", @@ -2088,7 +2202,7 @@ static void mptcp_retransmit_timer(struct timer_list *t) mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ - set_bit(MPTCP_RETRANSMIT, &msk->flags); + __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); } bh_unlock_sock(sk); sock_put(sk); @@ -2196,6 +2310,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) return true; } +/* flags for __mptcp_close_ssk() */ +#define MPTCP_CF_PUSH BIT(1) +#define MPTCP_CF_FASTCLOSE BIT(2) + /* subflow sockets can be either outgoing (connect) or incoming * (accept). * @@ -2205,22 +2323,43 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) * parent socket. */ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow) + struct mptcp_subflow_context *subflow, + unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push; + bool need_push, dispose_it; - list_del(&subflow->node); + dispose_it = !msk->subflow || ssk != msk->subflow->sk; + if (dispose_it) + list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + if (flags & MPTCP_CF_FASTCLOSE) { + /* be sure to force the tcp_disconnect() path, + * to generate the egress reset + */ + ssk->sk_lingertime = 0; + sock_set_flag(ssk, SOCK_LINGER); + subflow->send_fastclose = 1; + } + + need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); + if (!dispose_it) { + tcp_disconnect(ssk, 0); + msk->subflow->state = SS_UNCONNECTED; + mptcp_subflow_ctx_reset(subflow); + release_sock(ssk); + + goto out; + } + /* if we are invoked by the msk cleanup code, the subflow is * already orphaned */ if (ssk->sk_socket) sock_orphan(ssk); - need_push = __mptcp_retransmit_pending_data(sk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2231,6 +2370,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(ssk); + inet_csk_listen_stop(ssk); + } __tcp_close(ssk, 0); /* close acquired an extra ref */ @@ -2240,14 +2384,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, sock_put(ssk); - if (ssk == msk->last_snd) - msk->last_snd = NULL; - if (ssk == msk->first) msk->first = NULL; - if (msk->subflow && ssk == msk->subflow->sk) - mptcp_dispose_initial_subflow(msk); +out: + if (ssk == msk->last_snd) + msk->last_snd = NULL; if (need_push) __mptcp_push_pending(sk, 0); @@ -2258,7 +2400,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, { if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); - __mptcp_close_ssk(sk, ssk, subflow); + + /* subflow aborted before reaching the fully_established status + * attempt the creation of the next subflow + */ + mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow); + + __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) @@ -2272,7 +2420,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) might_sleep(); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (inet_sk_state_load(ssk) != TCP_CLOSE) @@ -2315,7 +2463,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) mptcp_token_destroy(msk); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; @@ -2327,12 +2475,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) unlock_sock_fast(tcp_sk, slow); } + /* Mirror the tcp_reset() error propagation */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); - mptcp_close_wake_up(sk); + /* the calling mptcp_worker will properly destroy the socket */ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) @@ -2387,6 +2554,7 @@ static void __mptcp_retrans(struct sock *sk) dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); @@ -2398,10 +2566,60 @@ reset_timer: mptcp_reset_timer(sk); } +/* schedule the timeout timer for the relevant event: either close timeout + * or mp_fail timeout. The close timeout takes precedence on the mp_fail one + */ +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout) +{ + struct sock *sk = (struct sock *)msk; + unsigned long timeout, close_timeout; + + if (!fail_tout && !sock_flag(sk, SOCK_DEAD)) + return; + + close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN; + + /* the close timeout takes precedence on the fail one, and here at least one of + * them is active + */ + timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout; + + sk_reset_timer(sk, &sk->sk_timer, timeout); +} + +static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) +{ + struct sock *ssk = msk->first; + bool slow; + + if (!ssk) + return; + + pr_debug("MP_FAIL doesn't respond, reset the subflow"); + + slow = lock_sock_fast(ssk); + mptcp_subflow_reset(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); + unlock_sock_fast(ssk, slow); + + mptcp_reset_timeout(msk, 0); +} + +static void mptcp_do_fastclose(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), + subflow, MPTCP_CF_FASTCLOSE); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = &msk->sk.icsk_inet.sk; + unsigned long fail_tout; int state; lock_sock(sk); @@ -2410,12 +2628,10 @@ static void mptcp_worker(struct work_struct *work) goto unlock; mptcp_check_data_fin_ack(sk); - mptcp_flush_join_list(msk); mptcp_check_fastclose(msk); - if (msk->pm.status) - mptcp_pm_nl_work(msk); + mptcp_pm_nl_work(msk); if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); @@ -2427,11 +2643,15 @@ static void mptcp_worker(struct work_struct *work) * closed, but we need the msk around to reply to incoming DATA_FIN, * even if it is orphaned and in FIN_WAIT2 state */ - if (sock_flag(sk, SOCK_DEAD) && - (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { - inet_sk_state_store(sk, TCP_CLOSE); - __mptcp_destroy_sock(sk); - goto unlock; + if (sock_flag(sk, SOCK_DEAD)) { + if (mptcp_check_close_timeout(sk)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + goto unlock; + } } if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2440,6 +2660,10 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); + fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; + if (fail_tout && time_after(jiffies, fail_tout)) + mptcp_mp_fail_no_response(msk); + unlock: release_sock(sk); sock_put(sk); @@ -2449,8 +2673,6 @@ static int __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - spin_lock_init(&msk->join_list_lock); - INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); @@ -2465,6 +2687,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; mptcp_pm_data_init(msk); @@ -2476,9 +2699,20 @@ static int __mptcp_init_sock(struct sock *sk) return 0; } -static int mptcp_init_sock(struct sock *sk) +static void mptcp_ca_reset(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_assign_congestion_control(sk); + strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); + + /* no need to keep a reference to the ops, the name will suffice */ + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = NULL; +} + +static int mptcp_init_sock(struct sock *sk) +{ struct net *net = sock_net(sk); int ret; @@ -2499,16 +2733,11 @@ static int mptcp_init_sock(struct sock *sk) /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ - tcp_assign_congestion_control(sk); - strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); - - /* no need to keep a reference to the ops, the name will suffice */ - tcp_cleanup_congestion_control(sk); - icsk->icsk_ca_ops = NULL; + mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); return 0; } @@ -2523,7 +2752,7 @@ static void __mptcp_clear_xmit(struct sock *sk) dfrag_clear(sk, dfrag); } -static void mptcp_cancel_work(struct sock *sk) +void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -2609,6 +2838,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk) * state now */ if (__mptcp_check_fallback(msk)) { + WRITE_ONCE(msk->snd_una, msk->write_seq); if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_close_wake_up(sk); @@ -2617,7 +2847,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk) } } - mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2642,31 +2871,16 @@ static void __mptcp_wr_shutdown(struct sock *sk) static void __mptcp_destroy_sock(struct sock *sk) { - struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); - LIST_HEAD(conn_list); pr_debug("msk=%p", msk); might_sleep(); - /* be sure to always acquire the join list lock, to sync vs - * mptcp_finish_join(). - */ - spin_lock_bh(&msk->join_list_lock); - list_splice_tail_init(&msk->join_list, &msk->conn_list); - spin_unlock_bh(&msk->join_list_lock); - list_splice_init(&msk->conn_list, &conn_list); - - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; - list_for_each_entry_safe(subflow, tmp, &conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - __mptcp_close_ssk(sk, ssk, subflow); - } - sk->sk_prot->destroy(sk); WARN_ON_ONCE(msk->rmem_fwd_alloc); @@ -2675,16 +2889,27 @@ static void __mptcp_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); - mptcp_dispose_initial_subflow(msk); sock_put(sk); } -static void mptcp_close(struct sock *sk, long timeout) +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; +} + +bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; - lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { @@ -2692,18 +2917,29 @@ static void mptcp_close(struct sock *sk, long timeout) goto cleanup; } - if (mptcp_close_state(sk)) + if (mptcp_check_readable(msk)) { + /* the msk has read data, do the MPTCP equivalent of TCP reset */ + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); + } sk_stream_wait_close(sk, timeout); cleanup: /* orphan all the subflows */ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; - mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + /* since the close timeout takes precedence on the fail one, + * cancel the latter + */ + if (ssk == msk->first) + subflow->fail_tout = 0; + sock_orphan(ssk); unlock_sock_fast(ssk, slow); } @@ -2711,23 +2947,34 @@ cleanup: sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); + if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { - sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + mptcp_reset_timeout(msk, 0); } + + return do_cancel_work; +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + bool do_cancel_work; + + lock_sock(sk); + + do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); - if (mptcp_sk(sk)->token) - mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); - sock_put(sk); } -static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); @@ -2752,18 +2999,37 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) static int mptcp_disconnect(struct sock *sk, int flags) { - struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_do_flush_join_list(msk); + inet_sk_state_store(sk, TCP_CLOSE); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_stop_timer(sk); + sk_stop_timer(sk, &sk->sk_timer); - lock_sock(ssk); - tcp_disconnect(ssk, flags); - release_sock(ssk); - } + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + + /* msk->subflow is still intact, the following will not free the first + * subflow + */ + mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); + msk->last_snd = NULL; + WRITE_ONCE(msk->flags, 0); + msk->cb_flags = 0; + msk->push_pending = 0; + msk->recovery = false; + msk->can_ack = false; + msk->fully_established = false; + msk->rcv_data_fin = false; + msk->snd_data_fin_enable = false; + msk->rcv_fastclose = false; + msk->use_64bit_ack = false; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + mptcp_pm_data_reset(msk); + mptcp_ca_reset(sk); + + sk->sk_shutdown = 0; + sk_error_report(sk); return 0; } @@ -2815,7 +3081,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -2879,7 +3145,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, */ if (WARN_ON_ONCE(!new_mptcp_sock)) { tcp_sk(newsk)->is_mptcp = 0; - return newsk; + goto out; } /* acquire the 2nd reference for the owning socket */ @@ -2891,19 +3157,28 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } +out: + newsk->sk_kern_sock = kern; return newsk; } -void mptcp_destroy_common(struct mptcp_sock *msk) +void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { + struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); + /* join list will be eventually flushed (with rst) at sock lock release time */ + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ + mptcp_data_lock(sk); skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); + mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it @@ -2912,13 +3187,18 @@ void mptcp_destroy_common(struct mptcp_sock *msk) msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); + mptcp_free_local_addr_list(msk); } static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_destroy_common(msk); + /* clears msk->subflow, allowing the following to close + * even the initial subflow + */ + mptcp_dispose_initial_subflow(msk); + mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } @@ -2927,7 +3207,7 @@ void __mptcp_data_acked(struct sock *sk) if (!sock_owned_by_user(sk)) __mptcp_clean_una(sk); else - set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); if (mptcp_pending_data_fin_ack(sk)) mptcp_schedule_work(sk); @@ -2946,20 +3226,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) else if (xmit_ssk) mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND); } else { - set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); } } +#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ + BIT(MPTCP_RETRANSMIT) | \ + BIT(MPTCP_FLUSH_JOIN_LIST)) + /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) + __must_hold(&sk->sk_lock.slock) { - for (;;) { - unsigned long flags = 0; + struct mptcp_sock *msk = mptcp_sk(sk); - if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) - flags |= BIT(MPTCP_PUSH_PENDING); - if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags)) - flags |= BIT(MPTCP_RETRANSMIT); + for (;;) { + unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | + msk->push_pending; if (!flags) break; @@ -2970,8 +3253,11 @@ static void mptcp_release_cb(struct sock *sk) * datapath acquires the msk socket spinlock while helding * the subflow socket lock */ - + msk->push_pending = 0; + msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); + if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) + __mptcp_flush_join_list(sk); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) @@ -2981,15 +3267,19 @@ static void mptcp_release_cb(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); } - /* be sure to set the current sk state before tacking actions - * depending on sk_state - */ - if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) - __mptcp_set_connected(sk); - if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) + if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); - if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) - __mptcp_error_report(sk); + if (unlikely(&msk->cb_flags)) { + /* be sure to set the current sk state before tacking actions + * depending on sk_state, that is processing MPTCP_ERROR_REPORT + */ + if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) + __mptcp_set_connected(sk); + if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) + __mptcp_error_report(sk); + if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) + msk->last_snd = NULL; + } __mptcp_update_rmem(sk); } @@ -3030,7 +3320,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk) if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk); else - set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND); } @@ -3093,9 +3383,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); mptcp_pm_new_connection(msk, ssk, 0); @@ -3116,8 +3406,7 @@ bool mptcp_finish_join(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; - struct socket *parent_sock; - bool ret; + bool ret = true; pr_debug("msk=%p, subflow=%p", msk, subflow); @@ -3127,38 +3416,39 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - if (!msk->pm.server_side) + if (!list_empty(&subflow->node)) goto out; - if (!mptcp_pm_allow_new_subflow(msk)) { - subflow->reset_reason = MPTCP_RST_EPROHIBIT; - return false; - } + if (!mptcp_pm_allow_new_subflow(msk)) + goto err_prohibited; - /* active connections are already on conn_list, and we can't acquire - * msk lock here. - * use the join list lock as synchronization point and double-check - * msk status to avoid racing with __mptcp_destroy_sock() + /* active connections are already on conn_list. + * If we can't acquire msk socket lock here, let the release callback + * handle it */ - spin_lock_bh(&msk->join_list_lock); - ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; - if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { - list_add_tail(&subflow->node, &msk->join_list); + mptcp_data_lock(parent); + if (!sock_owned_by_user(parent)) { + ret = __mptcp_finish_join(msk, ssk); + if (ret) { + sock_hold(ssk); + list_add_tail(&subflow->node, &msk->conn_list); + } + } else { sock_hold(ssk); + list_add_tail(&subflow->node, &msk->join_list); + __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); } - spin_unlock_bh(&msk->join_list_lock); + mptcp_data_unlock(parent); + if (!ret) { +err_prohibited: subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; } - /* attach to msk socket only after we are sure he will deal with us - * at close time - */ - parent_sock = READ_ONCE(parent->sk_socket); - if (parent_sock && !ssk->sk_socket) - mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); + WRITE_ONCE(msk->allow_infinite_fallback, false); + out: mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; @@ -3177,10 +3467,135 @@ static int mptcp_forward_alloc_get(const struct sock *sk) return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc; } +static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) +{ + const struct sock *sk = (void *)msk; + u64 delta; + + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + return 0; + + delta = msk->write_seq - v; + if (__mptcp_check_fallback(msk) && msk->first) { + struct tcp_sock *tp = tcp_sk(msk->first); + + /* the first subflow is disconnected after close - see + * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq + * so ignore that status, too. + */ + if (!((1 << msk->first->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) + delta += READ_ONCE(tp->write_seq) - tp->snd_una; + } + if (delta > INT_MAX) + delta = INT_MAX; + + return (int)delta; +} + +static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + bool slow; + int answ; + + switch (cmd) { + case SIOCINQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + lock_sock(sk); + __mptcp_move_skbs(msk); + answ = mptcp_inq_hint(sk); + release_sock(sk); + break; + case SIOCOUTQ: + slow = lock_sock_fast(sk); + answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); + unlock_sock_fast(sk, slow); + break; + case SIOCOUTQNSD: + slow = lock_sock_fast(sk); + answ = mptcp_ioctl_outq(msk, msk->snd_nxt); + unlock_sock_fast(sk, slow); + break; + default: + return -ENOIOCTLCMD; + } + + return put_user(answ, (int __user *)arg); +} + +static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} + +static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + int err = -EINVAL; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + return -EINVAL; + + mptcp_token_destroy(msk); + inet_sk_state_store(sk, TCP_SYN_SENT); + subflow = mptcp_subflow_ctx(ssock->sk); +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_early_fallback(msk, subflow); +#endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { + MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); + mptcp_subflow_early_fallback(msk, subflow); + } + if (likely(!__mptcp_check_fallback(msk))) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); + + /* if reaching here via the fastopen/sendmsg path, the caller already + * acquired the subflow socket lock, too. + */ + if (msk->is_sendmsg) + err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); + else + err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); + inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; + + /* on successful connect, the msk state will be moved to established by + * subflow_finish_connect() + */ + if (unlikely(err && err != -EINPROGRESS)) { + inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + return err; + } + + mptcp_copy_inaddrs(sk, ssock->sk); + + /* unblocking connect, mptcp-level inet_stream_connect will error out + * without changing the socket state, update it here. + */ + if (err == -EINPROGRESS) + sk->sk_socket->state = ssock->state; + return err; +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, + .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .accept = mptcp_accept, @@ -3189,6 +3604,7 @@ static struct proto mptcp_prot = { .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, + .ioctl = mptcp_ioctl, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, .hash = mptcp_hash, @@ -3196,7 +3612,10 @@ static struct proto mptcp_prot = { .get_port = mptcp_get_port, .forward_alloc_get = mptcp_forward_alloc_get, .sockets_allocated = &mptcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, + .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), @@ -3228,68 +3647,16 @@ unlock: return err; } -static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) -{ - subflow->request_mptcp = 0; - __mptcp_do_fallback(msk); -} - static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { - struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct mptcp_subflow_context *subflow; - struct socket *ssock; - int err; + int ret; lock_sock(sock->sk); - if (sock->state != SS_UNCONNECTED && msk->subflow) { - /* pending connection or invalid state, let existing subflow - * cope with that - */ - ssock = msk->subflow; - goto do_connect; - } - - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - err = -EINVAL; - goto unlock; - } - - mptcp_token_destroy(msk); - inet_sk_state_store(sock->sk, TCP_SYN_SENT); - subflow = mptcp_subflow_ctx(ssock->sk); -#ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_early_fallback(msk, subflow); -#endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { - MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); - mptcp_subflow_early_fallback(msk, subflow); - } - if (likely(!__mptcp_check_fallback(msk))) - MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); - -do_connect: - err = ssock->ops->connect(ssock, uaddr, addr_len, flags); - sock->state = ssock->state; - - /* on successful connect, the msk state will be moved to established by - * subflow_finish_connect() - */ - if (!err || err == -EINPROGRESS) - mptcp_copy_inaddrs(sock->sk, ssock->sk); - else - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - -unlock: + mptcp_sk(sock->sk)->connect_flags = flags; + ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); - return err; + return ret; } static int mptcp_listen(struct socket *sock, int backlog) @@ -3330,17 +3697,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, pr_debug("msk=%p", msk); - lock_sock(sock->sk); - if (sock->sk->sk_state != TCP_LISTEN) - goto unlock_fail; - ssock = __mptcp_nmpc_socket(msk); if (!ssock) - goto unlock_fail; - - clear_bit(MPTCP_DATA_READY, &msk->flags); - sock_hold(ssock->sk); - release_sock(sock->sk); + return -EINVAL; err = ssock->ops->accept(sock, newsock, flags, kern); if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { @@ -3363,14 +3722,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (mptcp_is_fully_established(newsk)) mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); - mptcp_copy_inaddrs(newsk, msk->first); mptcp_rcv_space_init(msk, msk->first); mptcp_propagate_sndbuf(newsk, msk->first); /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ - mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -3380,26 +3737,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, release_sock(newsk); } - if (inet_csk_listen_poll(ssock->sk)) - set_bit(MPTCP_DATA_READY, &msk->flags); - sock_put(ssock->sk); return err; - -unlock_fail: - release_sock(sock->sk); - return -EINVAL; -} - -static __poll_t mptcp_check_readable(struct mptcp_sock *msk) -{ - /* Concurrent splices from sk_receive_queue into receive_queue will - * always show at least one non-empty queue when checked in this order. - */ - if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && - skb_queue_empty_lockless(&msk->receive_queue)) - return 0; - - return EPOLLIN | EPOLLRDNORM; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) @@ -3433,19 +3771,26 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); - if (state == TCP_LISTEN) - return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0; + if (state == TCP_LISTEN) { + if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk)) + return 0; + + return inet_csk_listen_poll(msk->subflow->sk); + } if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); + } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + /* cf tcp_poll() note about TFO */ + mask |= EPOLLOUT | EPOLLWRNORM; } if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (sk->sk_err) mask |= EPOLLERR; @@ -3530,8 +3875,8 @@ void __init mptcp_proto_init(void) for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); - netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, + mptcp_napi_poll); napi_enable(&delegated->napi); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d87cc040352e..6a09ab99a12d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -11,6 +11,7 @@ #include <net/tcp.h> #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> +#include <net/genetlink.h> #define MPTCP_SUPPORTED_VERSION 1 @@ -82,7 +83,6 @@ /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) -#define MPTCPOPT_HMAC_LEN 20 #define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ @@ -110,19 +110,21 @@ /* MPTCP TCPRST flags */ #define MPTCP_RST_TRANSIENT BIT(0) -/* MPTCP socket flags */ -#define MPTCP_DATA_READY 0 +/* MPTCP socket atomic flags */ #define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 -#define MPTCP_PUSH_PENDING 6 -#define MPTCP_CLEAN_UNA 7 -#define MPTCP_ERROR_REPORT 8 -#define MPTCP_RETRANSMIT 9 -#define MPTCP_WORK_SYNC_SETSOCKOPT 10 -#define MPTCP_CONNECTED 11 + +/* MPTCP socket release cb flags */ +#define MPTCP_PUSH_PENDING 1 +#define MPTCP_CLEAN_UNA 2 +#define MPTCP_ERROR_REPORT 3 +#define MPTCP_RETRANSMIT 4 +#define MPTCP_FLUSH_JOIN_LIST 5 +#define MPTCP_CONNECTED 6 +#define MPTCP_RESET_SCHEDULER 7 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -174,20 +176,38 @@ enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, - MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_SUBFLOW_ESTABLISHED, + MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ + MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is + * accounted int id_avail_bitmap + */ +}; + +enum mptcp_pm_type { + MPTCP_PM_TYPE_KERNEL = 0, + MPTCP_PM_TYPE_USERSPACE, + + __MPTCP_PM_TYPE_NR, + __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1, }; +/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */ +#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1) + enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, MPTCP_RM_ADDR_SIGNAL, }; +/* max value of mptcp_addr_info.id */ +#define MPTCP_PM_MAX_ADDR_ID U8_MAX + struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; struct list_head anno_list; + struct list_head userspace_pm_local_addr_list; spinlock_t lock; /*protects the whole PM data */ @@ -200,12 +220,22 @@ struct mptcp_pm_data { u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; + u8 pm_type; u8 subflows; u8 status; + DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_rx; }; +struct mptcp_pm_addr_entry { + struct list_head list; + struct mptcp_addr_info addr; + u8 flags; + int ifindex; + struct socket *lsk; +}; + struct mptcp_data_frag { struct list_head list; u64 data_seq; @@ -225,7 +255,7 @@ struct mptcp_sock { u64 write_seq; u64 snd_nxt; u64 ack_seq; - u64 rcv_wnd_sent; + atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; int rmem_fwd_alloc; struct sock *last_snd; @@ -241,6 +271,8 @@ struct mptcp_sock { u32 token; int rmem_released; unsigned long flags; + unsigned long cb_flags; + unsigned long push_pending; bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; @@ -249,7 +281,13 @@ struct mptcp_sock { bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; - spinlock_t join_list_lock; + bool allow_infinite_fallback; + u8 mpc_endpoint_id; + u8 recvmsg_inq:1, + cork:1, + nodelay:1, + is_sendmsg:1; + int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -270,6 +308,7 @@ struct mptcp_sock { u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; + struct mptcp_sock *dl_next; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) @@ -277,6 +316,8 @@ struct mptcp_sock { #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) +#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ + list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) static inline void msk_owned_by_me(const struct mptcp_sock *msk) { @@ -392,6 +433,10 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ + + struct_group(reset, + + unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; u64 remote_key; u64 idsn; @@ -419,10 +464,14 @@ struct mptcp_subflow_context { backup : 1, send_mp_prio : 1, send_mp_fail : 1, + send_fastclose : 1, + send_infinite_map : 1, rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ - stale : 1; /* unable to snd/rcv data, do not use for xmit */ + stale : 1, /* unable to snd/rcv data, do not use for xmit */ + local_id_valid : 1, /* local_id is correctly initialized */ + valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -437,6 +486,10 @@ struct mptcp_subflow_context { u8 stale_count; long delegated_status; + unsigned long fail_tout; + + ); + struct list_head delegated_node; /* link into delegated_action, protected by local BH */ u32 setsockopt_seq; @@ -445,9 +498,7 @@ struct mptcp_subflow_context { struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; - void (*tcp_data_ready)(struct sock *sk); void (*tcp_state_change)(struct sock *sk); - void (*tcp_write_space)(struct sock *sk); void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; @@ -468,6 +519,13 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) return subflow->tcp_sock; } +static inline void +mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) +{ + memset(&subflow->reset, 0, sizeof(subflow->reset)); + subflow->request_mptcp = 1; +} + static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { @@ -482,15 +540,6 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } -static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) -{ - sock_hold(mptcp_subflow_tcp_sock(subflow)); - spin_lock_bh(&msk->join_list_lock); - list_add_tail(&subflow->node, &msk->join_list); - spin_unlock_bh(&msk->join_list_lock); -} - void mptcp_subflow_process_delegated(struct sock *ssk); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) @@ -551,19 +600,28 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); +int mptcp_get_pm_type(const struct net *net); +void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); +void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); -void mptcp_subflow_send_ack(struct sock *ssk); +void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); +bool __mptcp_close(struct sock *sk, long timeout); +void mptcp_cancel_work(struct sock *sk); + +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, @@ -573,16 +631,19 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); -static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) +static inline bool __tcp_can_send(const struct sock *ssk) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + /* only send if our side has not closed yet */ + return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} +static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ if (subflow->request_join && !subflow->fully_established) return false; - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); + return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); } void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); @@ -592,27 +653,14 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { - sk->sk_data_ready = ctx->tcp_data_ready; + sk->sk_data_ready = sock_def_readable; sk->sk_state_change = ctx->tcp_state_change; - sk->sk_write_space = ctx->tcp_write_space; + sk->sk_write_space = sk_stream_write_space; sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } -static inline bool mptcp_has_another_subflow(struct sock *ssk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk), *tmp; - struct mptcp_sock *msk = mptcp_sk(subflow->conn); - - mptcp_for_each_subflow(msk, tmp) { - if (tmp != subflow) - return true; - } - - return false; -} - void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); @@ -621,12 +669,12 @@ int __init mptcp_proto_v6_init(void); struct sock *mptcp_sk_clone(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct request_sock *req); -void mptcp_get_options(const struct sock *sk, - const struct sk_buff *skb, +void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); void __mptcp_set_connected(struct sock *sk); +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && @@ -654,7 +702,6 @@ void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); -void __mptcp_flush_join_list(struct mptcp_sock *msk); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { return READ_ONCE(msk->snd_data_fin_enable) && @@ -680,7 +727,7 @@ static inline void mptcp_write_space(struct sock *sk) } } -void mptcp_destroy_common(struct mptcp_sock *msk); +void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); #define MPTCP_TOKEN_MAX_RETRIES 4 @@ -704,9 +751,16 @@ void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); +__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); +void mptcp_pm_data_reset(struct mptcp_sock *msk); +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr); +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); @@ -714,38 +768,63 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk); -void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); -void mptcp_pm_add_addr_received(struct mptcp_sock *msk, +bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk); +void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, + const struct mptcp_subflow_context *subflow); +void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, - struct mptcp_addr_info *addr); + const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); +int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup); +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, bool check_id); + const struct mptcp_addr_info *addr, bool check_id); struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr); -int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, +mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); +int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, u8 *flags, int *ifindex); - +int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, + u8 *flags, int *ifindex); +int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, + struct mptcp_pm_addr_entry *loc, + struct mptcp_pm_addr_entry *rem, u8 bkup); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list); + +int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry); +void mptcp_free_local_addr_list(struct mptcp_sock *msk); +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); -void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info); +void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); +bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { @@ -768,6 +847,16 @@ static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } +static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE; +} + +static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL; +} + static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; @@ -791,27 +880,41 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, - bool *port, bool *drop_other_suboptions); + bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); void __init mptcp_pm_nl_init(void); -void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); +unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); + +/* called under PM lock */ +static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) +{ + if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + WRITE_ONCE(msk->pm.accept_subflow, true); +} + +static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) +{ + spin_lock_bh(&msk->pm.lock); + __mptcp_pm_close_subflow(msk); + spin_unlock_bh(&msk->pm.lock); +} void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); -void mptcp_sockopt_sync_all(struct mptcp_sock *msk); +void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { @@ -842,23 +945,51 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } -static inline void mptcp_do_fallback(struct sock *sk) +static inline void mptcp_do_fallback(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + struct mptcp_sock *msk; + msk = mptcp_sk(sk); __mptcp_do_fallback(msk); + if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { + gfp_t saved_allocation = ssk->sk_allocation; + + /* we are in a atomic (BH) scope, override ssk default for data + * fin allocation + */ + ssk->sk_allocation = GFP_ATOMIC; + ssk->sk_shutdown |= SEND_SHUTDOWN; + tcp_shutdown(ssk, SEND_SHUTDOWN); + ssk->sk_allocation = saved_allocation; + } } #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) +static inline bool mptcp_check_infinite_map(struct sk_buff *skb) +{ + struct mptcp_ext *mpext; + + mpext = skb ? mptcp_get_ext(skb) : NULL; + if (mpext && mpext->infinite_map) + return true; + + return false; +} + +static inline bool is_active_ssk(struct mptcp_subflow_context *subflow) +{ + return (subflow->request_mptcp || subflow->request_join); +} + static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *parent = subflow->conn; return sk->sk_state == TCP_ESTABLISHED && - !mptcp_sk(parent)->pm.server_side && + is_active_ssk(subflow) && !subflow->conn_finished; } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 0f1e661c2032..c7cb68c725b2 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -343,6 +343,8 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_RCVLOWAT: case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: case SO_BUSY_POLL: case SO_PREFER_BUSY_POLL: case SO_BUSY_POLL_BUDGET: @@ -390,6 +392,8 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, switch (optname) { case IPV6_V6ONLY: + case IPV6_TRANSPARENT: + case IPV6_FREEBIND: lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); if (!ssock) { @@ -398,8 +402,24 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, } ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); - if (ret == 0) + if (ret != 0) { + release_sock(sk); + return ret; + } + + sockopt_seq_inc(msk); + + switch (optname) { + case IPV6_V6ONLY: sk->sk_ipv6only = ssock->sk->sk_ipv6only; + break; + case IPV6_TRANSPARENT: + inet_sk(sk)->transparent = inet_sk(ssock->sk)->transparent; + break; + case IPV6_FREEBIND: + inet_sk(sk)->freebind = inet_sk(ssock->sk)->freebind; + break; + } release_sock(sk); break; @@ -525,7 +545,6 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_NODELAY: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_CONGESTION: - case TCP_ULP: case TCP_CORK: case TCP_KEEPIDLE: case TCP_KEEPINTVL: @@ -539,6 +558,8 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_TIMESTAMP: case TCP_NOTSENT_LOWAT: case TCP_TX_DELAY: + case TCP_INQ: + case TCP_FASTOPEN_CONNECT: return true; } @@ -547,10 +568,9 @@ static bool mptcp_supported_sockopt(int level, int optname) /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ - /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE, + /* TCP_FASTOPEN_KEY, TCP_FASTOPEN, TCP_FASTOPEN_NO_COOKIE, * are not supported fastopen is currently unsupported */ - /* TCP_INQ is currently unsupported, needs some recvmsg work */ } return false; } @@ -598,14 +618,200 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t return ret; } +static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + lock_sock(sk); + sockopt_seq_inc(msk); + msk->cork = !!val; + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + __tcp_sock_set_cork(ssk, !!val); + release_sock(ssk); + } + if (!val) + mptcp_check_and_set_pending(sk); + release_sock(sk); + + return 0; +} + +static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + lock_sock(sk); + sockopt_seq_inc(msk); + msk->nodelay = !!val; + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + __tcp_sock_set_nodelay(ssk, !!val); + release_sock(ssk); + } + if (val) + mptcp_check_and_set_pending(sk); + release_sock(sk); + + return 0; +} + +static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + struct inet_sock *issk; + struct socket *ssock; + int err; + + err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); + if (err != 0) + return err; + + lock_sock(sk); + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + issk = inet_sk(ssock->sk); + + switch (optname) { + case IP_FREEBIND: + issk->freebind = inet_sk(sk)->freebind; + break; + case IP_TRANSPARENT: + issk->transparent = inet_sk(sk)->transparent; + break; + default: + release_sock(sk); + WARN_ON_ONCE(1); + return -EOPNOTSUPP; + } + + sockopt_seq_inc(msk); + release_sock(sk); + return 0; +} + +static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int err, val; + + err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen); + + if (err != 0) + return err; + + lock_sock(sk); + sockopt_seq_inc(msk); + val = inet_sk(sk)->tos; + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + __ip_sock_set_tos(ssk, val); + } + release_sock(sk); + + return err; +} + +static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + switch (optname) { + case IP_FREEBIND: + case IP_TRANSPARENT: + return mptcp_setsockopt_sol_ip_set_transparent(msk, optname, optval, optlen); + case IP_TOS: + return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); + } + + return -EOPNOTSUPP; +} + +static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct socket *listener; + + listener = __mptcp_nmpc_socket(msk); + if (!listener) + return 0; /* TCP_DEFER_ACCEPT does not fail */ + + return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen); +} + +static int mptcp_setsockopt_sol_tcp_fastopen_connect(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct socket *sock; + + /* Limit to first subflow */ + sock = __mptcp_nmpc_socket(msk); + if (!sock) + return -EINVAL; + + return tcp_setsockopt(sock->sk, SOL_TCP, TCP_FASTOPEN_CONNECT, optval, optlen); +} + static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { + struct sock *sk = (void *)msk; + int ret, val; + switch (optname) { + case TCP_INQ: + ret = mptcp_get_int_option(msk, optval, optlen, &val); + if (ret) + return ret; + if (val < 0 || val > 1) + return -EINVAL; + + lock_sock(sk); + msk->recvmsg_inq = !!val; + release_sock(sk); + return 0; case TCP_ULP: return -EOPNOTSUPP; case TCP_CONGESTION: return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); + case TCP_CORK: + return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen); + case TCP_NODELAY: + return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen); + case TCP_DEFER_ACCEPT: + return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen); + case TCP_FASTOPEN_CONNECT: + return mptcp_setsockopt_sol_tcp_fastopen_connect(msk, optval, optlen); } return -EOPNOTSUPP; @@ -637,6 +843,9 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, if (ssk) return tcp_setsockopt(ssk, level, optname, optval, optlen); + if (level == SOL_IP) + return mptcp_setsockopt_v4(msk, optname, optval, optlen); + if (level == SOL_IPV6) return mptcp_setsockopt_v6(msk, optname, optval, optlen); @@ -674,15 +883,11 @@ out: void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { - struct sock *sk = &msk->sk.icsk_inet.sk; u32 flags = 0; - bool slow; u8 val; memset(info, 0, sizeof(*info)); - slow = lock_sock_fast(sk); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); @@ -703,8 +908,6 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); - - unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); @@ -932,6 +1135,35 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o return 0; } +static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, + int __user *optlen, int val) +{ + int len; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { + unsigned char ucval = (unsigned char)val; + + len = 1; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &ucval, 1)) + return -EFAULT; + } else { + len = min_t(unsigned int, len, sizeof(int)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + } + + return 0; +} + static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { @@ -940,12 +1172,33 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_CONGESTION: case TCP_INFO: case TCP_CC_INFO: + case TCP_DEFER_ACCEPT: + case TCP_FASTOPEN_CONNECT: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); + case TCP_INQ: + return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq); + case TCP_CORK: + return mptcp_put_int_option(msk, optval, optlen, msk->cork); + case TCP_NODELAY: + return mptcp_put_int_option(msk, optval, optlen, msk->nodelay); } return -EOPNOTSUPP; } +static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = (void *)msk; + + switch (optname) { + case IP_TOS: + return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos); + } + + return -EOPNOTSUPP; +} + static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { @@ -981,6 +1234,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, if (ssk) return tcp_getsockopt(ssk, level, optname, optval, option); + if (level == SOL_IP) + return mptcp_getsockopt_v4(msk, optname, optval, option); if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); if (level == SOL_MPTCP) @@ -1003,6 +1258,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; ssk->sk_incoming_cpu = sk->sk_incoming_cpu; + __ip_sock_set_tos(ssk, inet_sk(sk)->tos); if (sk->sk_userlocks & tx_rx_locks) { ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; @@ -1028,6 +1284,11 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops) tcp_set_congestion_control(ssk, msk->ca_name, false, true); + __tcp_sock_set_cork(ssk, !!msk->cork); + __tcp_sock_set_nodelay(ssk, !!msk->nodelay); + + inet_sk(ssk)->transparent = inet_sk(sk)->transparent; + inet_sk(ssk)->freebind = inet_sk(sk)->freebind; } static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) @@ -1052,27 +1313,15 @@ void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) } } -void mptcp_sockopt_sync_all(struct mptcp_sock *msk) +void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk) { - struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - u32 seq; - - seq = sockopt_seq_reset(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - u32 sseq = READ_ONCE(subflow->setsockopt_seq); + msk_owned_by_me(msk); - if (sseq != msk->setsockopt_seq) { - __mptcp_sockopt_sync(msk, ssk); - WRITE_ONCE(subflow->setsockopt_seq, seq); - } else if (sseq != seq) { - WRITE_ONCE(subflow->setsockopt_seq, seq); - } + if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { + sync_socket_options(msk, ssk); - cond_resched(); + subflow->setsockopt_seq = msk->setsockopt_seq; } - - msk->setsockopt_seq = seq; } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 6172f380dfb7..02a54d59697b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -62,7 +62,9 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) { return mptcp_is_fully_established((void *)msk) && - READ_ONCE(msk->pm.accept_subflow); + ((mptcp_pm_is_userspace(msk) && + mptcp_userspace_pm_active(msk)) || + READ_ONCE(msk->pm.accept_subflow)); } /* validate received token and create truncated hmac and nonce for SYN-ACK */ @@ -153,7 +155,7 @@ static int subflow_check_req(struct request_sock *req, return -EINVAL; #endif - mptcp_get_options(sk_listener, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); @@ -250,7 +252,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, int err; subflow_init_req(req, sk_listener); - mptcp_get_options(sk_listener, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); @@ -344,9 +346,7 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) thmac = get_unaligned_be64(hmac); pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", - subflow, subflow->token, - (unsigned long long)thmac, - (unsigned long long)subflow->thmac); + subflow, subflow->token, thmac, subflow->thmac); return thmac == subflow->thmac; } @@ -388,7 +388,7 @@ static void mptcp_set_connected(struct sock *sk) if (!sock_owned_by_user(sk)) __mptcp_set_connected(sk); else - set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } @@ -410,7 +410,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { MPTCP_INC_STATS(sock_net(sk), @@ -443,6 +443,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; + subflow->remote_id = mp_opt.join_id; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); @@ -483,9 +484,53 @@ do_reset: mptcp_subflow_reset(sk); } +static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) +{ + subflow->local_id = local_id; + subflow->local_id_valid = 1; +} + +static int subflow_chk_local_id(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + int err; + + if (likely(subflow->local_id_valid)) + return 0; + + err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); + if (err < 0) + return err; + + subflow_set_local_id(subflow, err); + return 0; +} + +static int subflow_rebuild_header(struct sock *sk) +{ + int err = subflow_chk_local_id(sk); + + if (unlikely(err < 0)) + return err; + + return inet_sk_rebuild_header(sk); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int subflow_v6_rebuild_header(struct sock *sk) +{ + int err = subflow_chk_local_id(sk); + + if (unlikely(err < 0)) + return err; + + return inet6_sk_rebuild_header(sk); +} +#endif + struct request_sock_ops mptcp_subflow_request_sock_ops; -EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops); -static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops; +static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { @@ -506,9 +551,9 @@ drop: } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops; -static struct inet_connection_sock_af_ops subflow_v6_specific; -static struct inet_connection_sock_af_ops subflow_v6m_specific; +static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; +static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; +static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; static struct proto tcpv6_prot_override; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) @@ -557,29 +602,6 @@ static bool subflow_hmac_valid(const struct request_sock *req, return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } -static void mptcp_sock_destruct(struct sock *sk) -{ - /* if new mptcp socket isn't accepted, it is free'd - * from the tcp listener sockets request queue, linked - * from req->sk. The tcp socket is released. - * This calls the ULP release function which will - * also remove the mptcp socket, via - * sock_put(ctx->conn). - * - * Problem is that the mptcp socket will be in - * ESTABLISHED state and will not have the SOCK_DEAD flag. - * Both result in warnings from inet_sock_destruct. - */ - if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { - sk->sk_state = TCP_CLOSE; - WARN_ON_ONCE(sk->sk_socket); - sock_orphan(sk); - } - - mptcp_destroy_common(mptcp_sk(sk)); - inet_sock_destruct(sk); -} - static void mptcp_force_close(struct sock *sk) { /* the msk is not yet exposed to user-space */ @@ -663,7 +685,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * reordered MPC will cause fallback, but we don't have other * options. */ - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { fallback = true; goto create_child; @@ -673,7 +695,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { - mptcp_get_options(sk, skb, &mp_opt); + mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { @@ -701,6 +723,8 @@ create_child: goto dispose_child; } + if (new_msk) + mptcp_copy_inaddrs(new_msk, child); subflow_drop_ctx(child); goto out; } @@ -722,13 +746,17 @@ create_child: /* new mpc subflow takes ownership of the newly * created mptcp socket */ - new_msk->sk_destruct = mptcp_sock_destruct; mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; + /* set msk addresses early to ensure mptcp_pm_get_local_id() + * uses the correct data + */ + mptcp_copy_inaddrs(ctx->conn, child); + /* with OoO packets we can reach here without ingress * mpc option */ @@ -790,7 +818,7 @@ dispose_child: return child; } -static struct inet_connection_sock_af_ops subflow_specific; +static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override; enum mapping_status { @@ -798,7 +826,8 @@ enum mapping_status { MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, - MAPPING_DUMMY + MAPPING_DUMMY, + MAPPING_BAD_CSUM }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) @@ -845,9 +874,8 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * bool csum_reqd) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct csum_pseudo_header header; u32 offset, seq, delta; - __wsum csum; + __sum16 csum; int len; if (!csum_reqd) @@ -908,19 +936,16 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * * while the pseudo header requires the original DSS data len, * including that */ - header.data_seq = cpu_to_be64(subflow->map_seq); - header.subflow_seq = htonl(subflow->map_subflow_seq); - header.data_len = htons(subflow->map_data_len + subflow->map_data_fin); - header.csum = 0; - - csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); - if (unlikely(csum_fold(csum))) { + csum = __mptcp_make_csum(subflow->map_seq, + subflow->map_subflow_seq, + subflow->map_data_len + subflow->map_data_fin, + subflow->map_data_csum); + if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); - subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); - return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + return MAPPING_BAD_CSUM; } + subflow->valid_csum_seen = 1; return MAPPING_OK; } @@ -967,7 +992,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len = mpext->data_len; if (data_len == 0) { + pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); + subflow->map_data_len = 0; return MAPPING_INVALID; } @@ -1102,6 +1129,45 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss } } +static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) +{ + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + if (subflow->mp_join) + return false; + else if (READ_ONCE(msk->csum_enabled)) + return !subflow->valid_csum_seen; + else + return !subflow->fully_established; +} + +static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned long fail_tout; + + /* greceful failure can happen only on the MPC subflow */ + if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) + return; + + /* since the close timeout take precedence on the fail one, + * no need to start the latter when the first is already set + */ + if (sock_flag((struct sock *)msk, SOCK_DEAD)) + return; + + /* we don't need extreme accuracy here, use a zero fail_tout as special + * value meaning no fail timeout at all; + */ + fail_tout = jiffies + TCP_RTO_MAX; + if (!fail_tout) + fail_tout = 1; + WRITE_ONCE(subflow->fail_tout, fail_tout); + tcp_send_ack(ssk); + + mptcp_reset_timeout(msk, subflow->fail_tout); +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1110,7 +1176,7 @@ static bool subflow_check_data_avail(struct sock *ssk) struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); if (subflow->data_avail) return true; @@ -1121,10 +1187,8 @@ static bool subflow_check_data_avail(struct sock *ssk) status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); - if (unlikely(status == MAPPING_INVALID)) - goto fallback; - - if (unlikely(status == MAPPING_DUMMY)) + if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || + status == MAPPING_BAD_CSUM)) goto fallback; if (status != MAPPING_OK) @@ -1164,35 +1228,42 @@ no_data: return false; fallback: - /* RFC 8684 section 3.7. */ - if (subflow->send_mp_fail) { - if (mptcp_has_another_subflow(ssk)) { + if (!__mptcp_check_fallback(msk)) { + /* RFC 8684 section 3.7. */ + if (status == MAPPING_BAD_CSUM && + (subflow->mp_join || subflow->valid_csum_seen)) { + subflow->send_mp_fail = 1; + + if (!READ_ONCE(msk->allow_infinite_fallback)) { + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; + goto reset; + } + mptcp_subflow_fail(msk, ssk); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); + return true; + } + + if (!subflow_can_fallback(subflow) && subflow->map_data_len) { + /* fatal protocol error, close the socket. + * subflow_error_report() will introduce the appropriate barriers + */ + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; + +reset: + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); + tcp_send_active_reset(ssk, GFP_ATOMIC); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + return false; } - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); - subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; - tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, 0); - return true; - } - if (subflow->mp_join || subflow->fully_established) { - /* fatal protocol error, close the socket. - * subflow_error_report() will introduce the appropriate barriers - */ - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); - subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMPTCP; - tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, 0); - return false; + mptcp_do_fallback(ssk); } - __mptcp_do_fallback(msk); skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_seq = READ_ONCE(msk->ack_seq); @@ -1210,7 +1281,7 @@ bool mptcp_subflow_data_available(struct sock *sk) if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; - WRITE_ONCE(subflow->data_avail, 0); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); pr_debug("Done with mapping: seq=%u data_len=%u", subflow->map_subflow_seq, @@ -1274,7 +1345,7 @@ static void subflow_error_report(struct sock *ssk) if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else - set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } @@ -1293,7 +1364,6 @@ static void subflow_data_ready(struct sock *sk) if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) return; - set_bit(MPTCP_DATA_READY, &msk->flags); parent->sk_data_ready(parent); return; } @@ -1315,7 +1385,7 @@ static void subflow_write_space(struct sock *ssk) mptcp_write_space(sk); } -static struct inet_connection_sock_af_ops * +static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1330,7 +1400,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_connection_sock_af_ops *target; + const struct inet_connection_sock_af_ops *target; target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); @@ -1384,20 +1454,20 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, struct sockaddr_storage addr; int remote_id = remote->id; int local_id = loc->id; + int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; int ifindex; u8 flags; - int err; if (!mptcp_is_fully_established(sk)) - return -ENOTCONN; + goto err_out; err = mptcp_subflow_create_socket(sk, &sf); if (err) - return err; + goto err_out; ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); @@ -1405,15 +1475,10 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); - if (!local_id) { - err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk); - if (err < 0) - goto failed; + if (local_id) + subflow_set_local_id(subflow, local_id); - local_id = err; - } - - mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, + mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; @@ -1425,6 +1490,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif + mptcp_sockopt_sync(msk, ssk); + ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) @@ -1434,14 +1501,13 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; - subflow->local_id = local_id; subflow->remote_id = remote_id; subflow->request_join = 1; subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr, ssk->sk_family); - mptcp_add_pending_subflow(msk, subflow); - mptcp_sockopt_sync(msk, ssk); + sock_hold(ssk); + list_add_tail(&subflow->node, &msk->conn_list); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink; @@ -1449,17 +1515,22 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); - return err; + WRITE_ONCE(msk->allow_infinite_fallback, false); + return 0; failed_unlink: - spin_lock_bh(&msk->join_list_lock); list_del(&subflow->node); - spin_unlock_bh(&msk->join_list_lock); sock_put(mptcp_subflow_tcp_sock(subflow)); failed: subflow->disposable = 1; sock_release(sf); + +err_out: + /* we account subflows before the creation, and this failures will not + * be caught by sk_state_change() + */ + mptcp_pm_close_subflow(msk); return err; } @@ -1533,10 +1604,8 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) * needs it. */ sf->sk->sk_net_refcnt = 1; - get_net(net); -#ifdef CONFIG_PROC_FS - this_cpu_add(*net->core.sock_inuse, 1); -#endif + get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); err = tcp_set_ulp(sf->sk, "mptcp"); release_sock(sf->sk); @@ -1548,7 +1617,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) /* the newly created socket really belongs to the owning MPTCP master * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the - * procfs/diag interaces really show this one belonging to the correct + * procfs/diag interfaces really show this one belonging to the correct * user. */ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; @@ -1637,6 +1706,64 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + struct mptcp_sock *msk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + msk = mptcp_sk(subflow->conn); + if (msk->dl_next || msk == head) + continue; + + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + struct sock *sk = (struct sock *)msk; + bool slow, do_cancel_work; + + sock_hold(sk); + slow = lock_sock_fast_nested(sk); + next = msk->dl_next; + msk->first = NULL; + msk->dl_next = NULL; + + do_cancel_work = __mptcp_close(sk, 0); + unlock_sock_fast(sk, slow); + if (do_cancel_work) + mptcp_cancel_work(sk); + sock_put(sk); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -1663,10 +1790,12 @@ static int subflow_ulp_init(struct sock *sk) tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); - ctx->tcp_data_ready = sk->sk_data_ready; ctx->tcp_state_change = sk->sk_state_change; - ctx->tcp_write_space = sk->sk_write_space; ctx->tcp_error_report = sk->sk_error_report; + + WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); + WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); + sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; @@ -1721,9 +1850,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; - new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; new_ctx->tcp_state_change = old_ctx->tcp_state_change; - new_ctx->tcp_write_space = old_ctx->tcp_write_space; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; @@ -1737,15 +1864,22 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; + + /* this is the first subflow, id is always 0 */ + new_ctx->local_id_valid = 1; } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; new_ctx->fully_established = 1; new_ctx->backup = subflow_req->backup; - new_ctx->local_id = subflow_req->local_id; new_ctx->remote_id = subflow_req->remote_id; new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; + + /* the subflow req id is valid, fetched via subflow_check_req() + * and subflow_token_join_request() + */ + subflow_set_local_id(new_ctx, subflow_req->local_id); } } @@ -1798,6 +1932,7 @@ void __init mptcp_subflow_init(void) subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_specific.rebuild_header = subflow_rebuild_header; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; @@ -1810,6 +1945,7 @@ void __init mptcp_subflow_init(void) subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; + subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; @@ -1817,6 +1953,7 @@ void __init mptcp_subflow_init(void) subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.net_frag_header_len = 0; + subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index e581b341c5be..f52ee7b26aed 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -384,6 +384,7 @@ void mptcp_token_destroy(struct mptcp_sock *msk) bucket->chain_len--; } spin_unlock_bh(&bucket->lock); + WRITE_ONCE(msk->token, 0); } void __init mptcp_token_init(void) |