aboutsummaryrefslogtreecommitdiffstats
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/Makefile4
-rw-r--r--net/mptcp/bpf.c21
-rw-r--r--net/mptcp/ctrl.c21
-rw-r--r--net/mptcp/mib.c11
-rw-r--r--net/mptcp/mib.h13
-rw-r--r--net/mptcp/mptcp_diag.c108
-rw-r--r--net/mptcp/options.c187
-rw-r--r--net/mptcp/pm.c128
-rw-r--r--net/mptcp/pm_netlink.c644
-rw-r--r--net/mptcp/pm_userspace.c454
-rw-r--r--net/mptcp/protocol.c641
-rw-r--r--net/mptcp/protocol.h212
-rw-r--r--net/mptcp/sockopt.c42
-rw-r--r--net/mptcp/subflow.c331
14 files changed, 2127 insertions, 690 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index e54daceac58b..6e7df47c9584 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
- mib.o pm_netlink.o sockopt.o
+ mib.o pm_netlink.o sockopt.o pm_userspace.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
@@ -10,3 +10,5 @@ obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
mptcp_crypto_test-objs := crypto_test.o
mptcp_token_test-objs := token_test.o
obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o
+
+obj-$(CONFIG_BPF_SYSCALL) += bpf.o
diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c
new file mode 100644
index 000000000000..5a0a84ad94af
--- /dev/null
+++ b/net/mptcp/bpf.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2020, Tessares SA.
+ * Copyright (c) 2022, SUSE.
+ *
+ * Author: Nicolas Rybowski <nicolas.rybowski@tessares.net>
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/bpf.h>
+#include "protocol.h"
+
+struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk)
+{
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk))
+ return mptcp_sk(mptcp_subflow_ctx(sk)->conn);
+
+ return NULL;
+}
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 8b235468c88f..ae20b7d92e28 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -16,6 +16,11 @@
#define MPTCP_SYSCTL_PATH "net/mptcp"
static int mptcp_pernet_id;
+
+#ifdef CONFIG_SYSCTL
+static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX;
+#endif
+
struct mptcp_pernet {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *ctl_table_hdr;
@@ -26,6 +31,7 @@ struct mptcp_pernet {
u8 mptcp_enabled;
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
+ u8 pm_type;
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -58,6 +64,11 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net)
return mptcp_get_pernet(net)->stale_loss_cnt;
}
+int mptcp_get_pm_type(const struct net *net)
+{
+ return mptcp_get_pernet(net)->pm_type;
+}
+
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
@@ -65,6 +76,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->checksum_enabled = 0;
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
+ pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
}
#ifdef CONFIG_SYSCTL
@@ -108,6 +120,14 @@ static struct ctl_table mptcp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
},
+ {
+ .procname = "pm_type",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &mptcp_pm_type_max
+ },
{}
};
@@ -128,6 +148,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[2].data = &pernet->checksum_enabled;
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
+ table[5].data = &pernet->pm_type;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 3240b72271a7..0dac2863c6e1 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -24,6 +24,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+ SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX),
SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR),
@@ -35,20 +36,30 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR),
SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD),
SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD),
+ SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP),
SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX),
SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX),
SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX),
SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX),
SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX),
SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR),
+ SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP),
SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX),
SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX),
+ SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX),
+ SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX),
+ SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX),
+ SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX),
SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE),
SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER),
+ SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
+ SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED),
+ SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE),
+ SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index ecd3d8b117e0..2be3596374f4 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -17,6 +17,7 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
+ MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */
MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */
MPTCP_MIB_DATACSUMERR, /* The data checksum fail */
@@ -28,20 +29,32 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */
MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */
MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */
+ MPTCP_MIB_ADDADDRDROP, /* Dropped incoming ADD_ADDR */
MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */
MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */
MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */
MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */
MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */
MPTCP_MIB_RMADDR, /* Received RM_ADDR */
+ MPTCP_MIB_RMADDRDROP, /* Dropped incoming RM_ADDR */
MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */
MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */
+ MPTCP_MIB_MPFASTCLOSETX, /* Transmit a MP_FASTCLOSE */
+ MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */
+ MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */
+ MPTCP_MIB_MPRSTRX, /* Received a MP_RST */
MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */
MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */
MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */
+ MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */
+ MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */
+ MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to
+ * conflict with another subflow while updating msk rcv wnd
+ */
+ MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index f44125dd6697..8df1bdb647e2 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -66,20 +66,106 @@ out_nosk:
return err;
}
+struct mptcp_diag_ctx {
+ long s_slot;
+ long s_num;
+ unsigned int l_slot;
+ unsigned int l_num;
+};
+
+static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ bool net_admin)
+{
+ struct inet_diag_dump_data *cb_data = cb->data;
+ struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
+ struct nlattr *bc = cb_data->inet_diag_nla_bc;
+ struct net *net = sock_net(skb->sk);
+ struct inet_hashinfo *hinfo;
+ int i;
+
+ hinfo = net->ipv4.tcp_death_row.hashinfo;
+
+ for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+ int num = 0;
+
+ ilb = &hinfo->lhash2[i];
+
+ rcu_read_lock();
+ spin_lock(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+ const struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ int ret;
+
+ if (num < diag_ctx->l_num)
+ goto next_listen;
+
+ if (!ctx || strcmp(inet_csk(sk)->icsk_ulp_ops->name, "mptcp"))
+ goto next_listen;
+
+ sk = ctx->conn;
+ if (!sk || !net_eq(sock_net(sk), net))
+ goto next_listen;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_listen;
+
+ ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+
+ sock_put(sk);
+
+ if (ret < 0) {
+ spin_unlock(&ilb->lock);
+ rcu_read_unlock();
+ diag_ctx->l_slot = i;
+ diag_ctx->l_num = num;
+ return;
+ }
+ diag_ctx->l_num = num + 1;
+ num = 0;
+next_listen:
+ ++num;
+ }
+ spin_unlock(&ilb->lock);
+ rcu_read_unlock();
+
+ cond_resched();
+ diag_ctx->l_num = 0;
+ }
+
+ diag_ctx->l_num = 0;
+ diag_ctx->l_slot = i;
+}
+
static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
struct inet_diag_dump_data *cb_data;
struct mptcp_sock *msk;
struct nlattr *bc;
+ BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx));
+
cb_data = cb->data;
bc = cb_data->inet_diag_nla_bc;
- while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) !=
- NULL) {
+ while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot,
+ &diag_ctx->s_num)) != NULL) {
struct inet_sock *inet = (struct inet_sock *)msk;
struct sock *sk = (struct sock *)msk;
int ret = 0;
@@ -101,11 +187,14 @@ next:
sock_put(sk);
if (ret < 0) {
/* will retry on the same position */
- cb->args[1]--;
+ diag_ctx->s_num--;
break;
}
cond_resched();
}
+
+ if ((r->idiag_states & TCPF_LISTEN) && r->id.idiag_dport == 0)
+ mptcp_diag_dump_listeners(skb, cb, r, net_admin);
}
static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -116,6 +205,19 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
r->idiag_rqueue = sk_rmem_alloc_get(sk);
r->idiag_wqueue = sk_wmem_alloc_get(sk);
+
+ if (inet_sk_state_load(sk) == TCP_LISTEN) {
+ struct sock *lsk = READ_ONCE(msk->first);
+
+ if (lsk) {
+ /* override with settings from tcp listener,
+ * so Send-Q will show accept queue.
+ */
+ r->idiag_rqueue = READ_ONCE(lsk->sk_ack_backlog);
+ r->idiag_wqueue = READ_ONCE(lsk->sk_max_ack_backlog);
+ }
+ }
+
if (!info)
return;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 645dd984fef0..30d289044e71 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -107,7 +107,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
ptr += 2;
}
if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) {
- mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr);
+ mp_opt->csum = get_unaligned((__force __sum16 *)ptr);
mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
ptr += 2;
}
@@ -221,7 +221,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) {
mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
- mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr);
+ mp_opt->csum = get_unaligned((__force __sum16 *)ptr);
ptr += 2;
}
@@ -323,6 +323,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
mp_opt->rcvr_key = get_unaligned_be64(ptr);
ptr += 8;
mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE;
+ pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key);
break;
case MPTCPOPT_RST:
@@ -336,6 +337,8 @@ static void mptcp_parse_option(const struct sk_buff *skb,
flags = *ptr++;
mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
mp_opt->reset_reason = *ptr;
+ pr_debug("MP_RST: transient=%u reason=%u",
+ mp_opt->reset_transient, mp_opt->reset_reason);
break;
case MPTCPOPT_MP_FAIL:
@@ -353,8 +356,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
}
}
-void mptcp_get_options(const struct sock *sk,
- const struct sk_buff *skb,
+void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt)
{
const struct tcphdr *th = tcp_hdr(skb);
@@ -651,7 +653,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
bool drop_other_suboptions = false;
unsigned int opt_size = *size;
bool echo;
- bool port;
int len;
/* add addr will strip the existing options, be sure to avoid breaking
@@ -660,12 +661,12 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
if (!mptcp_pm_should_add_signal(msk) ||
(opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
!mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr,
- &echo, &port, &drop_other_suboptions))
+ &echo, &drop_other_suboptions))
return false;
if (drop_other_suboptions)
remaining += opt_size;
- len = mptcp_add_addr_len(opts->addr.family, echo, port);
+ len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port);
if (remaining < len)
return false;
@@ -764,6 +765,7 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu
opts->suboptions |= OPTION_MPTCP_RST;
opts->reset_transient = subflow->reset_transient;
opts->reset_reason = subflow->reset_reason;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX);
return true;
}
@@ -787,6 +789,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk,
opts->rcvr_key = msk->remote_key;
pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
return true;
}
@@ -808,6 +811,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk,
opts->fail_seq = subflow->map_seq;
pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);
return true;
}
@@ -824,7 +828,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
opts->suboptions = 0;
- if (unlikely(__mptcp_check_fallback(msk)))
+ if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb)))
return false;
if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
@@ -928,7 +932,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) &&
- READ_ONCE(msk->pm.server_side))
+ !subflow->request_join)
tcp_send_ack(ssk);
goto fully_established;
}
@@ -963,7 +967,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
goto reset;
subflow->mp_capable = 0;
pr_fallback(msk);
- __mptcp_do_fallback(msk);
+ mptcp_do_fallback(ssk);
return false;
}
@@ -1084,8 +1088,7 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
&mp_opt->addr);
pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
- msk, (unsigned long long)hmac,
- (unsigned long long)mp_opt->ahmac);
+ msk, hmac, mp_opt->ahmac);
return hmac == mp_opt->ahmac;
}
@@ -1112,7 +1115,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return true;
}
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
/* The subflow can be in close state only if check_fully_established()
* just sent a reset. If so, tell the caller to ignore the current packet.
@@ -1125,12 +1128,13 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
msk->local_key == mp_opt.rcvr_key) {
WRITE_ONCE(msk->rcv_fastclose, true);
mptcp_schedule_work((struct sock *)msk);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX);
}
if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) &&
add_addr_hmac_valid(msk, &mp_opt)) {
if (!mp_opt.echo) {
- mptcp_pm_add_addr_received(msk, &mp_opt.addr);
+ mptcp_pm_add_addr_received(sk, &mp_opt.addr);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
} else {
mptcp_pm_add_addr_echoed(msk, &mp_opt.addr);
@@ -1159,6 +1163,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
subflow->reset_seen = 1;
subflow->reset_reason = mp_opt.reset_reason;
subflow->reset_transient = mp_opt.reset_transient;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX);
}
if (!(mp_opt.suboptions & OPTION_MPTCP_DSS))
@@ -1220,23 +1225,65 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
return true;
}
-static void mptcp_set_rwin(const struct tcp_sock *tp)
+static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
{
const struct sock *ssk = (const struct sock *)tp;
- const struct mptcp_subflow_context *subflow;
+ struct mptcp_subflow_context *subflow;
+ u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
struct mptcp_sock *msk;
- u64 ack_seq;
+ u32 new_win;
+ u64 win;
subflow = mptcp_subflow_ctx(ssk);
msk = mptcp_sk(subflow->conn);
- ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
+ ack_seq = READ_ONCE(msk->ack_seq);
+ rcv_wnd_new = ack_seq + tp->rcv_wnd;
+
+ rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
+ if (after64(rcv_wnd_new, rcv_wnd_old)) {
+ u64 rcv_wnd;
+
+ for (;;) {
+ rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);
+
+ if (rcv_wnd == rcv_wnd_old)
+ break;
+ if (before64(rcv_wnd_new, rcv_wnd)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
+ goto raise_win;
+ }
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
+ rcv_wnd_old = rcv_wnd;
+ }
+ return;
+ }
+
+ if (rcv_wnd_new != rcv_wnd_old) {
+raise_win:
+ win = rcv_wnd_old - ack_seq;
+ tp->rcv_wnd = min_t(u64, win, U32_MAX);
+ new_win = tp->rcv_wnd;
- if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
- WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+ /* Make sure we do not exceed the maximum possible
+ * scaled window.
+ */
+ if (unlikely(th->syn))
+ new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
+ if (!tp->rx_opt.rcv_wscale &&
+ READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows))
+ new_win = min(new_win, MAX_TCP_WINDOW);
+ else
+ new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+ /* RFC1323 scaling applied */
+ new_win >>= tp->rx_opt.rcv_wscale;
+ th->window = htons(new_win);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
+ }
}
-u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
+__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
{
struct csum_pseudo_header header;
__wsum csum;
@@ -1252,34 +1299,52 @@ u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
header.csum = 0;
csum = csum_partial(&header, sizeof(header), sum);
- return (__force u16)csum_fold(csum);
+ return csum_fold(csum);
}
-static u16 mptcp_make_csum(const struct mptcp_ext *mpext)
+static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext)
{
return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len,
~csum_unfold(mpext->csum));
}
-void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
- struct mptcp_out_options *opts)
+static void put_len_csum(u16 len, __sum16 csum, void *data)
{
- if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
- const struct sock *ssk = (const struct sock *)tp;
- struct mptcp_subflow_context *subflow;
+ __sum16 *sumptr = data + 2;
+ __be16 *ptr = data;
- subflow = mptcp_subflow_ctx(ssk);
- subflow->send_mp_fail = 0;
+ put_unaligned_be16(len, ptr);
- *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
- TCPOLEN_MPTCP_FAIL,
- 0, 0);
- put_unaligned_be64(opts->fail_seq, ptr);
- ptr += 2;
- }
+ put_unaligned(csum, sumptr);
+}
- /* DSS, MPC, MPJ, ADD_ADDR, FASTCLOSE and RST are mutually exclusive,
- * see mptcp_established_options*()
+void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
+ struct mptcp_out_options *opts)
+{
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+
+ /* Which options can be used together?
+ *
+ * X: mutually exclusive
+ * O: often used together
+ * C: can be used together in some cases
+ * P: could be used together but we prefer not to (optimisations)
+ *
+ * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC |
+ * ------|------|------|------|------|------|------|------|------|
+ * MPC |------|------|------|------|------|------|------|------|
+ * MPJ | X |------|------|------|------|------|------|------|
+ * DSS | X | X |------|------|------|------|------|------|
+ * ADD | X | X | P |------|------|------|------|------|
+ * RM | C | C | C | P |------|------|------|------|
+ * PRIO | X | C | C | C | C |------|------|------|
+ * FAIL | X | X | C | X | X | X |------|------|
+ * FC | X | X | X | X | X | X | X |------|
+ * RST | X | X | X | X | X | X | O | O |
+ * ------|------|------|------|------|------|------|------|------|
+ *
+ * The same applies in mptcp_established_options() function.
*/
if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
struct mptcp_ext *mpext = &opts->ext_copy;
@@ -1328,14 +1393,22 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
put_unaligned_be32(mpext->subflow_seq, ptr);
ptr += 1;
if (opts->csum_reqd) {
- put_unaligned_be32(mpext->data_len << 16 |
- mptcp_make_csum(mpext), ptr);
+ /* data_len == 0 is reserved for the infinite mapping,
+ * the checksum will also be set to 0.
+ */
+ put_len_csum(mpext->data_len,
+ (mpext->data_len ? mptcp_make_csum(mpext) : 0),
+ ptr);
} else {
put_unaligned_be32(mpext->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
}
ptr += 1;
}
+
+ /* We might need to add MP_FAIL options in rare cases */
+ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions))
+ goto mp_fail;
} else if (OPTIONS_MPTCP_MPC & opts->suboptions) {
u8 len, flag = MPTCP_CAP_HMAC_SHA256;
@@ -1376,11 +1449,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
goto mp_capable_done;
if (opts->csum_reqd) {
- put_unaligned_be32(opts->data_len << 16 |
- __mptcp_make_csum(opts->data_seq,
- opts->subflow_seq,
- opts->data_len,
- ~csum_unfold(opts->csum)), ptr);
+ put_len_csum(opts->data_len,
+ __mptcp_make_csum(opts->data_seq,
+ opts->subflow_seq,
+ opts->data_len,
+ ~csum_unfold(opts->csum)),
+ ptr);
} else {
put_unaligned_be32(opts->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
@@ -1479,6 +1553,21 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
if (OPTION_MPTCP_RST & opts->suboptions)
goto mp_rst;
return;
+ } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
+mp_fail:
+ /* MP_FAIL is mutually exclusive with others except RST */
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->send_mp_fail = 0;
+
+ *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
+ TCPOLEN_MPTCP_FAIL,
+ 0, 0);
+ put_unaligned_be64(opts->fail_seq, ptr);
+ ptr += 2;
+
+ if (OPTION_MPTCP_RST & opts->suboptions)
+ goto mp_rst;
+ return;
} else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
mp_rst:
*ptr++ = mptcp_option(MPTCPOPT_RST,
@@ -1489,15 +1578,15 @@ mp_rst:
}
if (OPTION_MPTCP_PRIO & opts->suboptions) {
- const struct sock *ssk = (const struct sock *)tp;
- struct mptcp_subflow_context *subflow;
-
subflow = mptcp_subflow_ctx(ssk);
subflow->send_mp_prio = 0;
*ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
TCPOLEN_MPTCP_PRIO,
opts->backup, TCPOPT_NOP);
+
+ MPTCP_INC_STATS(sock_net((const struct sock *)tp),
+ MPTCP_MIB_MPPRIOTX);
}
mp_capable_done:
@@ -1522,7 +1611,7 @@ mp_capable_done:
}
if (tp)
- mptcp_set_rwin(tp);
+ mptcp_set_rwin(tp, th);
}
__be32 mptcp_get_reset_option(const struct sk_buff *skb)
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 696b2c4613a7..45e2a48397b9 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -87,6 +87,9 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
unsigned int subflows_max;
int ret = 0;
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_active(msk);
+
subflows_max = mptcp_pm_get_subflows_max(msk);
pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
@@ -178,14 +181,14 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
struct mptcp_pm_data *pm = &msk->pm;
bool update_subflows;
- update_subflows = (ssk->sk_state == TCP_CLOSE) &&
- (subflow->request_join || subflow->mp_join);
+ update_subflows = (subflow->request_join || subflow->mp_join) &&
+ mptcp_pm_is_kernel(msk);
if (!READ_ONCE(pm->work_pending) && !update_subflows)
return;
spin_lock_bh(&pm->lock);
if (update_subflows)
- pm->subflows--;
+ __mptcp_pm_close_subflow(msk);
/* Even if this subflow is not really established, tell the PM to try
* to pick the next ones, if possible.
@@ -196,30 +199,41 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
spin_unlock_bh(&pm->lock);
}
-void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+void mptcp_pm_add_addr_received(const struct sock *ssk,
const struct mptcp_addr_info *addr)
{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_pm_data *pm = &msk->pm;
pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
READ_ONCE(pm->accept_addr));
- mptcp_event_addr_announced(msk, addr);
+ mptcp_event_addr_announced(ssk, addr);
spin_lock_bh(&pm->lock);
- if (!READ_ONCE(pm->accept_addr)) {
+ if (mptcp_pm_is_userspace(msk)) {
+ if (mptcp_userspace_pm_active(msk)) {
+ mptcp_pm_announce_addr(msk, addr, true);
+ mptcp_pm_add_addr_send_ack(msk);
+ } else {
+ __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
+ }
+ } else if (!READ_ONCE(pm->accept_addr)) {
mptcp_pm_announce_addr(msk, addr, true);
mptcp_pm_add_addr_send_ack(msk);
} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
pm->remote = *addr;
+ } else {
+ __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
}
spin_unlock_bh(&pm->lock);
}
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr)
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_data *pm = &msk->pm;
@@ -253,36 +267,67 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
mptcp_event_addr_removed(msk, rm_list->ids[i]);
spin_lock_bh(&pm->lock);
- mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED);
- pm->rm_list_rx = *rm_list;
+ if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED))
+ pm->rm_list_rx = *rm_list;
+ else
+ __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP);
spin_unlock_bh(&pm->lock);
}
-void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup)
+void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+ struct mptcp_sock *msk;
pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
- subflow->backup = bkup;
+ msk = mptcp_sk(sk);
+ if (subflow->backup != bkup) {
+ subflow->backup = bkup;
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ msk->last_snd = NULL;
+ else
+ __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags);
+ mptcp_data_unlock(sk);
+ }
- mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC);
+ mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
pr_debug("fail_seq=%llu", fail_seq);
+
+ if (!READ_ONCE(msk->allow_infinite_fallback))
+ return;
+
+ if (!subflow->fail_tout) {
+ pr_debug("send MP_FAIL response and infinite map");
+
+ subflow->send_mp_fail = 1;
+ subflow->send_infinite_map = 1;
+ tcp_send_ack(sk);
+ } else {
+ pr_debug("MP_FAIL response received");
+ WRITE_ONCE(subflow->fail_tout, 0);
+ }
}
/* path manager helpers */
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb,
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
unsigned int opt_size, unsigned int remaining,
struct mptcp_addr_info *addr, bool *echo,
- bool *port, bool *drop_other_suboptions)
+ bool *drop_other_suboptions)
{
int ret = false;
u8 add_addr;
u8 family;
+ bool port;
spin_lock_bh(&msk->pm.lock);
@@ -300,10 +345,10 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb,
}
*echo = mptcp_pm_should_add_signal_echo(msk);
- *port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
+ port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
family = *echo ? msk->pm.remote.family : msk->pm.local.family;
- if (remaining < mptcp_add_addr_len(family, *echo, *port))
+ if (remaining < mptcp_add_addr_len(family, *echo, port))
goto out_unlock;
if (*echo) {
@@ -377,27 +422,48 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
void mptcp_pm_data_reset(struct mptcp_sock *msk)
{
- msk->pm.add_addr_signaled = 0;
- msk->pm.add_addr_accepted = 0;
- msk->pm.local_addr_used = 0;
- msk->pm.subflows = 0;
- msk->pm.rm_list_tx.nr = 0;
- msk->pm.rm_list_rx.nr = 0;
- WRITE_ONCE(msk->pm.work_pending, false);
- WRITE_ONCE(msk->pm.addr_signal, 0);
- WRITE_ONCE(msk->pm.accept_addr, false);
- WRITE_ONCE(msk->pm.accept_subflow, false);
- WRITE_ONCE(msk->pm.remote_deny_join_id0, false);
- msk->pm.status = 0;
- bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
+ struct mptcp_pm_data *pm = &msk->pm;
- mptcp_pm_nl_data_init(msk);
+ pm->add_addr_signaled = 0;
+ pm->add_addr_accepted = 0;
+ pm->local_addr_used = 0;
+ pm->subflows = 0;
+ pm->rm_list_tx.nr = 0;
+ pm->rm_list_rx.nr = 0;
+ WRITE_ONCE(pm->pm_type, pm_type);
+
+ if (pm_type == MPTCP_PM_TYPE_KERNEL) {
+ bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
+
+ /* pm->work_pending must be only be set to 'true' when
+ * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
+ */
+ WRITE_ONCE(pm->work_pending,
+ (!!mptcp_pm_get_local_addr_max(msk) &&
+ subflows_allowed) ||
+ !!mptcp_pm_get_add_addr_signal_max(msk));
+ WRITE_ONCE(pm->accept_addr,
+ !!mptcp_pm_get_add_addr_accept_max(msk) &&
+ subflows_allowed);
+ WRITE_ONCE(pm->accept_subflow, subflows_allowed);
+ } else {
+ WRITE_ONCE(pm->work_pending, 0);
+ WRITE_ONCE(pm->accept_addr, 0);
+ WRITE_ONCE(pm->accept_subflow, 0);
+ }
+
+ WRITE_ONCE(pm->addr_signal, 0);
+ WRITE_ONCE(pm->remote_deny_join_id0, false);
+ pm->status = 0;
+ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
{
spin_lock_init(&msk->pm.lock);
INIT_LIST_HEAD(&msk->pm.anno_list);
+ INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list);
mptcp_pm_data_reset(msk);
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 75af1f701e1d..9813ed0fde9b 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -22,14 +22,6 @@ static struct genl_family mptcp_genl_family;
static int pm_nl_pernet_id;
-struct mptcp_pm_addr_entry {
- struct list_head list;
- struct mptcp_addr_info addr;
- u8 flags;
- int ifindex;
- struct socket *lsk;
-};
-
struct mptcp_pm_add_entry {
struct list_head list;
struct mptcp_addr_info addr;
@@ -55,8 +47,19 @@ struct pm_nl_pernet {
#define MPTCP_PM_ADDR_MAX 8
#define ADD_ADDR_RETRANS_MAX 3
-static bool addresses_equal(const struct mptcp_addr_info *a,
- const struct mptcp_addr_info *b, bool use_port)
+static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net)
+{
+ return net_generic(net, pm_nl_pernet_id);
+}
+
+static struct pm_nl_pernet *
+pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk)
+{
+ return pm_nl_get_pernet(sock_net((struct sock *)msk));
+}
+
+bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
+ const struct mptcp_addr_info *b, bool use_port)
{
bool addr_equals = false;
@@ -83,16 +86,6 @@ static bool addresses_equal(const struct mptcp_addr_info *a,
return a->port == b->port;
}
-static bool address_zero(const struct mptcp_addr_info *addr)
-{
- struct mptcp_addr_info zero;
-
- memset(&zero, 0, sizeof(zero));
- zero.family = addr->family;
-
- return addresses_equal(addr, &zero, true);
-}
-
static void local_address(const struct sock_common *skc,
struct mptcp_addr_info *addr)
{
@@ -120,7 +113,7 @@ static void remote_address(const struct sock_common *skc,
}
static bool lookup_subflow_by_saddr(const struct list_head *list,
- struct mptcp_addr_info *saddr)
+ const struct mptcp_addr_info *saddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
@@ -130,7 +123,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list,
skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
local_address(skc, &cur);
- if (addresses_equal(&cur, saddr, saddr->port))
+ if (mptcp_addresses_equal(&cur, saddr, saddr->port))
return true;
}
@@ -138,7 +131,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list,
}
static bool lookup_subflow_by_daddr(const struct list_head *list,
- struct mptcp_addr_info *daddr)
+ const struct mptcp_addr_info *daddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
@@ -148,7 +141,7 @@ static bool lookup_subflow_by_daddr(const struct list_head *list,
skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
remote_address(skc, &cur);
- if (addresses_equal(&cur, daddr, daddr->port))
+ if (mptcp_addresses_equal(&cur, daddr, daddr->port))
return true;
}
@@ -157,10 +150,10 @@ static bool lookup_subflow_by_daddr(const struct list_head *list,
static struct mptcp_pm_addr_entry *
select_local_address(const struct pm_nl_pernet *pernet,
- struct mptcp_sock *msk)
+ const struct mptcp_sock *msk)
{
+ const struct sock *sk = (const struct sock *)msk;
struct mptcp_pm_addr_entry *entry, *ret = NULL;
- struct sock *sk = (struct sock *)msk;
msk_owned_by_me(msk);
@@ -190,7 +183,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
}
static struct mptcp_pm_addr_entry *
-select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk)
+select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
{
struct mptcp_pm_addr_entry *entry, *ret = NULL;
@@ -214,45 +207,41 @@ select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk)
return ret;
}
-unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
{
- struct pm_nl_pernet *pernet;
+ const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
return READ_ONCE(pernet->add_addr_signal_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
-unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk)
{
- struct pm_nl_pernet *pernet;
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
return READ_ONCE(pernet->add_addr_accept_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
-unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk)
{
- struct pm_nl_pernet *pernet;
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
return READ_ONCE(pernet->subflows_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
-unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk)
+unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk)
{
- struct pm_nl_pernet *pernet;
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
return READ_ONCE(pernet->local_addr_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
{
- struct pm_nl_pernet *pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
(find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
@@ -264,15 +253,15 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
}
struct mptcp_pm_add_entry *
-mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr)
+mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
lockdep_assert_held(&msk->pm.lock);
list_for_each_entry(entry, &msk->pm.anno_list, list) {
- if (addresses_equal(&entry->addr, addr, true))
+ if (mptcp_addresses_equal(&entry->addr, addr, true))
return entry;
}
@@ -289,7 +278,7 @@ bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
spin_lock_bh(&msk->pm.lock);
list_for_each_entry(entry, &msk->pm.anno_list, list) {
- if (addresses_equal(&entry->addr, &saddr, true)) {
+ if (mptcp_addresses_equal(&entry->addr, &saddr, true)) {
ret = true;
goto out;
}
@@ -346,7 +335,7 @@ out:
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr, bool check_id)
+ const struct mptcp_addr_info *addr, bool check_id)
{
struct mptcp_pm_add_entry *entry;
struct sock *sk = (struct sock *)msk;
@@ -363,8 +352,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
return entry;
}
-static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
- struct mptcp_pm_addr_entry *entry)
+bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
+ const struct mptcp_pm_addr_entry *entry)
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
@@ -372,8 +361,16 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
lockdep_assert_held(&msk->pm.lock);
- if (mptcp_lookup_anno_list_by_saddr(msk, &entry->addr))
- return false;
+ add_entry = mptcp_lookup_anno_list_by_saddr(msk, &entry->addr);
+
+ if (add_entry) {
+ if (mptcp_pm_is_kernel(msk))
+ return false;
+
+ sk_reset_timer(sk, &add_entry->add_timer,
+ jiffies + mptcp_get_add_addr_timeout(net));
+ return true;
+ }
add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC);
if (!add_entry)
@@ -410,13 +407,13 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
}
}
-static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr,
- struct mptcp_addr_info *addr)
+static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr,
+ const struct mptcp_addr_info *addr)
{
int i;
for (i = 0; i < nr; i++) {
- if (addresses_equal(&addrs[i], addr, addr->port))
+ if (addrs[i].id == addr->id)
return true;
}
@@ -452,7 +449,8 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
remote_address((struct sock_common *)ssk, &addrs[i]);
- if (deny_id0 && addresses_equal(&addrs[i], &remote, false))
+ addrs[i].id = subflow->remote_id;
+ if (deny_id0 && !addrs[i].id)
continue;
if (!lookup_address_in_vec(addrs, i, &addrs[i]) &&
@@ -466,6 +464,37 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm
return i;
}
+static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ bool prio, bool backup)
+{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow;
+
+ pr_debug("send ack for %s",
+ prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));
+
+ slow = lock_sock_fast(ssk);
+ if (prio) {
+ if (subflow->backup != backup)
+ msk->last_snd = NULL;
+
+ subflow->send_mp_prio = 1;
+ subflow->backup = backup;
+ subflow->request_bkup = backup;
+ }
+
+ __mptcp_subflow_send_ack(ssk);
+ unlock_sock_fast(ssk, slow);
+}
+
+static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
+ bool prio, bool backup)
+{
+ spin_unlock_bh(&msk->pm.lock);
+ __mptcp_pm_send_ack(msk, subflow, prio, backup);
+ spin_lock_bh(&msk->pm.lock);
+}
+
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
@@ -478,21 +507,19 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
return NULL;
}
-static int
-lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr)
+static struct mptcp_pm_addr_entry *
+__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info,
+ bool lookup_by_id)
{
struct mptcp_pm_addr_entry *entry;
- int ret = -1;
- rcu_read_lock();
list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if (addresses_equal(&entry->addr, addr, entry->addr.port)) {
- ret = entry->addr.id;
- break;
- }
+ if ((!lookup_by_id &&
+ mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) ||
+ (lookup_by_id && entry->addr.id == info->id))
+ return entry;
}
- rcu_read_unlock();
- return ret;
+ return NULL;
}
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
@@ -504,7 +531,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
struct pm_nl_pernet *pernet;
unsigned int subflows_max;
- pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
+ pernet = pm_nl_get_pernet(sock_net(sk));
add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
local_addr_max = mptcp_pm_get_local_addr_max(msk);
@@ -512,13 +539,23 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
/* do lazy endpoint usage accounting for the MPC subflows */
if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first);
+ struct mptcp_pm_addr_entry *entry;
struct mptcp_addr_info mpc_addr;
- int mpc_id;
+ bool backup = false;
local_address((struct sock_common *)msk->first, &mpc_addr);
- mpc_id = lookup_id_by_addr(pernet, &mpc_addr);
- if (mpc_id >= 0)
- __clear_bit(mpc_id, msk->pm.id_avail_bitmap);
+ rcu_read_lock();
+ entry = __lookup_addr(pernet, &mpc_addr, false);
+ if (entry) {
+ __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
+ msk->mpc_endpoint_id = entry->addr.id;
+ backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ }
+ rcu_read_unlock();
+
+ if (backup)
+ mptcp_pm_send_ack(msk, subflow, true, backup);
msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
}
@@ -532,6 +569,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
if (msk->pm.add_addr_signaled < add_addr_signal_max) {
local = select_signal_address(pernet, msk);
+ /* due to racing events on both ends we can reach here while
+ * previous add address is still running: if we invoke now
+ * mptcp_pm_announce_addr(), that will fail and the
+ * corresponding id will be marked as used.
+ * Instead let the PM machinery reschedule us when the
+ * current address announce will be completed.
+ */
+ if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL))
+ return;
+
if (local) {
if (mptcp_pm_alloc_anno_list(msk, local)) {
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
@@ -590,7 +637,7 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
unsigned int subflows_max;
int i = 0;
- pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
+ pernet = pm_nl_get_pernet_from_msk(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
rcu_read_lock();
@@ -645,15 +692,20 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
msk->pm.add_addr_accepted, add_addr_accept_max,
msk->pm.remote.family);
- if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote))
- goto add_addr_echo;
+ remote = msk->pm.remote;
+ mptcp_pm_announce_addr(msk, &remote, true);
+ mptcp_pm_nl_addr_send_ack(msk);
+
+ if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
+ return;
+
+ /* pick id 0 port, if none is provided the remote address */
+ if (!remote.port)
+ remote.port = sk->sk_dport;
/* connect to the specified remote address, using whatever
* local address the routing configuration will pick.
*/
- remote = msk->pm.remote;
- if (!remote.port)
- remote.port = sk->sk_dport;
nr = fill_local_addresses_vec(msk, addrs);
msk->pm.add_addr_accepted++;
@@ -665,10 +717,6 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
for (i = 0; i < nr; i++)
__mptcp_subflow_connect(sk, &addrs[i], &remote);
spin_lock_bh(&msk->pm.lock);
-
-add_addr_echo:
- mptcp_pm_announce_addr(msk, &msk->pm.remote, true);
- mptcp_pm_nl_addr_send_ack(msk);
}
void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
@@ -683,21 +731,14 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
return;
subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
- if (subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- spin_unlock_bh(&msk->pm.lock);
- pr_debug("send ack for %s",
- mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr");
-
- mptcp_subflow_send_ack(ssk);
- spin_lock_bh(&msk->pm.lock);
- }
+ if (subflow)
+ mptcp_pm_send_ack(msk, subflow, false, false);
}
-static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr,
- u8 bkup)
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup)
{
struct mptcp_subflow_context *subflow;
@@ -705,29 +746,30 @@ static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- struct sock *sk = (struct sock *)msk;
- struct mptcp_addr_info local;
+ struct mptcp_addr_info local, remote;
local_address((struct sock_common *)ssk, &local);
- if (!addresses_equal(&local, addr, addr->port))
+ if (!mptcp_addresses_equal(&local, addr, addr->port))
continue;
- subflow->backup = bkup;
- subflow->send_mp_prio = 1;
- subflow->request_bkup = bkup;
- __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIOTX);
-
- spin_unlock_bh(&msk->pm.lock);
- pr_debug("send ack for mp_prio");
- mptcp_subflow_send_ack(ssk);
- spin_lock_bh(&msk->pm.lock);
+ if (rem && rem->family != AF_UNSPEC) {
+ remote_address((struct sock_common *)ssk, &remote);
+ if (!mptcp_addresses_equal(&remote, rem, rem->port))
+ continue;
+ }
+ __mptcp_pm_send_ack(msk, subflow, true, bkup);
return 0;
}
return -EINVAL;
}
+static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id)
+{
+ return local_id == id || (!local_id && msk->mpc_endpoint_id == id);
+}
+
static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list,
enum linux_mptcp_mib_field rm_type)
@@ -751,22 +793,23 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
return;
for (i = 0; i < rm_list->nr; i++) {
+ u8 rm_id = rm_list->ids[i];
bool removed = false;
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
u8 id = subflow->local_id;
- if (rm_type == MPTCP_MIB_RMADDR)
- id = subflow->remote_id;
-
- if (rm_list->ids[i] != id)
+ if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id)
+ continue;
+ if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id))
continue;
- pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u",
+ pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u",
rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
- i, rm_list->ids[i], subflow->local_id, subflow->remote_id);
+ i, rm_id, subflow->local_id, subflow->remote_id,
+ msk->mpc_endpoint_id);
spin_unlock_bh(&msk->pm.lock);
mptcp_subflow_shutdown(sk, ssk, how);
@@ -777,10 +820,14 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
removed = true;
__MPTCP_INC_STATS(sock_net(sk), rm_type);
}
- __set_bit(rm_list->ids[1], msk->pm.id_avail_bitmap);
+ if (rm_type == MPTCP_MIB_RMSUBFLOW)
+ __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap);
if (!removed)
continue;
+ if (!mptcp_pm_is_kernel(msk))
+ continue;
+
if (rm_type == MPTCP_MIB_RMADDR) {
msk->pm.add_addr_accepted--;
WRITE_ONCE(msk->pm.accept_addr, true);
@@ -844,10 +891,18 @@ static bool address_use_port(struct mptcp_pm_addr_entry *entry)
MPTCP_PM_ADDR_FLAG_SIGNAL;
}
+/* caller must ensure the RCU grace period is already elapsed */
+static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
+{
+ if (entry->lsk)
+ sock_release(entry->lsk);
+ kfree(entry);
+}
+
static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry)
{
- struct mptcp_pm_addr_entry *cur;
+ struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
unsigned int addr_max;
int ret = -EINVAL;
@@ -865,11 +920,26 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
/* do not insert duplicate address, differentiate on port only
* singled addresses
*/
+ if (!address_use_port(entry))
+ entry->addr.port = 0;
list_for_each_entry(cur, &pernet->local_addr_list, list) {
- if (addresses_equal(&cur->addr, &entry->addr,
- address_use_port(entry) &&
- address_use_port(cur)))
- goto out;
+ if (mptcp_addresses_equal(&cur->addr, &entry->addr,
+ cur->addr.port || entry->addr.port)) {
+ /* allow replacing the exiting endpoint only if such
+ * endpoint is an implicit one and the user-space
+ * did not provide an endpoint id
+ */
+ if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT))
+ goto out;
+ if (entry->addr.id)
+ goto out;
+
+ pernet->addrs--;
+ entry->addr.id = cur->addr.id;
+ list_del_rcu(&cur->list);
+ del_entry = cur;
+ break;
+ }
}
if (!entry->addr.id) {
@@ -900,17 +970,27 @@ find_next:
}
pernet->addrs++;
- list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
+ if (!entry->addr.port)
+ list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
+ else
+ list_add_rcu(&entry->list, &pernet->local_addr_list);
ret = entry->addr.id;
out:
spin_unlock_bh(&pernet->lock);
+
+ /* just replaced an existing entry, free it */
+ if (del_entry) {
+ synchronize_rcu();
+ __mptcp_pm_release_addr_entry(del_entry);
+ }
return ret;
}
static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
struct mptcp_pm_addr_entry *entry)
{
+ int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr;
struct mptcp_sock *msk;
struct socket *ssock;
@@ -935,8 +1015,11 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
}
mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
- err = kernel_bind(ssock, (struct sockaddr *)&addr,
- sizeof(struct sockaddr_in));
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (entry->addr.family == AF_INET6)
+ addrlen = sizeof(struct sockaddr_in6);
+#endif
+ err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen);
if (err) {
pr_warn("kernel_bind error, err=%d", err);
goto out;
@@ -971,17 +1054,17 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
*/
local_address((struct sock_common *)msk, &msk_local);
local_address((struct sock_common *)skc, &skc_local);
- if (addresses_equal(&msk_local, &skc_local, false))
+ if (mptcp_addresses_equal(&msk_local, &skc_local, false))
return 0;
- if (address_zero(&skc_local))
- return 0;
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_get_local_id(msk, &skc_local);
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ pernet = pm_nl_get_pernet_from_msk(msk);
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (addresses_equal(&entry->addr, &skc_local, entry->addr.port)) {
+ if (mptcp_addresses_equal(&entry->addr, &skc_local, entry->addr.port)) {
ret = entry->addr.id;
break;
}
@@ -999,7 +1082,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
entry->addr.id = 0;
entry->addr.port = 0;
entry->ifindex = 0;
- entry->flags = 0;
+ entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
entry->lsk = NULL;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
@@ -1008,18 +1091,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return ret;
}
-void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
-{
- struct mptcp_pm_data *pm = &msk->pm;
- bool subflows;
-
- subflows = !!mptcp_pm_get_subflows_max(msk);
- WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) ||
- !!mptcp_pm_get_add_addr_signal_max(msk));
- WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows);
- WRITE_ONCE(pm->accept_subflow, subflows);
-}
-
#define MPTCP_PM_CMD_GRP_OFFSET 0
#define MPTCP_PM_EV_GRP_OFFSET 1
@@ -1047,6 +1118,10 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
NLA_POLICY_NESTED(mptcp_pm_addr_policy),
[MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
[MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, },
+ [MPTCP_PM_ATTR_ADDR_REMOTE] =
+ NLA_POLICY_NESTED(mptcp_pm_addr_policy),
};
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
@@ -1076,7 +1151,7 @@ void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ss
}
unlock_sock_fast(ssk, slow);
- /* always try to push the pending data regarless of re-injections:
+ /* always try to push the pending data regardless of re-injections:
* we can possibly use backup subflows now, and subflow selection
* is cheap under the msk socket lock
*/
@@ -1095,11 +1170,12 @@ static int mptcp_pm_family_to_addr(int family)
return MPTCP_PM_ADDR_ATTR_ADDR4;
}
-static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
- bool require_family,
- struct mptcp_pm_addr_entry *entry)
+static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[],
+ const struct nlattr *attr,
+ struct genl_info *info,
+ struct mptcp_addr_info *addr,
+ bool require_family)
{
- struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
int err, addr_addr;
if (!attr) {
@@ -1113,27 +1189,29 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
if (err)
return err;
- memset(entry, 0, sizeof(*entry));
+ if (tb[MPTCP_PM_ADDR_ATTR_ID])
+ addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
+
if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
if (!require_family)
- goto skip_family;
+ return err;
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"missing family");
return -EINVAL;
}
- entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
- if (entry->addr.family != AF_INET
+ addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
+ if (addr->family != AF_INET
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- && entry->addr.family != AF_INET6
+ && addr->family != AF_INET6
#endif
) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"unknown address family");
return -EINVAL;
}
- addr_addr = mptcp_pm_family_to_addr(entry->addr.family);
+ addr_addr = mptcp_pm_family_to_addr(addr->family);
if (!tb[addr_addr]) {
NL_SET_ERR_MSG_ATTR(info->extack, attr,
"missing address data");
@@ -1141,40 +1219,59 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (entry->addr.family == AF_INET6)
- entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]);
+ if (addr->family == AF_INET6)
+ addr->addr6 = nla_get_in6_addr(tb[addr_addr]);
else
#endif
- entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
+ addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_PORT])
+ addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
+
+ return err;
+}
+
+int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
+ struct mptcp_addr_info *addr)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+
+ memset(addr, 0, sizeof(*addr));
+
+ return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true);
+}
+
+int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
+ bool require_family,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+ int err;
+
+ memset(entry, 0, sizeof(*entry));
+
+ err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family);
+ if (err)
+ return err;
-skip_family:
if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
entry->ifindex = val;
}
- if (tb[MPTCP_PM_ADDR_ATTR_ID])
- entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
-
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
- if (tb[MPTCP_PM_ADDR_ATTR_PORT]) {
- if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
- NL_SET_ERR_MSG_ATTR(info->extack, attr,
- "flags must have signal when using port");
- return -EINVAL;
- }
+ if (tb[MPTCP_PM_ADDR_ATTR_PORT])
entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
- }
return 0;
}
static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
{
- return net_generic(genl_info_net(info), pm_nl_pernet_id);
+ return pm_nl_get_pernet(genl_info_net(info));
}
static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
@@ -1185,7 +1282,8 @@ static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
- if (!READ_ONCE(msk->fully_established))
+ if (!READ_ONCE(msk->fully_established) ||
+ mptcp_pm_is_userspace(msk))
goto next;
lock_sock(sk);
@@ -1209,11 +1307,27 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
struct mptcp_pm_addr_entry addr, *entry;
int ret;
- ret = mptcp_pm_parse_addr(attr, info, true, &addr);
+ ret = mptcp_pm_parse_entry(attr, info, true, &addr);
if (ret < 0)
return ret;
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ GENL_SET_ERR_MSG(info, "flags must have signal when using port");
+ return -EINVAL;
+ }
+
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL &&
+ addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) {
+ GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh");
+ return -EINVAL;
+ }
+
+ if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) {
+ GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint");
+ return -EINVAL;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
return -ENOMEM;
@@ -1242,17 +1356,25 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
+int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
u8 *flags, int *ifindex)
{
struct mptcp_pm_addr_entry *entry;
+ struct sock *sk = (struct sock *)msk;
+ struct net *net = sock_net(sk);
*flags = 0;
*ifindex = 0;
if (id) {
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk,
+ id,
+ flags,
+ ifindex);
+
rcu_read_lock();
- entry = __lookup_addr_by_id(net_generic(net, pm_nl_pernet_id), id);
+ entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id);
if (entry) {
*flags = entry->flags;
*ifindex = entry->ifindex;
@@ -1264,7 +1386,7 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
}
static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr)
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
@@ -1279,7 +1401,7 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
}
static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr,
+ const struct mptcp_addr_info *addr,
bool force)
{
struct mptcp_rm_list list = { .nr = 0 };
@@ -1297,11 +1419,12 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
}
static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
- struct mptcp_addr_info *addr)
+ const struct mptcp_pm_addr_entry *entry)
{
- struct mptcp_sock *msk;
- long s_slot = 0, s_num = 0;
+ const struct mptcp_addr_info *addr = &entry->addr;
struct mptcp_rm_list list = { .nr = 0 };
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
pr_debug("remove_id=%d", addr->id);
@@ -1311,6 +1434,9 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
struct sock *sk = (struct sock *)msk;
bool remove_subflow;
+ if (mptcp_pm_is_userspace(msk))
+ goto next;
+
if (list_empty(&msk->conn_list)) {
mptcp_pm_remove_anno_addr(msk, addr, false);
goto next;
@@ -1318,7 +1444,8 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
lock_sock(sk);
remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
- mptcp_pm_remove_anno_addr(msk, addr, remove_subflow);
+ mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
+ !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
if (remove_subflow)
mptcp_pm_remove_subflow(msk, &list);
release_sock(sk);
@@ -1331,14 +1458,6 @@ next:
return 0;
}
-/* caller must ensure the RCU grace period is already elapsed */
-static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
-{
- if (entry->lsk)
- sock_release(entry->lsk);
- kfree(entry);
-}
-
static int mptcp_nl_remove_id_zero_address(struct net *net,
struct mptcp_addr_info *addr)
{
@@ -1352,11 +1471,11 @@ static int mptcp_nl_remove_id_zero_address(struct net *net,
struct sock *sk = (struct sock *)msk;
struct mptcp_addr_info msk_local;
- if (list_empty(&msk->conn_list))
+ if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
local_address((struct sock_common *)msk, &msk_local);
- if (!addresses_equal(&msk_local, addr, addr->port))
+ if (!mptcp_addresses_equal(&msk_local, addr, addr->port))
goto next;
lock_sock(sk);
@@ -1382,7 +1501,7 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
unsigned int addr_max;
int ret;
- ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
@@ -1415,29 +1534,27 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
__clear_bit(entry->addr.id, pernet->id_bitmap);
spin_unlock_bh(&pernet->lock);
- mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr);
+ mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry);
synchronize_rcu();
__mptcp_pm_release_addr_entry(entry);
return ret;
}
-static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
- struct list_head *rm_list)
+void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
+ struct list_head *rm_list)
{
struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
struct mptcp_pm_addr_entry *entry;
list_for_each_entry(entry, rm_list, list) {
if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
- alist.nr < MPTCP_RM_IDS_MAX &&
- slist.nr < MPTCP_RM_IDS_MAX) {
- alist.ids[alist.nr++] = entry->addr.id;
+ slist.nr < MPTCP_RM_IDS_MAX)
slist.ids[slist.nr++] = entry->addr.id;
- } else if (remove_anno_list_by_saddr(msk, &entry->addr) &&
- alist.nr < MPTCP_RM_IDS_MAX) {
+
+ if (remove_anno_list_by_saddr(msk, &entry->addr) &&
+ alist.nr < MPTCP_RM_IDS_MAX)
alist.ids[alist.nr++] = entry->addr.id;
- }
}
if (alist.nr) {
@@ -1461,9 +1578,11 @@ static void mptcp_nl_remove_addrs_list(struct net *net,
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
- lock_sock(sk);
- mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
- release_sock(sk);
+ if (!mptcp_pm_is_userspace(msk)) {
+ lock_sock(sk);
+ mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
+ release_sock(sk);
+ }
sock_put(sk);
cond_resched();
@@ -1556,7 +1675,7 @@ static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
void *reply;
int ret;
- ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
@@ -1607,7 +1726,7 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
void *hdr;
int i;
- pernet = net_generic(net, pm_nl_pernet_id);
+ pernet = pm_nl_get_pernet(net);
spin_lock_bh(&pernet->lock);
for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) {
@@ -1714,9 +1833,22 @@ fail:
return -EMSGSIZE;
}
-static int mptcp_nl_addr_backup(struct net *net,
- struct mptcp_addr_info *addr,
- u8 bkup)
+static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr)
+{
+ struct mptcp_rm_list list = { .nr = 0 };
+
+ list.ids[list.nr++] = addr->id;
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_nl_rm_subflow_received(msk, &list);
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
+}
+
+static int mptcp_nl_set_flags(struct net *net,
+ struct mptcp_addr_info *addr,
+ u8 bkup, u8 changed)
{
long s_slot = 0, s_num = 0;
struct mptcp_sock *msk;
@@ -1725,13 +1857,14 @@ static int mptcp_nl_addr_backup(struct net *net,
while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
struct sock *sk = (struct sock *)msk;
- if (list_empty(&msk->conn_list))
+ if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk))
goto next;
lock_sock(sk);
- spin_lock_bh(&msk->pm.lock);
- ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
- spin_unlock_bh(&msk->pm.lock);
+ if (changed & MPTCP_PM_ADDR_FLAG_BACKUP)
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup);
+ if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)
+ mptcp_pm_nl_fullmesh(msk, addr);
release_sock(sk);
next:
@@ -1745,16 +1878,27 @@ next:
static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
{
struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry;
+ struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, };
+ struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP |
+ MPTCP_PM_ADDR_FLAG_FULLMESH;
struct net *net = sock_net(skb->sk);
u8 bkup = 0, lookup_by_id = 0;
int ret;
- ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
return ret;
+ if (attr_rem) {
+ ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote);
+ if (ret < 0)
+ return ret;
+ }
+
if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
bkup = 1;
if (addr.addr.family == AF_UNSPEC) {
@@ -1763,18 +1907,28 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
return -EOPNOTSUPP;
}
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if ((!lookup_by_id && addresses_equal(&entry->addr, &addr.addr, true)) ||
- (lookup_by_id && entry->addr.id == addr.addr.id)) {
- mptcp_nl_addr_backup(net, &entry->addr, bkup);
-
- if (bkup)
- entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
- else
- entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
- }
+ if (token)
+ return mptcp_userspace_pm_set_flags(sock_net(skb->sk),
+ token, &addr, &remote, bkup);
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr(pernet, &addr.addr, lookup_by_id);
+ if (!entry) {
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
+ }
+ if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) &&
+ (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ spin_unlock_bh(&pernet->lock);
+ return -EINVAL;
}
+ changed = (addr.flags ^ entry->flags) & mask;
+ entry->flags = (entry->flags & ~mask) | (addr.flags & mask);
+ addr = *entry;
+ spin_unlock_bh(&pernet->lock);
+
+ mptcp_nl_set_flags(net, &addr.addr, bkup, changed);
return 0;
}
@@ -1784,6 +1938,13 @@ static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gf
nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp);
}
+bool mptcp_userspace_pm_active(const struct mptcp_sock *msk)
+{
+ return genl_has_listeners(&mptcp_genl_family,
+ sock_net((const struct sock *)msk),
+ MPTCP_PM_EV_GRP_OFFSET);
+}
+
static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk)
{
const struct inet_sock *issk = inet_sk(ssk);
@@ -1904,6 +2065,9 @@ static int mptcp_event_created(struct sk_buff *skb,
if (err)
return err;
+ if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side)))
+ return -EMSGSIZE;
+
return mptcp_event_add_subflow(skb, ssk);
}
@@ -1938,10 +2102,12 @@ nla_put_failure:
kfree_skb(skb);
}
-void mptcp_event_addr_announced(const struct mptcp_sock *msk,
+void mptcp_event_addr_announced(const struct sock *ssk,
const struct mptcp_addr_info *info)
{
- struct net *net = sock_net((const struct sock *)msk);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct net *net = sock_net(ssk);
struct nlmsghdr *nlh;
struct sk_buff *skb;
@@ -1963,7 +2129,10 @@ void mptcp_event_addr_announced(const struct mptcp_sock *msk,
if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id))
goto nla_put_failure;
- if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port))
+ if (nla_put_be16(skb, MPTCP_ATTR_DPORT,
+ info->port == 0 ?
+ inet_sk(ssk)->inet_dport :
+ info->port))
goto nla_put_failure;
switch (info->family) {
@@ -2049,17 +2218,17 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_ADD_ADDR,
.doit = mptcp_nl_cmd_add_addr,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_DEL_ADDR,
.doit = mptcp_nl_cmd_del_addr,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
.doit = mptcp_nl_cmd_flush_addrs,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_GET_ADDR,
@@ -2069,7 +2238,7 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_SET_LIMITS,
.doit = mptcp_nl_cmd_set_limits,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = MPTCP_PM_CMD_GET_LIMITS,
@@ -2078,7 +2247,27 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_SET_FLAGS,
.doit = mptcp_nl_cmd_set_flags,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_ANNOUNCE,
+ .doit = mptcp_nl_cmd_announce,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_REMOVE,
+ .doit = mptcp_nl_cmd_remove,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE,
+ .doit = mptcp_nl_cmd_sf_create,
+ .flags = GENL_UNS_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY,
+ .doit = mptcp_nl_cmd_sf_destroy,
+ .flags = GENL_UNS_ADMIN_PERM,
},
};
@@ -2091,13 +2280,14 @@ static struct genl_family mptcp_genl_family __ro_after_init = {
.module = THIS_MODULE,
.small_ops = mptcp_pm_ops,
.n_small_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1,
.mcgrps = mptcp_pm_mcgrps,
.n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
};
static int __net_init pm_nl_init_net(struct net *net)
{
- struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
@@ -2119,7 +2309,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list)
struct net *net;
list_for_each_entry(net, net_list, exit_list) {
- struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
/* net is removed from namespace list, can't race with
* other modifiers, also netns core already waited for a
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
new file mode 100644
index 000000000000..9e82250cbb70
--- /dev/null
+++ b/net/mptcp/pm_userspace.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, Intel Corporation.
+ */
+
+#include "protocol.h"
+#include "mib.h"
+
+void mptcp_free_local_addr_list(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_addr_entry *entry, *tmp;
+ struct sock *sk = (struct sock *)msk;
+ LIST_HEAD(free_list);
+
+ if (!mptcp_pm_is_userspace(msk))
+ return;
+
+ spin_lock_bh(&msk->pm.lock);
+ list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list);
+ spin_unlock_bh(&msk->pm.lock);
+
+ list_for_each_entry_safe(entry, tmp, &free_list, list) {
+ sock_kfree_s(sk, entry, sizeof(*entry));
+ }
+}
+
+int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+ struct mptcp_pm_addr_entry *match = NULL;
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *e;
+ bool addr_match = false;
+ bool id_match = false;
+ int ret = -EINVAL;
+
+ bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) {
+ addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true);
+ if (addr_match && entry->addr.id == 0)
+ entry->addr.id = e->addr.id;
+ id_match = (e->addr.id == entry->addr.id);
+ if (addr_match && id_match) {
+ match = e;
+ break;
+ } else if (addr_match || id_match) {
+ break;
+ }
+ __set_bit(e->addr.id, id_bitmap);
+ }
+
+ if (!match && !addr_match && !id_match) {
+ /* Memory for the entry is allocated from the
+ * sock option buffer.
+ */
+ e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC);
+ if (!e) {
+ spin_unlock_bh(&msk->pm.lock);
+ return -ENOMEM;
+ }
+
+ *e = *entry;
+ if (!e->addr.id)
+ e->addr.id = find_next_zero_bit(id_bitmap,
+ MPTCP_PM_MAX_ADDR_ID + 1,
+ 1);
+ list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list);
+ ret = e->addr.id;
+ } else if (match) {
+ ret = entry->addr.id;
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
+ unsigned int id,
+ u8 *flags, int *ifindex)
+{
+ struct mptcp_pm_addr_entry *entry, *match = NULL;
+
+ *flags = 0;
+ *ifindex = 0;
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ if (id == entry->addr.id) {
+ match = entry;
+ break;
+ }
+ }
+ spin_unlock_bh(&msk->pm.lock);
+ if (match) {
+ *flags = match->flags;
+ *ifindex = match->ifindex;
+ }
+
+ return 0;
+}
+
+int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
+ struct mptcp_addr_info *skc)
+{
+ struct mptcp_pm_addr_entry new_entry;
+ __be16 msk_sport = ((struct inet_sock *)
+ inet_sk((struct sock *)msk))->inet_sport;
+
+ memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry));
+ new_entry.addr = *skc;
+ new_entry.addr.id = 0;
+ new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
+
+ if (new_entry.addr.port == msk_sport)
+ new_entry.addr.port = 0;
+
+ return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry);
+}
+
+int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_pm_addr_entry addr_val;
+ struct mptcp_sock *msk;
+ int err = -EINVAL;
+ u32 token_val;
+
+ if (!addr || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto announce_err;
+ }
+
+ err = mptcp_pm_parse_entry(addr, info, true, &addr_val);
+ if (err < 0) {
+ GENL_SET_ERR_MSG(info, "error parsing local address");
+ goto announce_err;
+ }
+
+ if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) {
+ GENL_SET_ERR_MSG(info, "invalid addr id or flags");
+ goto announce_err;
+ }
+
+ err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val);
+ if (err < 0) {
+ GENL_SET_ERR_MSG(info, "did not match address and id");
+ goto announce_err;
+ }
+
+ lock_sock((struct sock *)msk);
+ spin_lock_bh(&msk->pm.lock);
+
+ if (mptcp_pm_alloc_anno_list(msk, &addr_val)) {
+ mptcp_pm_announce_addr(msk, &addr_val.addr, false);
+ mptcp_pm_nl_addr_send_ack(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock((struct sock *)msk);
+
+ err = 0;
+ announce_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID];
+ struct mptcp_pm_addr_entry *match = NULL;
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_sock *msk;
+ LIST_HEAD(free_list);
+ int err = -EINVAL;
+ u32 token_val;
+ u8 id_val;
+
+ if (!id || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ id_val = nla_get_u8(id);
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto remove_err;
+ }
+
+ lock_sock((struct sock *)msk);
+
+ list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ if (entry->addr.id == id_val) {
+ match = entry;
+ break;
+ }
+ }
+
+ if (!match) {
+ GENL_SET_ERR_MSG(info, "address with specified id not found");
+ release_sock((struct sock *)msk);
+ goto remove_err;
+ }
+
+ list_move(&match->list, &free_list);
+
+ mptcp_pm_remove_addrs_and_subflows(msk, &free_list);
+
+ release_sock((struct sock *)msk);
+
+ list_for_each_entry_safe(match, entry, &free_list, list) {
+ sock_kfree_s((struct sock *)msk, match, sizeof(*match));
+ }
+
+ err = 0;
+ remove_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_addr_info addr_r;
+ struct mptcp_addr_info addr_l;
+ struct mptcp_sock *msk;
+ int err = -EINVAL;
+ struct sock *sk;
+ u32 token_val;
+
+ if (!laddr || !raddr || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(genl_info_net(info), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto create_err;
+ }
+
+ err = mptcp_pm_parse_addr(laddr, info, &addr_l);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
+ goto create_err;
+ }
+
+ if (addr_l.id == 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id");
+ goto create_err;
+ }
+
+ err = mptcp_pm_parse_addr(raddr, info, &addr_r);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr");
+ goto create_err;
+ }
+
+ sk = &msk->sk.icsk_inet.sk;
+ lock_sock(sk);
+
+ err = __mptcp_subflow_connect(sk, &addr_l, &addr_r);
+
+ release_sock(sk);
+
+ create_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *local,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (local->family != remote->family)
+ return NULL;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ const struct inet_sock *issk;
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (local->family != ssk->sk_family)
+ continue;
+
+ issk = inet_sk(ssk);
+
+ switch (ssk->sk_family) {
+ case AF_INET:
+ if (issk->inet_saddr != local->addr.s_addr ||
+ issk->inet_daddr != remote->addr.s_addr)
+ continue;
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6: {
+ const struct ipv6_pinfo *pinfo = inet6_sk(ssk);
+
+ if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) ||
+ !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr))
+ continue;
+ break;
+ }
+#endif
+ default:
+ continue;
+ }
+
+ if (issk->inet_sport == local->port &&
+ issk->inet_dport == remote->port)
+ return ssk;
+ }
+
+ return NULL;
+}
+
+int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
+ struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct mptcp_addr_info addr_l;
+ struct mptcp_addr_info addr_r;
+ struct mptcp_sock *msk;
+ struct sock *sk, *ssk;
+ int err = -EINVAL;
+ u32 token_val;
+
+ if (!laddr || !raddr || !token) {
+ GENL_SET_ERR_MSG(info, "missing required inputs");
+ return err;
+ }
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(genl_info_net(info), token_val);
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return err;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ goto destroy_err;
+ }
+
+ err = mptcp_pm_parse_addr(laddr, info, &addr_l);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
+ goto destroy_err;
+ }
+
+ err = mptcp_pm_parse_addr(raddr, info, &addr_r);
+ if (err < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr");
+ goto destroy_err;
+ }
+
+ if (addr_l.family != addr_r.family) {
+ GENL_SET_ERR_MSG(info, "address families do not match");
+ goto destroy_err;
+ }
+
+ if (!addr_l.port || !addr_r.port) {
+ GENL_SET_ERR_MSG(info, "missing local or remote port");
+ goto destroy_err;
+ }
+
+ sk = &msk->sk.icsk_inet.sk;
+ lock_sock(sk);
+ ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r);
+ if (ssk) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
+ mptcp_close_ssk(sk, ssk, subflow);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+ err = 0;
+ } else {
+ err = -ESRCH;
+ }
+ release_sock(sk);
+
+destroy_err:
+ sock_put((struct sock *)msk);
+ return err;
+}
+
+int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup)
+{
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+ u32 token_val;
+
+ token_val = nla_get_u32(token);
+
+ msk = mptcp_token_get_sock(net, token_val);
+ if (!msk)
+ return ret;
+
+ if (!mptcp_pm_is_userspace(msk))
+ goto set_flags_err;
+
+ if (loc->addr.family == AF_UNSPEC ||
+ rem->addr.family == AF_UNSPEC)
+ goto set_flags_err;
+
+ lock_sock((struct sock *)msk);
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup);
+ release_sock((struct sock *)msk);
+
+set_flags_err:
+ sock_put((struct sock *)msk);
+ return ret;
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f60f01b14fac..b6dc6e260334 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -117,6 +117,9 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
list_add(&subflow->node, &msk->conn_list);
sock_hold(ssock->sk);
subflow->request_mptcp = 1;
+
+ /* This is the first subflow, always with id 0 */
+ subflow->local_id_valid = 1;
mptcp_sock_graft(msk->first, sk->sk_socket);
return 0;
@@ -147,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
to->len, MPTCP_SKB_CB(from)->end_seq);
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
- kfree_skb_partial(from, fragstolen);
+
+ /* note the fwd memory can reach a negative value after accounting
+ * for the delta, but the later skb free will restore a non
+ * negative one
+ */
atomic_add(delta, &sk->sk_rmem_alloc);
mptcp_rmem_charge(sk, delta);
+ kfree_skb_partial(from, fragstolen);
+
return true;
}
@@ -164,8 +173,8 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
- mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ amount >>= PAGE_SHIFT;
+ mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
@@ -178,8 +187,8 @@ static void mptcp_rmem_uncharge(struct sock *sk, int size)
reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
/* see sk_mem_uncharge() for the rationale behind the following schema */
- if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
- __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK);
+ if (unlikely(reclaimable >= PAGE_SIZE))
+ __mptcp_rmem_reclaim(sk, reclaimable);
}
static void mptcp_rfree(struct sk_buff *skb)
@@ -213,7 +222,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
seq = MPTCP_SKB_CB(skb)->map_seq;
end_seq = MPTCP_SKB_CB(skb)->end_seq;
- max_seq = READ_ONCE(msk->rcv_wnd_sent);
+ max_seq = atomic64_read(&msk->rcv_wnd_sent);
pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
RB_EMPTY_ROOT(&msk->out_of_order_queue));
@@ -222,7 +231,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
mptcp_drop(sk, skb);
pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
(unsigned long long)end_seq - (unsigned long)max_seq,
- (unsigned long long)msk->rcv_wnd_sent);
+ (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
return;
}
@@ -320,20 +329,16 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
struct mptcp_sock *msk = mptcp_sk(sk);
int amt, amount;
- if (size < msk->rmem_fwd_alloc)
+ if (size <= msk->rmem_fwd_alloc)
return true;
+ size -= msk->rmem_fwd_alloc;
amt = sk_mem_pages(size);
- amount = amt << SK_MEM_QUANTUM_SHIFT;
- msk->rmem_fwd_alloc += amount;
- if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
- if (ssk->sk_forward_alloc < amount) {
- msk->rmem_fwd_alloc -= amount;
- return false;
- }
+ amount = amt << PAGE_SHIFT;
+ if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV))
+ return false;
- ssk->sk_forward_alloc -= amount;
- }
+ msk->rmem_fwd_alloc += amount;
return true;
}
@@ -466,9 +471,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
static void mptcp_set_datafin_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 retransmits;
+
+ retransmits = min_t(u32, icsk->icsk_retransmits,
+ ilog2(TCP_RTO_MAX / TCP_RTO_MIN));
- mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX,
- TCP_RTO_MIN << icsk->icsk_retransmits);
+ mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits;
}
static void __mptcp_set_timeout(struct sock *sk, long tout)
@@ -494,19 +502,24 @@ static void mptcp_set_timeout(struct sock *sk)
__mptcp_set_timeout(sk, tout);
}
-static bool tcp_can_send_ack(const struct sock *ssk)
+static inline bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
(TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
}
-void mptcp_subflow_send_ack(struct sock *ssk)
+void __mptcp_subflow_send_ack(struct sock *ssk)
+{
+ if (tcp_can_send_ack(ssk))
+ tcp_send_ack(ssk);
+}
+
+static void mptcp_subflow_send_ack(struct sock *ssk)
{
bool slow;
slow = lock_sock_fast(ssk);
- if (tcp_can_send_ack(ssk))
- tcp_send_ack(ssk);
+ __mptcp_subflow_send_ack(ssk);
unlock_sock_fast(ssk, slow);
}
@@ -649,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
skb = skb_peek(&ssk->sk_receive_queue);
if (!skb) {
- /* if no data is found, a racing workqueue/recvmsg
- * already processed the new data, stop here or we
- * can enter an infinite loop
+ /* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
+ * a different CPU can have already processed the pending
+ * data, stop here or we can enter an infinite loop
*/
if (!moved)
done = true;
@@ -659,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
if (__mptcp_check_fallback(msk)) {
- /* if we are running under the workqueue, TCP could have
- * collapsed skbs between dummy map creation and now
- * be sure to adjust the size
+ /* Under fallback skbs have no MPTCP extension and TCP could
+ * collapse them between the dummy map creation and the
+ * current dequeue. Be sure to adjust the map size.
*/
map_remaining = skb->len;
subflow->map_data_len = skb->len;
@@ -960,25 +973,6 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
df->data_seq + df->data_len == msk->write_seq;
}
-static void __mptcp_mem_reclaim_partial(struct sock *sk)
-{
- int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
-
- lockdep_assert_held_once(&sk->sk_lock.slock);
-
- if (reclaimable > SK_MEM_QUANTUM)
- __mptcp_rmem_reclaim(sk, reclaimable - 1);
-
- sk_mem_reclaim_partial(sk);
-}
-
-static void mptcp_mem_reclaim_partial(struct sock *sk)
-{
- mptcp_data_lock(sk);
- __mptcp_mem_reclaim_partial(sk);
- mptcp_data_unlock(sk);
-}
-
static void dfrag_uncharge(struct sock *sk, int len)
{
sk_mem_uncharge(sk, len);
@@ -998,7 +992,6 @@ static void __mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- bool cleaned = false;
u64 snd_una;
/* on fallback we just need to ignore snd_una, as this is really
@@ -1021,7 +1014,6 @@ static void __mptcp_clean_una(struct sock *sk)
}
dfrag_clear(sk, dfrag);
- cleaned = true;
}
dfrag = mptcp_rtx_head(sk);
@@ -1043,7 +1035,6 @@ static void __mptcp_clean_una(struct sock *sk)
dfrag->already_sent -= delta;
dfrag_uncharge(sk, delta);
- cleaned = true;
}
/* all retransmitted data acked, recovery completed */
@@ -1051,9 +1042,6 @@ static void __mptcp_clean_una(struct sock *sk)
msk->recovery = false;
out:
- if (cleaned && tcp_under_memory_pressure(sk))
- __mptcp_mem_reclaim_partial(sk);
-
if (snd_una == READ_ONCE(msk->snd_nxt) &&
snd_una == READ_ONCE(msk->write_seq)) {
if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
@@ -1135,18 +1123,21 @@ struct mptcp_sendmsg_info {
bool data_lock_held;
};
-static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
- int avail_size)
+static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
+ u64 data_seq, int avail_size)
{
u64 window_end = mptcp_wnd_end(msk);
+ u64 mptcp_snd_wnd;
if (__mptcp_check_fallback(msk))
return avail_size;
- if (!before64(data_seq + avail_size, window_end)) {
- u64 allowed_size = window_end - data_seq;
+ mptcp_snd_wnd = window_end - data_seq;
+ avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size);
- return min_t(unsigned int, allowed_size, avail_size);
+ if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) {
+ tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED);
}
return avail_size;
@@ -1193,6 +1184,7 @@ static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, g
tcp_skb_entail(ssk, skb);
return skb;
}
+ tcp_skb_tsorted_anchor_cleanup(skb);
kfree_skb(skb);
return NULL;
}
@@ -1201,12 +1193,6 @@ static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, boo
{
gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
- if (unlikely(tcp_under_memory_pressure(sk))) {
- if (data_lock_held)
- __mptcp_mem_reclaim_partial(sk);
- else
- mptcp_mem_reclaim_partial(sk);
- }
return __mptcp_alloc_tx_skb(sk, ssk, gfp);
}
@@ -1222,6 +1208,22 @@ static void mptcp_update_data_checksum(struct sk_buff *skb, int added)
mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset));
}
+static void mptcp_update_infinite_map(struct mptcp_sock *msk,
+ struct sock *ssk,
+ struct mptcp_ext *mpext)
+{
+ if (!mpext)
+ return;
+
+ mpext->infinite_map = 1;
+ mpext->data_len = 0;
+
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
+ mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
+ pr_fallback(msk);
+ mptcp_do_fallback(ssk);
+}
+
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_data_frag *dfrag,
struct mptcp_sendmsg_info *info)
@@ -1244,6 +1246,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
info->limit > dfrag->data_len))
return 0;
+ if (unlikely(!__tcp_can_send(ssk)))
+ return -EAGAIN;
+
/* compute send limit */
info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
copy = info->size_goal;
@@ -1264,7 +1269,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
- if (!can_coalesce && i >= sysctl_max_skb_frags) {
+ if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
tcp_mark_push(tcp_sk(ssk), skb);
goto alloc_skb;
}
@@ -1282,7 +1287,7 @@ alloc_skb:
}
/* Zero window and all data acked? Probe. */
- copy = mptcp_check_allowed_size(msk, data_seq, copy);
+ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy);
if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
@@ -1353,6 +1358,9 @@ alloc_skb:
out:
if (READ_ONCE(msk->csum_enabled))
mptcp_update_data_checksum(skb, copy);
+ if (mptcp_subflow_ctx(ssk)->send_infinite_map)
+ mptcp_update_infinite_map(msk, ssk, mpext);
+ trace_mptcp_sendmsg_frag(mpext);
mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
return copy;
}
@@ -1414,7 +1422,8 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
- return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+ return __tcp_can_send(msk->first) &&
+ sk_stream_memory_free(msk->first) ? msk->first : NULL;
}
/* re-use last subflow, if the burst allow that */
@@ -1472,11 +1481,16 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
* to check that subflow has a non empty cwin.
*/
ssk = send_info[SSK_MODE_ACTIVE].ssk;
- if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd)
+ if (!ssk || !sk_stream_memory_free(ssk))
return NULL;
- burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd);
+ burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
+ if (!burst) {
+ msk->last_snd = NULL;
+ return ssk;
+ }
+
subflow = mptcp_subflow_ctx(ssk);
subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
READ_ONCE(ssk->sk_pacing_rate) * burst,
@@ -1530,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
struct mptcp_sendmsg_info info = {
.flags = flags,
};
+ bool do_check_data_fin = false;
struct mptcp_data_frag *dfrag;
- int len, copied = 0;
+ int len;
while ((dfrag = mptcp_send_head(sk))) {
info.sent = dfrag->already_sent;
@@ -1560,12 +1575,14 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0) {
+ if (ret == -EAGAIN)
+ continue;
mptcp_push_release(ssk, &info);
goto out;
}
+ do_check_data_fin = true;
info.sent += ret;
- copied += ret;
len -= ret;
mptcp_update_post_push(msk, dfrag, ret);
@@ -1581,7 +1598,7 @@ out:
/* ensure the rtx timer is running */
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
- if (copied)
+ if (do_check_data_fin)
__mptcp_check_send_data_fin(sk);
}
@@ -1656,10 +1673,42 @@ static void mptcp_set_nospace(struct sock *sk)
set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
}
+static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg,
+ size_t len, int *copied_syn)
+{
+ unsigned int saved_flags = msg->msg_flags;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int ret;
+
+ lock_sock(ssk);
+ msg->msg_flags |= MSG_DONTWAIT;
+ msk->connect_flags = O_NONBLOCK;
+ msk->is_sendmsg = 1;
+ ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
+ msk->is_sendmsg = 0;
+ msg->msg_flags = saved_flags;
+ release_sock(ssk);
+
+ /* do the blocking bits of inet_stream_connect outside the ssk socket lock */
+ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) {
+ ret = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, msg->msg_flags, 1);
+
+ /* Keep the same behaviour of plain TCP: zero the copied bytes in
+ * case of any error, except timeout or signal
+ */
+ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
+ *copied_syn = 0;
+ }
+
+ return ret;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
+ struct socket *ssock;
size_t copied = 0;
int ret = 0;
long timeo;
@@ -1673,14 +1722,30 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) {
+ int copied_syn = 0;
+
+ ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn);
+ copied += copied_syn;
+ if (ret == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (ret)
+ goto do_error;
+ }
+
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
ret = sk_stream_wait_connect(sk, &timeo);
if (ret)
- goto out;
+ goto do_error;
}
+ ret = -EPIPE;
+ if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
+ goto do_error;
+
pfrag = sk_page_frag(sk);
while (msg_data_left(msg)) {
@@ -1689,11 +1754,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
bool dfrag_collapsed;
size_t psize, offset;
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
- ret = -EPIPE;
- goto out;
- }
-
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
*/
@@ -1725,7 +1785,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
ret = -EFAULT;
- goto out;
+ goto do_error;
}
/* data successfully copied into the write queue */
@@ -1757,7 +1817,7 @@ wait_for_memory:
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
- goto out;
+ goto do_error;
}
if (copied)
@@ -1765,7 +1825,14 @@ wait_for_memory:
out:
release_sock(sk);
- return copied ? : ret;
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+
+ copied = sk_stream_error(sk, msg->msg_flags, ret);
+ goto out;
}
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
@@ -1869,7 +1936,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvmem, rcvbuf;
u64 rcvwin, grow;
@@ -1887,7 +1954,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
do_div(rcvwin, advmss);
rcvbuf = min_t(u64, rcvwin * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp;
@@ -2004,7 +2071,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk)
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct scm_timestamping_internal tss;
@@ -2022,7 +2089,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out_err;
}
- timeo = sock_rcvtimeo(sk, nonblock);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
@@ -2268,8 +2335,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
- if (flags & MPTCP_CF_FASTCLOSE)
+ if (flags & MPTCP_CF_FASTCLOSE) {
+ /* be sure to force the tcp_disconnect() path,
+ * to generate the egress reset
+ */
+ ssk->sk_lingertime = 0;
+ sock_set_flag(ssk, SOCK_LINGER);
subflow->send_fastclose = 1;
+ }
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
@@ -2297,6 +2370,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
kfree_rcu(subflow, rcu);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
+ if (ssk->sk_state == TCP_LISTEN) {
+ tcp_set_state(ssk, TCP_CLOSE);
+ mptcp_subflow_queue_clean(ssk);
+ inet_csk_listen_stop(ssk);
+ }
__tcp_close(ssk, 0);
/* close acquired an extra ref */
@@ -2342,7 +2420,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk)
might_sleep();
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (inet_sk_state_load(ssk) != TCP_CLOSE)
@@ -2385,7 +2463,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
mptcp_token_destroy(msk);
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
bool slow;
@@ -2397,12 +2475,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
unlock_sock_fast(tcp_sk, slow);
}
+ /* Mirror the tcp_reset() error propagation */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->sk_err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ sk->sk_err = ECONNRESET;
+ }
+
inet_sk_state_store(sk, TCP_CLOSE);
sk->sk_shutdown = SHUTDOWN_MASK;
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
- mptcp_close_wake_up(sk);
+ /* the calling mptcp_worker will properly destroy the socket */
+ if (sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk->sk_state_change(sk);
+ sk_error_report(sk);
}
static void __mptcp_retrans(struct sock *sk)
@@ -2457,6 +2554,7 @@ static void __mptcp_retrans(struct sock *sk)
dfrag->already_sent = max(dfrag->already_sent, info.sent);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
}
release_sock(ssk);
@@ -2468,10 +2566,60 @@ reset_timer:
mptcp_reset_timer(sk);
}
+/* schedule the timeout timer for the relevant event: either close timeout
+ * or mp_fail timeout. The close timeout takes precedence on the mp_fail one
+ */
+void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned long timeout, close_timeout;
+
+ if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
+ return;
+
+ close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
+
+ /* the close timeout takes precedence on the fail one, and here at least one of
+ * them is active
+ */
+ timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
+
+ sk_reset_timer(sk, &sk->sk_timer, timeout);
+}
+
+static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
+{
+ struct sock *ssk = msk->first;
+ bool slow;
+
+ if (!ssk)
+ return;
+
+ pr_debug("MP_FAIL doesn't respond, reset the subflow");
+
+ slow = lock_sock_fast(ssk);
+ mptcp_subflow_reset(ssk);
+ WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
+ unlock_sock_fast(ssk, slow);
+
+ mptcp_reset_timeout(msk, 0);
+}
+
+static void mptcp_do_fastclose(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
+ subflow, MPTCP_CF_FASTCLOSE);
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *sk = &msk->sk.icsk_inet.sk;
+ unsigned long fail_tout;
int state;
lock_sock(sk);
@@ -2495,11 +2643,15 @@ static void mptcp_worker(struct work_struct *work)
* closed, but we need the msk around to reply to incoming DATA_FIN,
* even if it is orphaned and in FIN_WAIT2 state
*/
- if (sock_flag(sk, SOCK_DEAD) &&
- (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) {
- inet_sk_state_store(sk, TCP_CLOSE);
- __mptcp_destroy_sock(sk);
- goto unlock;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ if (mptcp_check_close_timeout(sk)) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_do_fastclose(sk);
+ }
+ if (sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ goto unlock;
+ }
}
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@@ -2508,6 +2660,10 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
+ fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0;
+ if (fail_tout && time_after(jiffies, fail_tout))
+ mptcp_mp_fail_no_response(msk);
+
unlock:
release_sock(sk);
sock_put(sk);
@@ -2531,6 +2687,7 @@ static int __mptcp_init_sock(struct sock *sk)
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ WRITE_ONCE(msk->allow_infinite_fallback, true);
msk->recovery = false;
mptcp_pm_data_init(msk);
@@ -2579,8 +2736,8 @@ static int mptcp_init_sock(struct sock *sk)
mptcp_ca_reset(sk);
sk_sockets_allocated_inc(sk);
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
+ sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
return 0;
}
@@ -2595,7 +2752,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
dfrag_clear(sk, dfrag);
}
-static void mptcp_cancel_work(struct sock *sk)
+void mptcp_cancel_work(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -2714,30 +2871,16 @@ static void __mptcp_wr_shutdown(struct sock *sk)
static void __mptcp_destroy_sock(struct sock *sk)
{
- struct mptcp_subflow_context *subflow, *tmp;
struct mptcp_sock *msk = mptcp_sk(sk);
- LIST_HEAD(conn_list);
pr_debug("msk=%p", msk);
might_sleep();
- /* join list will be eventually flushed (with rst) at sock lock release time*/
- list_splice_init(&msk->conn_list, &conn_list);
-
- sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
msk->pm.status = 0;
- /* clears msk->subflow, allowing the following loop to close
- * even the initial subflow
- */
- mptcp_dispose_initial_subflow(msk);
- list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- __mptcp_close_ssk(sk, ssk, subflow, 0);
- }
-
sk->sk_prot->destroy(sk);
WARN_ON_ONCE(msk->rmem_fwd_alloc);
@@ -2749,12 +2892,24 @@ static void __mptcp_destroy_sock(struct sock *sk)
sock_put(sk);
}
-static void mptcp_close(struct sock *sk, long timeout)
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+ /* Concurrent splices from sk_receive_queue into receive_queue will
+ * always show at least one non-empty queue when checked in this order.
+ */
+ if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
+ skb_queue_empty_lockless(&msk->receive_queue))
+ return 0;
+
+ return EPOLLIN | EPOLLRDNORM;
+}
+
+bool __mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
- lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
@@ -2762,18 +2917,29 @@ static void mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
- if (mptcp_close_state(sk))
+ if (mptcp_check_readable(msk)) {
+ /* the msk has read data, do the MPTCP equivalent of TCP reset */
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_do_fastclose(sk);
+ } else if (mptcp_close_state(sk)) {
__mptcp_wr_shutdown(sk);
+ }
sk_stream_wait_close(sk, timeout);
cleanup:
/* orphan all the subflows */
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
- mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
+ /* since the close timeout takes precedence on the fail one,
+ * cancel the latter
+ */
+ if (ssk == msk->first)
+ subflow->fail_tout = 0;
+
sock_orphan(ssk);
unlock_sock_fast(ssk, slow);
}
@@ -2782,14 +2948,25 @@ cleanup:
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
if (mptcp_sk(sk)->token)
- mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
- sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
+ mptcp_reset_timeout(msk, 0);
}
+
+ return do_cancel_work;
+}
+
+static void mptcp_close(struct sock *sk, long timeout)
+{
+ bool do_cancel_work;
+
+ lock_sock(sk);
+
+ do_cancel_work = __mptcp_close(sk, timeout);
release_sock(sk);
if (do_cancel_work)
mptcp_cancel_work(sk);
@@ -2797,7 +2974,7 @@ cleanup:
sock_put(sk);
}
-static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
+void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
@@ -2822,24 +2999,20 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
static int mptcp_disconnect(struct sock *sk, int flags)
{
- struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
inet_sk_state_store(sk, TCP_CLOSE);
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE);
- }
-
- sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
if (mptcp_sk(sk)->token)
mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
- mptcp_destroy_common(msk);
+ /* msk->subflow is still intact, the following will not free the first
+ * subflow
+ */
+ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
msk->last_snd = NULL;
WRITE_ONCE(msk->flags, 0);
msk->cb_flags = 0;
@@ -2908,7 +3081,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
ack_seq++;
WRITE_ONCE(msk->ack_seq, ack_seq);
- WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+ atomic64_set(&msk->rcv_wnd_sent, ack_seq);
}
sock_reset_flag(nsk, SOCK_RCU_FREE);
@@ -2989,12 +3162,17 @@ out:
return newsk;
}
-void mptcp_destroy_common(struct mptcp_sock *msk)
+void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
{
+ struct mptcp_subflow_context *subflow, *tmp;
struct sock *sk = (struct sock *)msk;
__mptcp_clear_xmit(sk);
+ /* join list will be eventually flushed (with rst) at sock lock release time */
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
+
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
mptcp_data_lock(sk);
skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
@@ -3009,13 +3187,18 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
msk->rmem_fwd_alloc = 0;
mptcp_token_destroy(msk);
mptcp_pm_free_anno_list(msk);
+ mptcp_free_local_addr_list(msk);
}
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_destroy_common(msk);
+ /* clears msk->subflow, allowing the following to close
+ * even the initial subflow
+ */
+ mptcp_dispose_initial_subflow(msk);
+ mptcp_destroy_common(msk, 0);
sk_sockets_allocated_dec(sk);
}
@@ -3084,15 +3267,19 @@ static void mptcp_release_cb(struct sock *sk)
spin_lock_bh(&sk->sk_lock.slock);
}
- /* be sure to set the current sk state before tacking actions
- * depending on sk_state
- */
- if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
- __mptcp_set_connected(sk);
if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
__mptcp_clean_una_wakeup(sk);
- if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
- __mptcp_error_report(sk);
+ if (unlikely(&msk->cb_flags)) {
+ /* be sure to set the current sk state before tacking actions
+ * depending on sk_state, that is processing MPTCP_ERROR_REPORT
+ */
+ if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
+ __mptcp_set_connected(sk);
+ if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
+ __mptcp_error_report(sk);
+ if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
+ msk->last_snd = NULL;
+ }
__mptcp_update_rmem(sk);
}
@@ -3196,9 +3383,9 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
WRITE_ONCE(msk->ack_seq, ack_seq);
- WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
WRITE_ONCE(msk->snd_una, msk->write_seq);
+ atomic64_set(&msk->rcv_wnd_sent, ack_seq);
mptcp_pm_new_connection(msk, ssk, 0);
@@ -3229,15 +3416,12 @@ bool mptcp_finish_join(struct sock *ssk)
return false;
}
- if (!msk->pm.server_side)
+ if (!list_empty(&subflow->node))
goto out;
if (!mptcp_pm_allow_new_subflow(msk))
goto err_prohibited;
- if (WARN_ON_ONCE(!list_empty(&subflow->node)))
- goto err_prohibited;
-
/* active connections are already on conn_list.
* If we can't acquire msk socket lock here, let the release callback
* handle it
@@ -3263,6 +3447,7 @@ err_prohibited:
}
subflow->map_seq = READ_ONCE(msk->ack_seq);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
out:
mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
@@ -3294,6 +3479,17 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
return 0;
delta = msk->write_seq - v;
+ if (__mptcp_check_fallback(msk) && msk->first) {
+ struct tcp_sock *tp = tcp_sk(msk->first);
+
+ /* the first subflow is disconnected after close - see
+ * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq
+ * so ignore that status, too.
+ */
+ if (!((1 << msk->first->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)))
+ delta += READ_ONCE(tp->write_seq) - tp->snd_una;
+ }
if (delta > INT_MAX)
delta = INT_MAX;
@@ -3333,10 +3529,73 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int __user *)arg);
}
+static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ subflow->request_mptcp = 0;
+ __mptcp_do_fallback(msk);
+}
+
+static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+ int err = -EINVAL;
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock)
+ return -EINVAL;
+
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sk, TCP_SYN_SENT);
+ subflow = mptcp_subflow_ctx(ssock->sk);
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
+ mptcp_subflow_early_fallback(msk, subflow);
+#endif
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
+ MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
+ mptcp_subflow_early_fallback(msk, subflow);
+ }
+ if (likely(!__mptcp_check_fallback(msk)))
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
+
+ /* if reaching here via the fastopen/sendmsg path, the caller already
+ * acquired the subflow socket lock, too.
+ */
+ if (msk->is_sendmsg)
+ err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1);
+ else
+ err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags);
+ inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
+
+ /* on successful connect, the msk state will be moved to established by
+ * subflow_finish_connect()
+ */
+ if (unlikely(err && err != -EINPROGRESS)) {
+ inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+ return err;
+ }
+
+ mptcp_copy_inaddrs(sk, ssock->sk);
+
+ /* unblocking connect, mptcp-level inet_stream_connect will error out
+ * without changing the socket state, update it here.
+ */
+ if (err == -EINPROGRESS)
+ sk->sk_socket->state = ssock->state;
+ return err;
+}
+
static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
.init = mptcp_init_sock,
+ .connect = mptcp_connect,
.disconnect = mptcp_disconnect,
.close = mptcp_close,
.accept = mptcp_accept,
@@ -3353,7 +3612,10 @@ static struct proto mptcp_prot = {
.get_port = mptcp_get_port,
.forward_alloc_get = mptcp_forward_alloc_get,
.sockets_allocated = &mptcp_sockets_allocated,
+
.memory_allocated = &tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
@@ -3385,77 +3647,16 @@ unlock:
return err;
}
-static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- subflow->request_mptcp = 0;
- __mptcp_do_fallback(msk);
-}
-
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
- struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct mptcp_subflow_context *subflow;
- struct socket *ssock;
- int err = -EINVAL;
+ int ret;
lock_sock(sock->sk);
- if (uaddr) {
- if (addr_len < sizeof(uaddr->sa_family))
- goto unlock;
-
- if (uaddr->sa_family == AF_UNSPEC) {
- err = mptcp_disconnect(sock->sk, flags);
- sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
- goto unlock;
- }
- }
-
- if (sock->state != SS_UNCONNECTED && msk->subflow) {
- /* pending connection or invalid state, let existing subflow
- * cope with that
- */
- ssock = msk->subflow;
- goto do_connect;
- }
-
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock)
- goto unlock;
-
- mptcp_token_destroy(msk);
- inet_sk_state_store(sock->sk, TCP_SYN_SENT);
- subflow = mptcp_subflow_ctx(ssock->sk);
-#ifdef CONFIG_TCP_MD5SIG
- /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
- * TCP option space.
- */
- if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
- mptcp_subflow_early_fallback(msk, subflow);
-#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
- MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
- mptcp_subflow_early_fallback(msk, subflow);
- }
- if (likely(!__mptcp_check_fallback(msk)))
- MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE);
-
-do_connect:
- err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
- sock->state = ssock->state;
-
- /* on successful connect, the msk state will be moved to established by
- * subflow_finish_connect()
- */
- if (!err || err == -EINPROGRESS)
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
- else
- inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
-
-unlock:
+ mptcp_sk(sock->sk)->connect_flags = flags;
+ ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
release_sock(sock->sk);
- return err;
+ return ret;
}
static int mptcp_listen(struct socket *sock, int backlog)
@@ -3521,7 +3722,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (mptcp_is_fully_established(newsk))
mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
- mptcp_copy_inaddrs(newsk, msk->first);
mptcp_rcv_space_init(msk, msk->first);
mptcp_propagate_sndbuf(newsk, msk->first);
@@ -3540,18 +3740,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
return err;
}
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
-{
- /* Concurrent splices from sk_receive_queue into receive_queue will
- * always show at least one non-empty queue when checked in this order.
- */
- if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
- skb_queue_empty_lockless(&msk->receive_queue))
- return 0;
-
- return EPOLLIN | EPOLLRDNORM;
-}
-
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
@@ -3593,13 +3781,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
mask |= mptcp_check_writeable(msk);
+ } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ /* cf tcp_poll() note about TFO */
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= EPOLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
- /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
if (sk->sk_err)
mask |= EPOLLERR;
@@ -3684,8 +3875,8 @@ void __init mptcp_proto_init(void)
for_each_possible_cpu(cpu) {
delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
INIT_LIST_HEAD(&delegated->head);
- netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll,
- NAPI_POLL_WEIGHT);
+ netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
+ mptcp_napi_poll);
napi_enable(&delegated->napi);
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 0e6b42c76ea0..6a09ab99a12d 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -11,6 +11,7 @@
#include <net/tcp.h>
#include <net/inet_connection_sock.h>
#include <uapi/linux/mptcp.h>
+#include <net/genetlink.h>
#define MPTCP_SUPPORTED_VERSION 1
@@ -82,7 +83,6 @@
/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP BIT(0)
-#define MPTCPOPT_HMAC_LEN 20
#define MPTCPOPT_THMAC_LEN 8
/* MPTCP MP_CAPABLE flags */
@@ -124,6 +124,7 @@
#define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5
#define MPTCP_CONNECTED 6
+#define MPTCP_RESET_SCHEDULER 7
static inline bool before64(__u64 seq1, __u64 seq2)
{
@@ -182,6 +183,14 @@ enum mptcp_pm_status {
*/
};
+enum mptcp_pm_type {
+ MPTCP_PM_TYPE_KERNEL = 0,
+ MPTCP_PM_TYPE_USERSPACE,
+
+ __MPTCP_PM_TYPE_NR,
+ __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1,
+};
+
/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */
#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)
@@ -198,6 +207,7 @@ struct mptcp_pm_data {
struct mptcp_addr_info local;
struct mptcp_addr_info remote;
struct list_head anno_list;
+ struct list_head userspace_pm_local_addr_list;
spinlock_t lock; /*protects the whole PM data */
@@ -210,6 +220,7 @@ struct mptcp_pm_data {
u8 add_addr_signaled;
u8 add_addr_accepted;
u8 local_addr_used;
+ u8 pm_type;
u8 subflows;
u8 status;
DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
@@ -217,6 +228,14 @@ struct mptcp_pm_data {
struct mptcp_rm_list rm_list_rx;
};
+struct mptcp_pm_addr_entry {
+ struct list_head list;
+ struct mptcp_addr_info addr;
+ u8 flags;
+ int ifindex;
+ struct socket *lsk;
+};
+
struct mptcp_data_frag {
struct list_head list;
u64 data_seq;
@@ -236,7 +255,7 @@ struct mptcp_sock {
u64 write_seq;
u64 snd_nxt;
u64 ack_seq;
- u64 rcv_wnd_sent;
+ atomic64_t rcv_wnd_sent;
u64 rcv_data_fin_seq;
int rmem_fwd_alloc;
struct sock *last_snd;
@@ -262,9 +281,13 @@ struct mptcp_sock {
bool rcv_fastclose;
bool use_64bit_ack; /* Set when we received a 64-bit DSN */
bool csum_enabled;
+ bool allow_infinite_fallback;
+ u8 mpc_endpoint_id;
u8 recvmsg_inq:1,
cork:1,
- nodelay:1;
+ nodelay:1,
+ is_sendmsg:1;
+ int connect_flags;
struct work_struct work;
struct sk_buff *ooo_last_skb;
struct rb_root out_of_order_queue;
@@ -285,6 +308,7 @@ struct mptcp_sock {
u32 setsockopt_seq;
char ca_name[TCP_CA_NAME_MAX];
+ struct mptcp_sock *dl_next;
};
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
@@ -292,6 +316,8 @@ struct mptcp_sock {
#define mptcp_for_each_subflow(__msk, __subflow) \
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \
+ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)
static inline void msk_owned_by_me(const struct mptcp_sock *msk)
{
@@ -408,7 +434,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
- char reset_start[0];
+ struct_group(reset,
unsigned long avg_pacing_rate; /* protected by msk socket lock */
u64 local_key;
@@ -439,10 +465,13 @@ struct mptcp_subflow_context {
send_mp_prio : 1,
send_mp_fail : 1,
send_fastclose : 1,
+ send_infinite_map : 1,
rx_eof : 1,
can_ack : 1, /* only after processing the remote a key */
disposable : 1, /* ctx can be free at ulp release time */
- stale : 1; /* unable to snd/rcv data, do not use for xmit */
+ stale : 1, /* unable to snd/rcv data, do not use for xmit */
+ local_id_valid : 1, /* local_id is correctly initialized */
+ valid_csum_seen : 1; /* at least one csum validated */
enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
@@ -457,8 +486,9 @@ struct mptcp_subflow_context {
u8 stale_count;
long delegated_status;
+ unsigned long fail_tout;
- char reset_end[0];
+ );
struct list_head delegated_node; /* link into delegated_action, protected by local BH */
@@ -468,9 +498,7 @@ struct mptcp_subflow_context {
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
const struct inet_connection_sock_af_ops *icsk_af_ops;
- void (*tcp_data_ready)(struct sock *sk);
void (*tcp_state_change)(struct sock *sk);
- void (*tcp_write_space)(struct sock *sk);
void (*tcp_error_report)(struct sock *sk);
struct rcu_head rcu;
@@ -494,7 +522,7 @@ mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
static inline void
mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
{
- memset(subflow->reset_start, 0, subflow->reset_end - subflow->reset_start);
+ memset(&subflow->reset, 0, sizeof(subflow->reset));
subflow->request_mptcp = 1;
}
@@ -572,6 +600,8 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
+int mptcp_get_pm_type(const struct net *net);
+void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
@@ -582,10 +612,16 @@ void __init mptcp_subflow_init(void);
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow);
-void mptcp_subflow_send_ack(struct sock *ssk);
+void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_subflow_queue_clean(struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
+bool __mptcp_close(struct sock *sk, long timeout);
+void mptcp_cancel_work(struct sock *sk);
+
+bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
+ const struct mptcp_addr_info *b, bool use_port);
/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
@@ -595,16 +631,19 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
-static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+static inline bool __tcp_can_send(const struct sock *ssk)
{
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ /* only send if our side has not closed yet */
+ return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
if (subflow->request_join && !subflow->fully_established)
return false;
- /* only send if our side has not closed yet */
- return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+ return __tcp_can_send(mptcp_subflow_tcp_sock(subflow));
}
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow);
@@ -614,27 +653,14 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
struct mptcp_subflow_context *ctx)
{
- sk->sk_data_ready = ctx->tcp_data_ready;
+ sk->sk_data_ready = sock_def_readable;
sk->sk_state_change = ctx->tcp_state_change;
- sk->sk_write_space = ctx->tcp_write_space;
+ sk->sk_write_space = sk_stream_write_space;
sk->sk_error_report = ctx->tcp_error_report;
inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
}
-static inline bool mptcp_has_another_subflow(struct sock *ssk)
-{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk), *tmp;
- struct mptcp_sock *msk = mptcp_sk(subflow->conn);
-
- mptcp_for_each_subflow(msk, tmp) {
- if (tmp != subflow)
- return true;
- }
-
- return false;
-}
-
void __init mptcp_proto_init(void);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int __init mptcp_proto_v6_init(void);
@@ -643,12 +669,12 @@ int __init mptcp_proto_v6_init(void);
struct sock *mptcp_sk_clone(const struct sock *sk,
const struct mptcp_options_received *mp_opt,
struct request_sock *req);
-void mptcp_get_options(const struct sock *sk,
- const struct sk_buff *skb,
+void mptcp_get_options(const struct sk_buff *skb,
struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
void __mptcp_set_connected(struct sock *sk);
+void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
static inline bool mptcp_is_fully_established(struct sock *sk)
{
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
@@ -701,7 +727,7 @@ static inline void mptcp_write_space(struct sock *sk)
}
}
-void mptcp_destroy_common(struct mptcp_sock *msk);
+void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags);
#define MPTCP_TOKEN_MAX_RETRIES 4
@@ -725,11 +751,16 @@ void mptcp_token_destroy(struct mptcp_sock *msk);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
-u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum);
+__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum);
void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
void mptcp_pm_data_reset(struct mptcp_sock *msk);
+int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
+ struct mptcp_addr_info *addr);
+int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
+ bool require_family,
+ struct mptcp_pm_addr_entry *entry);
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
@@ -740,37 +771,60 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk);
bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk);
void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
const struct mptcp_subflow_context *subflow);
-void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+void mptcp_pm_add_addr_received(const struct sock *ssk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr);
+ const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq);
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ struct mptcp_addr_info *rem,
+ u8 bkup);
+bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
+ const struct mptcp_pm_addr_entry *entry);
void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr, bool check_id);
+ const struct mptcp_addr_info *addr, bool check_id);
struct mptcp_pm_add_entry *
-mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk,
- struct mptcp_addr_info *addr);
-int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
+mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
+ unsigned int id,
u8 *flags, int *ifindex);
-
+int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
+ unsigned int id,
+ u8 *flags, int *ifindex);
+int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token,
+ struct mptcp_pm_addr_entry *loc,
+ struct mptcp_pm_addr_entry *rem, u8 bkup);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool echo);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
+void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
+ struct list_head *rm_list);
+
+int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry);
+void mptcp_free_local_addr_list(struct mptcp_sock *msk);
+int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info);
+int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info);
+int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info);
+int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info);
void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
const struct sock *ssk, gfp_t gfp);
-void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info);
+void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info);
void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id);
+bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
@@ -793,6 +847,16 @@ static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL);
}
+static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE;
+}
+
+static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL;
+}
+
static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
{
u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
@@ -816,24 +880,38 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1;
}
-bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb,
+bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
unsigned int opt_size, unsigned int remaining,
struct mptcp_addr_info *addr, bool *echo,
- bool *port, bool *drop_other_suboptions);
+ bool *drop_other_suboptions);
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
+int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
void __init mptcp_pm_nl_init(void);
-void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
void mptcp_pm_nl_work(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
const struct mptcp_rm_list *rm_list);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
-unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);
+
+/* called under PM lock */
+static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
+{
+ if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk))
+ WRITE_ONCE(msk->pm.accept_subflow, true);
+}
+
+static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
+{
+ spin_lock_bh(&msk->pm.lock);
+ __mptcp_pm_close_subflow(msk);
+ spin_unlock_bh(&msk->pm.lock);
+}
void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
@@ -867,23 +945,51 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
-static inline void mptcp_do_fallback(struct sock *sk)
+static inline void mptcp_do_fallback(struct sock *ssk)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+ struct mptcp_sock *msk;
+ msk = mptcp_sk(sk);
__mptcp_do_fallback(msk);
+ if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
+ gfp_t saved_allocation = ssk->sk_allocation;
+
+ /* we are in a atomic (BH) scope, override ssk default for data
+ * fin allocation
+ */
+ ssk->sk_allocation = GFP_ATOMIC;
+ ssk->sk_shutdown |= SEND_SHUTDOWN;
+ tcp_shutdown(ssk, SEND_SHUTDOWN);
+ ssk->sk_allocation = saved_allocation;
+ }
}
#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a)
+static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
+{
+ struct mptcp_ext *mpext;
+
+ mpext = skb ? mptcp_get_ext(skb) : NULL;
+ if (mpext && mpext->infinite_map)
+ return true;
+
+ return false;
+}
+
+static inline bool is_active_ssk(struct mptcp_subflow_context *subflow)
+{
+ return (subflow->request_mptcp || subflow->request_join);
+}
+
static inline bool subflow_simultaneous_connect(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sock *parent = subflow->conn;
return sk->sk_state == TCP_ESTABLISHED &&
- !mptcp_sk(parent)->pm.server_side &&
+ is_active_ssk(subflow) &&
!subflow->conn_finished;
}
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index dacf3cee0027..c7cb68c725b2 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -343,6 +343,8 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
case SO_RCVLOWAT:
case SO_RCVTIMEO_OLD:
case SO_RCVTIMEO_NEW:
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
case SO_BUSY_POLL:
case SO_PREFER_BUSY_POLL:
case SO_BUSY_POLL_BUDGET:
@@ -557,6 +559,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
case TCP_NOTSENT_LOWAT:
case TCP_TX_DELAY:
case TCP_INQ:
+ case TCP_FASTOPEN_CONNECT:
return true;
}
@@ -565,7 +568,7 @@ static bool mptcp_supported_sockopt(int level, int optname)
/* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
* TCP_REPAIR_WINDOW are not supported, better avoid this mess
*/
- /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE,
+ /* TCP_FASTOPEN_KEY, TCP_FASTOPEN, TCP_FASTOPEN_NO_COOKIE,
* are not supported fastopen is currently unsupported
*/
}
@@ -754,6 +757,31 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
return -EOPNOTSUPP;
}
+static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct socket *listener;
+
+ listener = __mptcp_nmpc_socket(msk);
+ if (!listener)
+ return 0; /* TCP_DEFER_ACCEPT does not fail */
+
+ return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen);
+}
+
+static int mptcp_setsockopt_sol_tcp_fastopen_connect(struct mptcp_sock *msk, sockptr_t optval,
+ unsigned int optlen)
+{
+ struct socket *sock;
+
+ /* Limit to first subflow */
+ sock = __mptcp_nmpc_socket(msk);
+ if (!sock)
+ return -EINVAL;
+
+ return tcp_setsockopt(sock->sk, SOL_TCP, TCP_FASTOPEN_CONNECT, optval, optlen);
+}
+
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -780,6 +808,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
case TCP_NODELAY:
return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
+ case TCP_DEFER_ACCEPT:
+ return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen);
+ case TCP_FASTOPEN_CONNECT:
+ return mptcp_setsockopt_sol_tcp_fastopen_connect(msk, optval, optlen);
}
return -EOPNOTSUPP;
@@ -851,15 +883,11 @@ out:
void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
{
- struct sock *sk = &msk->sk.icsk_inet.sk;
u32 flags = 0;
- bool slow;
u8 val;
memset(info, 0, sizeof(*info));
- slow = lock_sock_fast(sk);
-
info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
@@ -880,8 +908,6 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
-
- unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
@@ -1146,6 +1172,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
case TCP_CONGESTION:
case TCP_INFO:
case TCP_CC_INFO:
+ case TCP_DEFER_ACCEPT:
+ case TCP_FASTOPEN_CONNECT:
return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
optval, optlen);
case TCP_INQ:
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index bea47a1180dc..02a54d59697b 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -62,7 +62,9 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
return mptcp_is_fully_established((void *)msk) &&
- READ_ONCE(msk->pm.accept_subflow);
+ ((mptcp_pm_is_userspace(msk) &&
+ mptcp_userspace_pm_active(msk)) ||
+ READ_ONCE(msk->pm.accept_subflow));
}
/* validate received token and create truncated hmac and nonce for SYN-ACK */
@@ -153,7 +155,7 @@ static int subflow_check_req(struct request_sock *req,
return -EINVAL;
#endif
- mptcp_get_options(sk_listener, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
@@ -250,7 +252,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
int err;
subflow_init_req(req, sk_listener);
- mptcp_get_options(sk_listener, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
@@ -344,9 +346,7 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
thmac = get_unaligned_be64(hmac);
pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
- subflow, subflow->token,
- (unsigned long long)thmac,
- (unsigned long long)subflow->thmac);
+ subflow, subflow->token, thmac, subflow->thmac);
return thmac == subflow->thmac;
}
@@ -410,7 +410,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
if (subflow->request_mptcp) {
if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
MPTCP_INC_STATS(sock_net(sk),
@@ -443,6 +443,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->backup = mp_opt.backup;
subflow->thmac = mp_opt.thmac;
subflow->remote_nonce = mp_opt.nonce;
+ subflow->remote_id = mp_opt.join_id;
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
subflow, subflow->thmac, subflow->remote_nonce,
subflow->backup);
@@ -483,9 +484,53 @@ do_reset:
mptcp_subflow_reset(sk);
}
+static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
+{
+ subflow->local_id = local_id;
+ subflow->local_id_valid = 1;
+}
+
+static int subflow_chk_local_id(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ int err;
+
+ if (likely(subflow->local_id_valid))
+ return 0;
+
+ err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
+ if (err < 0)
+ return err;
+
+ subflow_set_local_id(subflow, err);
+ return 0;
+}
+
+static int subflow_rebuild_header(struct sock *sk)
+{
+ int err = subflow_chk_local_id(sk);
+
+ if (unlikely(err < 0))
+ return err;
+
+ return inet_sk_rebuild_header(sk);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static int subflow_v6_rebuild_header(struct sock *sk)
+{
+ int err = subflow_chk_local_id(sk);
+
+ if (unlikely(err < 0))
+ return err;
+
+ return inet6_sk_rebuild_header(sk);
+}
+#endif
+
struct request_sock_ops mptcp_subflow_request_sock_ops;
-EXPORT_SYMBOL_GPL(mptcp_subflow_request_sock_ops);
-static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
+static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
@@ -506,9 +551,9 @@ drop:
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
-static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
-static struct inet_connection_sock_af_ops subflow_v6_specific;
-static struct inet_connection_sock_af_ops subflow_v6m_specific;
+static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
+static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
+static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
static struct proto tcpv6_prot_override;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -557,29 +602,6 @@ static bool subflow_hmac_valid(const struct request_sock *req,
return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}
-static void mptcp_sock_destruct(struct sock *sk)
-{
- /* if new mptcp socket isn't accepted, it is free'd
- * from the tcp listener sockets request queue, linked
- * from req->sk. The tcp socket is released.
- * This calls the ULP release function which will
- * also remove the mptcp socket, via
- * sock_put(ctx->conn).
- *
- * Problem is that the mptcp socket will be in
- * ESTABLISHED state and will not have the SOCK_DEAD flag.
- * Both result in warnings from inet_sock_destruct.
- */
- if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
- sk->sk_state = TCP_CLOSE;
- WARN_ON_ONCE(sk->sk_socket);
- sock_orphan(sk);
- }
-
- mptcp_destroy_common(mptcp_sk(sk));
- inet_sock_destruct(sk);
-}
-
static void mptcp_force_close(struct sock *sk)
{
/* the msk is not yet exposed to user-space */
@@ -663,7 +685,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
* reordered MPC will cause fallback, but we don't have other
* options.
*/
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
fallback = true;
goto create_child;
@@ -673,7 +695,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
if (!new_msk)
fallback = true;
} else if (subflow_req->mp_join) {
- mptcp_get_options(sk, skb, &mp_opt);
+ mptcp_get_options(skb, &mp_opt);
if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) ||
!subflow_hmac_valid(req, &mp_opt) ||
!mptcp_can_accept_new_subflow(subflow_req->msk)) {
@@ -701,6 +723,8 @@ create_child:
goto dispose_child;
}
+ if (new_msk)
+ mptcp_copy_inaddrs(new_msk, child);
subflow_drop_ctx(child);
goto out;
}
@@ -722,13 +746,17 @@ create_child:
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
- new_msk->sk_destruct = mptcp_sock_destruct;
mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq;
mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1);
mptcp_token_accept(subflow_req, mptcp_sk(new_msk));
ctx->conn = new_msk;
new_msk = NULL;
+ /* set msk addresses early to ensure mptcp_pm_get_local_id()
+ * uses the correct data
+ */
+ mptcp_copy_inaddrs(ctx->conn, child);
+
/* with OoO packets we can reach here without ingress
* mpc option
*/
@@ -790,7 +818,7 @@ dispose_child:
return child;
}
-static struct inet_connection_sock_af_ops subflow_specific;
+static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
static struct proto tcp_prot_override;
enum mapping_status {
@@ -798,7 +826,8 @@ enum mapping_status {
MAPPING_INVALID,
MAPPING_EMPTY,
MAPPING_DATA_FIN,
- MAPPING_DUMMY
+ MAPPING_DUMMY,
+ MAPPING_BAD_CSUM
};
static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
@@ -846,7 +875,7 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
u32 offset, seq, delta;
- u16 csum;
+ __sum16 csum;
int len;
if (!csum_reqd)
@@ -913,11 +942,10 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *
subflow->map_data_csum);
if (unlikely(csum)) {
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
- subflow->send_mp_fail = 1;
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX);
- return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY;
+ return MAPPING_BAD_CSUM;
}
+ subflow->valid_csum_seen = 1;
return MAPPING_OK;
}
@@ -964,7 +992,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
data_len = mpext->data_len;
if (data_len == 0) {
+ pr_debug("infinite mapping received");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
+ subflow->map_data_len = 0;
return MAPPING_INVALID;
}
@@ -1099,6 +1129,45 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss
}
}
+static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
+{
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+
+ if (subflow->mp_join)
+ return false;
+ else if (READ_ONCE(msk->csum_enabled))
+ return !subflow->valid_csum_seen;
+ else
+ return !subflow->fully_established;
+}
+
+static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned long fail_tout;
+
+ /* greceful failure can happen only on the MPC subflow */
+ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
+ return;
+
+ /* since the close timeout take precedence on the fail one,
+ * no need to start the latter when the first is already set
+ */
+ if (sock_flag((struct sock *)msk, SOCK_DEAD))
+ return;
+
+ /* we don't need extreme accuracy here, use a zero fail_tout as special
+ * value meaning no fail timeout at all;
+ */
+ fail_tout = jiffies + TCP_RTO_MAX;
+ if (!fail_tout)
+ fail_tout = 1;
+ WRITE_ONCE(subflow->fail_tout, fail_tout);
+ tcp_send_ack(ssk);
+
+ mptcp_reset_timeout(msk, subflow->fail_tout);
+}
+
static bool subflow_check_data_avail(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -1107,7 +1176,7 @@ static bool subflow_check_data_avail(struct sock *ssk)
struct sk_buff *skb;
if (!skb_peek(&ssk->sk_receive_queue))
- WRITE_ONCE(subflow->data_avail, 0);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
if (subflow->data_avail)
return true;
@@ -1118,10 +1187,8 @@ static bool subflow_check_data_avail(struct sock *ssk)
status = get_mapping_status(ssk, msk);
trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
- if (unlikely(status == MAPPING_INVALID))
- goto fallback;
-
- if (unlikely(status == MAPPING_DUMMY))
+ if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
+ status == MAPPING_BAD_CSUM))
goto fallback;
if (status != MAPPING_OK)
@@ -1161,35 +1228,42 @@ no_data:
return false;
fallback:
- /* RFC 8684 section 3.7. */
- if (subflow->send_mp_fail) {
- if (mptcp_has_another_subflow(ssk)) {
+ if (!__mptcp_check_fallback(msk)) {
+ /* RFC 8684 section 3.7. */
+ if (status == MAPPING_BAD_CSUM &&
+ (subflow->mp_join || subflow->valid_csum_seen)) {
+ subflow->send_mp_fail = 1;
+
+ if (!READ_ONCE(msk->allow_infinite_fallback)) {
+ subflow->reset_transient = 0;
+ subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
+ goto reset;
+ }
+ mptcp_subflow_fail(msk, ssk);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
+ return true;
+ }
+
+ if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
+ /* fatal protocol error, close the socket.
+ * subflow_error_report() will introduce the appropriate barriers
+ */
+ subflow->reset_transient = 0;
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
+
+reset:
+ ssk->sk_err = EBADMSG;
+ tcp_set_state(ssk, TCP_CLOSE);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
+ return false;
}
- ssk->sk_err = EBADMSG;
- tcp_set_state(ssk, TCP_CLOSE);
- subflow->reset_transient = 0;
- subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
- tcp_send_active_reset(ssk, GFP_ATOMIC);
- WRITE_ONCE(subflow->data_avail, 0);
- return true;
- }
- if (subflow->mp_join || subflow->fully_established) {
- /* fatal protocol error, close the socket.
- * subflow_error_report() will introduce the appropriate barriers
- */
- ssk->sk_err = EBADMSG;
- tcp_set_state(ssk, TCP_CLOSE);
- subflow->reset_transient = 0;
- subflow->reset_reason = MPTCP_RST_EMPTCP;
- tcp_send_active_reset(ssk, GFP_ATOMIC);
- WRITE_ONCE(subflow->data_avail, 0);
- return false;
+ mptcp_do_fallback(ssk);
}
- __mptcp_do_fallback(msk);
skb = skb_peek(&ssk->sk_receive_queue);
subflow->map_valid = 1;
subflow->map_seq = READ_ONCE(msk->ack_seq);
@@ -1207,7 +1281,7 @@ bool mptcp_subflow_data_available(struct sock *sk)
if (subflow->map_valid &&
mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
subflow->map_valid = 0;
- WRITE_ONCE(subflow->data_avail, 0);
+ WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
pr_debug("Done with mapping: seq=%u data_len=%u",
subflow->map_subflow_seq,
@@ -1311,7 +1385,7 @@ static void subflow_write_space(struct sock *ssk)
mptcp_write_space(sk);
}
-static struct inet_connection_sock_af_ops *
+static const struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -1326,7 +1400,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_connection_sock_af_ops *target;
+ const struct inet_connection_sock_af_ops *target;
target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
@@ -1380,20 +1454,20 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
struct sockaddr_storage addr;
int remote_id = remote->id;
int local_id = loc->id;
+ int err = -ENOTCONN;
struct socket *sf;
struct sock *ssk;
u32 remote_token;
int addrlen;
int ifindex;
u8 flags;
- int err;
if (!mptcp_is_fully_established(sk))
- return -ENOTCONN;
+ goto err_out;
err = mptcp_subflow_create_socket(sk, &sf);
if (err)
- return err;
+ goto err_out;
ssk = sf->sk;
subflow = mptcp_subflow_ctx(ssk);
@@ -1401,15 +1475,10 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
get_random_bytes(&subflow->local_nonce, sizeof(u32));
} while (!subflow->local_nonce);
- if (!local_id) {
- err = mptcp_pm_get_local_id(msk, (struct sock_common *)ssk);
- if (err < 0)
- goto failed;
+ if (local_id)
+ subflow_set_local_id(subflow, local_id);
- local_id = err;
- }
-
- mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id,
+ mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
&flags, &ifindex);
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
@@ -1432,7 +1501,6 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
remote_token, local_id, remote_id);
subflow->remote_token = remote_token;
- subflow->local_id = local_id;
subflow->remote_id = remote_id;
subflow->request_join = 1;
subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
@@ -1447,7 +1515,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
/* discard the subflow socket */
mptcp_sock_graft(ssk, sk->sk_socket);
iput(SOCK_INODE(sf));
- return err;
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
+ return 0;
failed_unlink:
list_del(&subflow->node);
@@ -1456,6 +1525,12 @@ failed_unlink:
failed:
subflow->disposable = 1;
sock_release(sf);
+
+err_out:
+ /* we account subflows before the creation, and this failures will not
+ * be caught by sk_state_change()
+ */
+ mptcp_pm_close_subflow(msk);
return err;
}
@@ -1542,7 +1617,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
/* the newly created socket really belongs to the owning MPTCP master
* socket, even if for additional subflows the allocation is performed
* by a kernel workqueue. Adjust inode references, so that the
- * procfs/diag interaces really show this one belonging to the correct
+ * procfs/diag interfaces really show this one belonging to the correct
* user.
*/
SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
@@ -1631,6 +1706,64 @@ static void subflow_state_change(struct sock *sk)
}
}
+void mptcp_subflow_queue_clean(struct sock *listener_ssk)
+{
+ struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
+ struct mptcp_sock *msk, *next, *head = NULL;
+ struct request_sock *req;
+
+ /* build a list of all unaccepted mptcp sockets */
+ spin_lock_bh(&queue->rskq_lock);
+ for (req = queue->rskq_accept_head; req; req = req->dl_next) {
+ struct mptcp_subflow_context *subflow;
+ struct sock *ssk = req->sk;
+ struct mptcp_sock *msk;
+
+ if (!sk_is_mptcp(ssk))
+ continue;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ if (!subflow || !subflow->conn)
+ continue;
+
+ /* skip if already in list */
+ msk = mptcp_sk(subflow->conn);
+ if (msk->dl_next || msk == head)
+ continue;
+
+ msk->dl_next = head;
+ head = msk;
+ }
+ spin_unlock_bh(&queue->rskq_lock);
+ if (!head)
+ return;
+
+ /* can't acquire the msk socket lock under the subflow one,
+ * or will cause ABBA deadlock
+ */
+ release_sock(listener_ssk);
+
+ for (msk = head; msk; msk = next) {
+ struct sock *sk = (struct sock *)msk;
+ bool slow, do_cancel_work;
+
+ sock_hold(sk);
+ slow = lock_sock_fast_nested(sk);
+ next = msk->dl_next;
+ msk->first = NULL;
+ msk->dl_next = NULL;
+
+ do_cancel_work = __mptcp_close(sk, 0);
+ unlock_sock_fast(sk, slow);
+ if (do_cancel_work)
+ mptcp_cancel_work(sk);
+ sock_put(sk);
+ }
+
+ /* we are still under the listener msk socket lock */
+ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);
+}
+
static int subflow_ulp_init(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1657,10 +1790,12 @@ static int subflow_ulp_init(struct sock *sk)
tp->is_mptcp = 1;
ctx->icsk_af_ops = icsk->icsk_af_ops;
icsk->icsk_af_ops = subflow_default_af_ops(sk);
- ctx->tcp_data_ready = sk->sk_data_ready;
ctx->tcp_state_change = sk->sk_state_change;
- ctx->tcp_write_space = sk->sk_write_space;
ctx->tcp_error_report = sk->sk_error_report;
+
+ WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
+ WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);
+
sk->sk_data_ready = subflow_data_ready;
sk->sk_write_space = subflow_write_space;
sk->sk_state_change = subflow_state_change;
@@ -1715,9 +1850,7 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->conn_finished = 1;
new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
- new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
new_ctx->tcp_state_change = old_ctx->tcp_state_change;
- new_ctx->tcp_write_space = old_ctx->tcp_write_space;
new_ctx->tcp_error_report = old_ctx->tcp_error_report;
new_ctx->rel_write_seq = 1;
new_ctx->tcp_sock = newsk;
@@ -1731,15 +1864,22 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->token = subflow_req->token;
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->idsn = subflow_req->idsn;
+
+ /* this is the first subflow, id is always 0 */
+ new_ctx->local_id_valid = 1;
} else if (subflow_req->mp_join) {
new_ctx->ssn_offset = subflow_req->ssn_offset;
new_ctx->mp_join = 1;
new_ctx->fully_established = 1;
new_ctx->backup = subflow_req->backup;
- new_ctx->local_id = subflow_req->local_id;
new_ctx->remote_id = subflow_req->remote_id;
new_ctx->token = subflow_req->token;
new_ctx->thmac = subflow_req->thmac;
+
+ /* the subflow req id is valid, fetched via subflow_check_req()
+ * and subflow_token_join_request()
+ */
+ subflow_set_local_id(new_ctx, subflow_req->local_id);
}
}
@@ -1792,6 +1932,7 @@ void __init mptcp_subflow_init(void)
subflow_specific.conn_request = subflow_v4_conn_request;
subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_specific.rebuild_header = subflow_rebuild_header;
tcp_prot_override = tcp_prot;
tcp_prot_override.release_cb = tcp_release_cb_override;
@@ -1804,6 +1945,7 @@ void __init mptcp_subflow_init(void)
subflow_v6_specific.conn_request = subflow_v6_conn_request;
subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;
subflow_v6m_specific = subflow_v6_specific;
subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
@@ -1811,6 +1953,7 @@ void __init mptcp_subflow_init(void)
subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
subflow_v6m_specific.net_frag_header_len = 0;
+ subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
tcpv6_prot_override = tcpv6_prot;
tcpv6_prot_override.release_cb = tcp_release_cb_override;