Diffstat (limited to 'net/mptcp')
-rw-r--r-- net/mptcp/crypto.c     |  28
-rw-r--r-- net/mptcp/options.c    | 138
-rw-r--r-- net/mptcp/pm_netlink.c |  12
-rw-r--r-- net/mptcp/protocol.c   | 288
-rw-r--r-- net/mptcp/protocol.h   |  51
-rw-r--r-- net/mptcp/subflow.c    | 244
6 files changed, 519 insertions(+), 242 deletions(-)
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index c151628bd416..3d980713a9e2 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -47,8 +47,6 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
- __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
- __be32 *hash_out = (__force __be32 *)hmac;
struct sha256_state state;
u8 key1be[8];
u8 key2be[8];
@@ -61,7 +59,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
put_unaligned_be64(key2, key2be);
/* Generate key xored with ipad */
- memset(input, 0x36, SHA_MESSAGE_BYTES);
+ memset(input, 0x36, SHA256_BLOCK_SIZE);
for (i = 0; i < 8; i++)
input[i] ^= key1be[i];
for (i = 0; i < 8; i++)
@@ -78,7 +76,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
sha256_final(&state, &input[SHA256_BLOCK_SIZE]);
/* Prepare second part of hmac */
- memset(input, 0x5C, SHA_MESSAGE_BYTES);
+ memset(input, 0x5C, SHA256_BLOCK_SIZE);
for (i = 0; i < 8; i++)
input[i] ^= key1be[i];
for (i = 0; i < 8; i++)
@@ -86,11 +84,7 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
sha256_init(&state);
sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE);
- sha256_final(&state, (u8 *)mptcp_hashed_key);
-
- /* takes only first 160 bits */
- for (i = 0; i < 5; i++)
- hash_out[i] = mptcp_hashed_key[i];
+ sha256_final(&state, (u8 *)hmac);
}
#ifdef CONFIG_MPTCP_HMAC_TEST
@@ -101,29 +95,29 @@ struct test_cast {
};
/* we can't reuse RFC 4231 test vectors, as we have constraints on the
- * input and key size, and we truncate the output.
+ * input and key size.
*/
static struct test_cast tests[] = {
{
.key = "0b0b0b0b0b0b0b0b",
.msg = "48692054",
- .result = "8385e24fb4235ac37556b6b886db106284a1da67",
+ .result = "8385e24fb4235ac37556b6b886db106284a1da671699f46db1f235ec622dcafa",
},
{
.key = "aaaaaaaaaaaaaaaa",
.msg = "dddddddd",
- .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492",
+ .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492984e1eb71aff9022f71046e9",
},
{
.key = "0102030405060708",
.msg = "cdcdcdcd",
- .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6",
+ .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6f23b4d8c4da736a5dbbc6e7d",
},
};
static int __init test_mptcp_crypto(void)
{
- char hmac[20], hmac_hex[41];
+ char hmac[32], hmac_hex[65];
u32 nonce1, nonce2;
u64 key1, key2;
u8 msg[8];
@@ -140,11 +134,11 @@ static int __init test_mptcp_crypto(void)
put_unaligned_be32(nonce2, &msg[4]);
mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
- for (j = 0; j < 20; ++j)
+ for (j = 0; j < 32; ++j)
sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
- hmac_hex[40] = 0;
+ hmac_hex[64] = 0;
- if (memcmp(hmac_hex, tests[i].result, 40))
+ if (memcmp(hmac_hex, tests[i].result, 64))
pr_err("test %d failed, got %s expected %s", i,
hmac_hex, tests[i].result);
else
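
For reference, mptcp_crypto_hmac_sha() is a plain HMAC-SHA256 whose key is key1 || key2, each serialized big-endian, and it now hands the full 32-byte digest back to the caller instead of truncating to 160 bits. A minimal userspace cross-check of that construction, assuming OpenSSL is available (not part of this patch; the values below are arbitrary, not one of the self-test vectors):

	#include <stdio.h>
	#include <stdint.h>
	#include <openssl/evp.h>
	#include <openssl/hmac.h>

	/* hypothetical helper mirroring the kernel's key layout */
	static void mptcp_hmac_check(uint64_t key1, uint64_t key2,
				     const unsigned char *msg, size_t len)
	{
		unsigned char key[16], out[32];
		unsigned int outlen = sizeof(out);
		int i;

		for (i = 0; i < 8; i++) {
			key[i] = key1 >> (56 - 8 * i);		/* key1, big-endian */
			key[8 + i] = key2 >> (56 - 8 * i);	/* key2, big-endian */
		}
		HMAC(EVP_sha256(), key, sizeof(key), msg, len, out, &outlen);
		for (i = 0; i < 32; i++)
			printf("%02x", out[i]);	/* 64 hex chars, as in the self-test */
		printf("\n");
	}

	int main(void)
	{
		unsigned char msg[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

		mptcp_hmac_check(0x0b0b0b0b0b0b0b0bULL, 0x0b0b0b0b0b0b0b0bULL,
				 msg, sizeof(msg));
		return 0;
	}
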
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index faf57585b892..01f1f4cf4902 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -7,6 +7,7 @@
#define pr_fmt(fmt) "MPTCP: " fmt
#include <linux/kernel.h>
+#include <crypto/sha.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
@@ -16,10 +17,10 @@ static bool mptcp_cap_flag_sha256(u8 flags)
return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
}
-void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
- int opsize, struct tcp_options_received *opt_rx)
+static void mptcp_parse_option(const struct sk_buff *skb,
+ const unsigned char *ptr, int opsize,
+ struct mptcp_options_received *mp_opt)
{
- struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
u8 subtype = *ptr >> 4;
int expected_opsize;
u8 version;
@@ -283,12 +284,20 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
}
void mptcp_get_options(const struct sk_buff *skb,
- struct tcp_options_received *opt_rx)
+ struct mptcp_options_received *mp_opt)
{
- const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
- int length = (th->doff * 4) - sizeof(struct tcphdr);
+ const unsigned char *ptr;
+ int length;
+
+ /* initialize option status */
+ mp_opt->mp_capable = 0;
+ mp_opt->mp_join = 0;
+ mp_opt->add_addr = 0;
+ mp_opt->rm_addr = 0;
+ mp_opt->dss = 0;
+ length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
while (length > 0) {
@@ -308,7 +317,7 @@ void mptcp_get_options(const struct sk_buff *skb,
if (opsize > length)
return; /* don't parse partial options */
if (opcode == TCPOPT_MPTCP)
- mptcp_parse_option(skb, ptr, opsize, opt_rx);
+ mptcp_parse_option(skb, ptr, opsize, mp_opt);
ptr += opsize - 2;
length -= opsize;
}
@@ -344,28 +353,6 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
return false;
}
-void mptcp_rcv_synsent(struct sock *sk)
-{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
- subflow->mp_capable = 1;
- subflow->can_ack = 1;
- subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
- pr_debug("subflow=%p, remote_key=%llu", subflow,
- subflow->remote_key);
- } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
- subflow->mp_join = 1;
- subflow->thmac = tp->rx_opt.mptcp.thmac;
- subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
- pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
- subflow->thmac, subflow->remote_nonce);
- } else if (subflow->request_mptcp) {
- tcp_sk(sk)->is_mptcp = 0;
- }
-}
-
/* MP_JOIN client subflow must wait for 4th ack before sending any data:
* TCP can't schedule delack timer before the subflow is fully established.
* MPTCP uses the delack timer to do 3rd ack retransmissions
@@ -530,7 +517,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return ret;
}
- ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+ if (subflow->use_64bit_ack) {
+ ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+ opts->ext_copy.data_ack = msk->ack_seq;
+ opts->ext_copy.ack64 = 1;
+ } else {
+ ack_size = TCPOLEN_MPTCP_DSS_ACK32;
+ opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq);
+ opts->ext_copy.ack64 = 0;
+ }
+ opts->ext_copy.use_ack = 1;
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
@@ -538,10 +534,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
dss_size += ack_size;
- opts->ext_copy.data_ack = msk->ack_seq;
- opts->ext_copy.ack64 = 1;
- opts->ext_copy.use_ack = 1;
-
*size = ALIGN(dss_size, 4);
return true;
}
@@ -549,7 +541,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
struct in_addr *addr)
{
- u8 hmac[MPTCP_ADDR_HMAC_LEN];
+ u8 hmac[SHA256_DIGEST_SIZE];
u8 msg[7];
msg[0] = addr_id;
@@ -559,14 +551,14 @@ static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
- return get_unaligned_be64(hmac);
+ return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
struct in6_addr *addr)
{
- u8 hmac[MPTCP_ADDR_HMAC_LEN];
+ u8 hmac[SHA256_DIGEST_SIZE];
u8 msg[19];
msg[0] = addr_id;
@@ -576,7 +568,7 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
- return get_unaligned_be64(hmac);
+ return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
}
#endif
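
The ADD_ADDR option still carries only a 64-bit HMAC, so with the full digest now returned the truncation moves into these callers, which read the rightmost eight bytes of the SHA-256 output. A standalone sketch of that extraction, open-coding what get_unaligned_be64() does (illustrative only):

	#include <stdint.h>

	#define SHA256_DIGEST_SIZE 32

	/* take the low-order 64 bits of the digest, big-endian on the wire,
	 * returned as a host-order integer
	 */
	static uint64_t addr_hmac64(const uint8_t hmac[SHA256_DIGEST_SIZE])
	{
		const uint8_t *p = &hmac[SHA256_DIGEST_SIZE - 8];
		uint64_t v = 0;
		int i;

		for (i = 0; i < 8; i++)
			v = (v << 8) | p[i];
		return v;
	}
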
@@ -709,7 +701,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
return subflow->mp_capable;
- if (mp_opt->use_ack) {
+ if (mp_opt->dss && mp_opt->use_ack) {
/* subflows are fully established as soon as we get any
* additional ack.
*/
@@ -717,8 +709,6 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
goto fully_established;
}
- WARN_ON_ONCE(subflow->can_ack);
-
/* If the first established packet does not contain MP_CAPABLE + data
* then fallback to TCP
*/
@@ -728,6 +718,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
return false;
}
+ if (unlikely(!READ_ONCE(msk->pm.server_side)))
+ pr_warn_once("bogus mpc option on established client sk");
subflow->fully_established = 1;
subflow->remote_key = mp_opt->sndr_key;
subflow->can_ack = 1;
@@ -819,41 +811,41 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
- struct mptcp_options_received *mp_opt;
+ struct mptcp_options_received mp_opt;
struct mptcp_ext *mpext;
- mp_opt = &opt_rx->mptcp;
- if (!check_fully_established(msk, sk, subflow, skb, mp_opt))
+ mptcp_get_options(skb, &mp_opt);
+ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
return;
- if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) {
+ if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
struct mptcp_addr_info addr;
- addr.port = htons(mp_opt->port);
- addr.id = mp_opt->addr_id;
- if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
+ addr.port = htons(mp_opt.port);
+ addr.id = mp_opt.addr_id;
+ if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) {
addr.family = AF_INET;
- addr.addr = mp_opt->addr;
+ addr.addr = mp_opt.addr;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) {
+ else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) {
addr.family = AF_INET6;
- addr.addr6 = mp_opt->addr6;
+ addr.addr6 = mp_opt.addr6;
}
#endif
- if (!mp_opt->echo)
+ if (!mp_opt.echo)
mptcp_pm_add_addr_received(msk, &addr);
- mp_opt->add_addr = 0;
+ mp_opt.add_addr = 0;
}
- if (!mp_opt->dss)
+ if (!mp_opt.dss)
return;
/* we can't wait for recvmsg() to update the ack_seq, otherwise
* monodirectional flows will be stuck
*/
- if (mp_opt->use_ack)
- update_una(msk, mp_opt);
+ if (mp_opt.use_ack)
+ update_una(msk, &mp_opt);
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
@@ -861,8 +853,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
memset(mpext, 0, sizeof(*mpext));
- if (mp_opt->use_map) {
- if (mp_opt->mpc_map) {
+ if (mp_opt.use_map) {
+ if (mp_opt.mpc_map) {
/* this is an MP_CAPABLE carrying MPTCP data
* we know this map the first chunk of data
*/
@@ -872,16 +864,16 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
mpext->subflow_seq = 1;
mpext->dsn64 = 1;
mpext->mpc_map = 1;
+ mpext->data_fin = 0;
} else {
- mpext->data_seq = mp_opt->data_seq;
- mpext->subflow_seq = mp_opt->subflow_seq;
- mpext->dsn64 = mp_opt->dsn64;
+ mpext->data_seq = mp_opt.data_seq;
+ mpext->subflow_seq = mp_opt.subflow_seq;
+ mpext->dsn64 = mp_opt.dsn64;
+ mpext->data_fin = mp_opt.data_fin;
}
- mpext->data_len = mp_opt->data_len;
+ mpext->data_len = mp_opt.data_len;
mpext->use_map = 1;
}
-
- mpext->data_fin = mp_opt->data_fin;
}
void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
@@ -1000,8 +992,13 @@ mp_capable_done:
u8 flags = 0;
if (mpext->use_ack) {
- len += TCPOLEN_MPTCP_DSS_ACK64;
- flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
+ flags = MPTCP_DSS_HAS_ACK;
+ if (mpext->ack64) {
+ len += TCPOLEN_MPTCP_DSS_ACK64;
+ flags |= MPTCP_DSS_ACK64;
+ } else {
+ len += TCPOLEN_MPTCP_DSS_ACK32;
+ }
}
if (mpext->use_map) {
@@ -1018,8 +1015,13 @@ mp_capable_done:
*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
if (mpext->use_ack) {
- put_unaligned_be64(mpext->data_ack, ptr);
- ptr += 2;
+ if (mpext->ack64) {
+ put_unaligned_be64(mpext->data_ack, ptr);
+ ptr += 2;
+ } else {
+ put_unaligned_be32(mpext->data_ack32, ptr);
+ ptr += 1;
+ }
}
if (mpext->use_map) {
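
On the wire, the DSS data ack follows the kind/length/subtype/flags word and is either four or eight bytes long; the MPTCP_DSS_ACK64 flag tells the peer which form was used. A condensed sketch of the serialization done above (a little-endian host is assumed for the bswap helpers; this is not the kernel code itself):

	#include <stdint.h>
	#include <string.h>

	/* 'ptr' walks the option buffer in 32-bit words, as in
	 * mptcp_write_options()
	 */
	static uint32_t *write_dss_ack(uint32_t *ptr, uint64_t data_ack, int ack64)
	{
		if (ack64) {
			uint64_t be = __builtin_bswap64(data_ack);

			memcpy(ptr, &be, 8);
			ptr += 2;	/* eight bytes, two 32-bit words */
		} else {
			uint32_t be = __builtin_bswap32((uint32_t)data_ack);

			memcpy(ptr, &be, 4);
			ptr += 1;
		}
		return ptr;
	}
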
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 86d61ab34c7c..b78edf237ba0 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -599,12 +599,14 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb,
nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
goto nla_put_failure;
- if (addr->family == AF_INET)
- nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
- addr->addr.s_addr);
+ if (addr->family == AF_INET &&
+ nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
+ addr->addr.s_addr))
+ goto nla_put_failure;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- else if (addr->family == AF_INET6)
- nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6);
+ else if (addr->family == AF_INET6 &&
+ nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6))
+ goto nla_put_failure;
#endif
nla_nest_end(skb, attr);
return 0;
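
This is the usual netlink fill pattern: every nla_put*() return value must be checked, because a full skb makes the put fail and the half-built message then has to be cancelled rather than sent. In generic form (a sketch with hypothetical attribute ids, not the patched function):

	static int fill_example(struct sk_buff *skb, const struct foo *f)
	{
		struct nlattr *attr;

		attr = nla_nest_start(skb, EXAMPLE_ATTR_GRP);
		if (!attr)
			return -EMSGSIZE;
		if (nla_put_u32(skb, EXAMPLE_ATTR_VAL, f->val))
			goto nla_put_failure;
		nla_nest_end(skb, attr);
		return 0;

	nla_put_failure:
		nla_nest_cancel(skb, attr);
		return -EMSGSIZE;
	}
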
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 939a5045181a..14b253d10ccf 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -97,12 +97,7 @@ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
if (likely(!__mptcp_needs_tcp_fallback(msk)))
return NULL;
- if (msk->subflow) {
- release_sock((struct sock *)msk);
- return msk->subflow;
- }
-
- return NULL;
+ return msk->subflow;
}
static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
@@ -149,12 +144,29 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
unsigned int offset, size_t copy_len)
{
struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
__skb_unlink(skb, &ssk->sk_receive_queue);
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_ext_reset(skb);
+ skb_orphan(skb);
msk->ack_seq += copy_len;
+
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (offset == 0 && tail) {
+ bool fragstolen;
+ int delta;
+
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ kfree_skb_partial(skb, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return;
+ }
+ }
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
MPTCP_SKB_CB(skb)->offset = offset;
}
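
The helpers used in this hunk follow the same coalescing idiom as the plain TCP receive path; in outline (a descriptive summary, not new code):

	/* skb_try_coalesce(tail, skb, &fragstolen, &delta)
	 *   - on success, skb's payload now lives in 'tail'; 'delta' is the
	 *     truesize change that still has to be charged to the socket,
	 *     and 'fragstolen' says whether skb's head was reused in place.
	 * kfree_skb_partial(skb, fragstolen)
	 *   - releases whatever is left of 'skb' accordingly.
	 */
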
@@ -372,8 +384,10 @@ static void mptcp_stop_timer(struct sock *sk)
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
+ const struct sock *sk = (const struct sock *)msk;
+
if (!msk->cached_ext)
- msk->cached_ext = __skb_ext_alloc();
+ msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
return !!msk->cached_ext;
}
@@ -515,20 +529,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* fooled into a warning if we don't init here
*/
pfrag = sk_page_frag(sk);
- while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
- !mptcp_ext_cache_refill(msk)) {
- ret = sk_stream_wait_memory(ssk, timeo);
- if (ret)
- return ret;
-
- /* if sk_stream_wait_memory() sleeps snd_una can change
- * significantly, refresh the rtx queue
- */
- mptcp_clean_una(sk);
-
- if (unlikely(__mptcp_needs_tcp_fallback(msk)))
- return 0;
- }
if (!retransmission) {
write_seq = &msk->write_seq;
page = pfrag->page;
@@ -595,7 +595,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* access the skb after the sendpages call
*/
ret = do_tcp_sendpages(ssk, page, offset, psize,
- msg->msg_flags | MSG_SENDPAGE_NOTLAST);
+ msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
if (ret <= 0)
return ret;
@@ -658,6 +658,15 @@ out:
return ret;
}
+static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+{
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+
+ /* enables sk->write_space() callbacks */
+ set_bit(SOCK_NOSPACE, &sock->flags);
+}
+
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
@@ -665,19 +674,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (!mptcp_ext_cache_refill(msk))
+ return NULL;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!sk_stream_memory_free(ssk)) {
struct socket *sock = ssk->sk_socket;
- if (sock) {
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic();
-
- /* enables sk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
- }
+ if (sock)
+ mptcp_nospace(msk, sock);
return NULL;
}
@@ -703,22 +710,19 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
return;
sock = READ_ONCE(ssk->sk_socket);
-
- if (sock) {
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic();
- /* set NOSPACE only after clearing SEND_SPACE flag */
- set_bit(SOCK_NOSPACE, &sock->flags);
- }
+ if (sock)
+ mptcp_nospace(msk, sock);
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct page_frag *pfrag;
struct socket *ssock;
size_t copied = 0;
struct sock *ssk;
+ bool tx_ok;
long timeo;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
@@ -734,19 +738,38 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto out;
}
+fallback:
ssock = __mptcp_tcp_fallback(msk);
if (unlikely(ssock)) {
-fallback:
+ release_sock(sk);
pr_debug("fallback passthrough");
ret = sock_sendmsg(ssock, msg);
return ret >= 0 ? ret + copied : (copied ? copied : ret);
}
+ pfrag = sk_page_frag(sk);
+restart:
mptcp_clean_una(sk);
+wait_for_sndbuf:
__mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
- while (!sk_stream_memory_free(sk) || !ssk) {
+ while (!sk_stream_memory_free(sk) ||
+ !ssk ||
+ !mptcp_page_frag_refill(ssk, pfrag)) {
+ if (ssk) {
+ /* make sure retransmit timer is
+ * running before we wait for memory.
+ *
+ * The retransmit timer might be needed
+ * to make the peer send an up-to-date
+ * MPTCP Ack.
+ */
+ mptcp_set_timeout(sk, ssk);
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+ }
+
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
@@ -763,18 +786,75 @@ fallback:
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
- while (msg_data_left(msg)) {
+ tx_ok = msg_data_left(msg);
+ while (tx_ok) {
ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -EAGAIN && timeo > 0) {
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto restart;
+ }
break;
+ }
if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
+ /* Can happen for passive sockets:
+ * the 3WHS negotiated MPTCP, but the first packet after it is
+ * plain TCP (e.g. due to a middlebox filtering out unknown
+ * options).
+ *
+ * Fall back to TCP.
+ */
release_sock(ssk);
- ssock = __mptcp_tcp_fallback(msk);
goto fallback;
}
copied += ret;
+
+ tx_ok = msg_data_left(msg);
+ if (!tx_ok)
+ break;
+
+ if (!sk_stream_memory_free(ssk) ||
+ !mptcp_page_frag_refill(ssk, pfrag) ||
+ !mptcp_ext_cache_refill(msk)) {
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ tcp_push(ssk, msg->msg_flags, mss_now,
+ tcp_sk(ssk)->nonagle, size_goal);
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto restart;
+ }
+
+ /* memory is charged to the mptcp-level socket as well, i.e.
+ * if the msg is very large, the mptcp socket may run out of buffer
+ * space. mptcp_clean_una() will release data that has
+ * been acked at the mptcp level in the meantime, so there is
+ * a good chance we can continue sending data right away.
+ *
+ * Normally, when the tcp subflow can accept more data, then
+ * so can the MPTCP socket. However, we need to cope with
+ * peers that might lag behind in their MPTCP-level
+ * acknowledgements, i.e. data might have been acked at
+ * tcp level only. So, we must also check the MPTCP socket
+ * limits before we send more data.
+ */
+ if (unlikely(!sk_stream_memory_free(sk))) {
+ tcp_push(ssk, msg->msg_flags, mss_now,
+ tcp_sk(ssk)->nonagle, size_goal);
+ mptcp_clean_una(sk);
+ if (!sk_stream_memory_free(sk)) {
+ /* can't send more for now, need to wait for
+ * MPTCP-level ACKs from peer.
+ *
+ * Wakeup will happen via mptcp_clean_una().
+ */
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto wait_for_sndbuf;
+ }
+ }
}
mptcp_set_timeout(sk, ssk);
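
After this rework a fragment is only queued when three pools all have room: the subflow's send buffer, the shared page frag, and the MPTCP-level accounting, which can lag until a DSS ack arrives. Condensed into one predicate (an illustrative helper, not in the patch):

	/* all of these are re-checked after every transmitted fragment */
	static bool mptcp_can_queue_more(struct sock *sk /* msk */, struct sock *ssk,
					 struct mptcp_sock *msk,
					 struct page_frag *pfrag)
	{
		return sk_stream_memory_free(ssk) &&
		       mptcp_page_frag_refill(ssk, pfrag) &&
		       mptcp_ext_cache_refill(msk) &&
		       sk_stream_memory_free(sk);
	}
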
@@ -883,6 +963,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
ssock = __mptcp_tcp_fallback(msk);
if (unlikely(ssock)) {
fallback:
+ release_sock(sk);
pr_debug("fallback-read subflow=%p",
mptcp_subflow_ctx(ssock->sk));
copied = sock_recvmsg(ssock, msg, flags);
@@ -951,7 +1032,8 @@ fallback:
pr_debug("block timeout %ld", timeo);
mptcp_wait_data(sk, &timeo);
- if (unlikely(__mptcp_tcp_fallback(msk)))
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock))
goto fallback;
}
@@ -1091,7 +1173,7 @@ static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
- int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+ int orig_len, orig_offset, mss_now = 0, size_goal = 0;
struct mptcp_data_frag *dfrag;
u64 orig_write_seq;
size_t copied = 0;
@@ -1113,6 +1195,9 @@ static void mptcp_worker(struct work_struct *work)
if (!dfrag)
goto unlock;
+ if (!mptcp_ext_cache_refill(msk))
+ goto reset_unlock;
+
ssk = mptcp_subflow_get_retrans(msk);
if (!ssk)
goto reset_unlock;
@@ -1124,8 +1209,8 @@ static void mptcp_worker(struct work_struct *work)
orig_offset = dfrag->offset;
orig_write_seq = dfrag->data_seq;
while (dfrag->data_len > 0) {
- ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
- &size_goal);
+ int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
+ &mss_now, &size_goal);
if (ret < 0)
break;
@@ -1133,6 +1218,9 @@ static void mptcp_worker(struct work_struct *work)
copied += ret;
dfrag->data_len -= ret;
dfrag->offset += ret;
+
+ if (!mptcp_ext_cache_refill(msk))
+ break;
}
if (copied)
tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
@@ -1259,11 +1347,14 @@ static void mptcp_close(struct sock *sk, long timeout)
lock_sock(sk);
- mptcp_token_destroy(msk->token);
inet_sk_state_store(sk, TCP_CLOSE);
- __mptcp_flush_join_list(msk);
-
+ /* be sure to always acquire the join list lock, to sync vs
+ * mptcp_finish_join().
+ */
+ spin_lock_bh(&msk->join_list_lock);
+ list_splice_tail_init(&msk->join_list, &msk->conn_list);
+ spin_unlock_bh(&msk->join_list_lock);
list_splice_init(&msk->conn_list, &conn_list);
data_fin_tx_seq = msk->write_seq;
@@ -1313,11 +1404,12 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
static int mptcp_disconnect(struct sock *sk, int flags)
{
- lock_sock(sk);
- __mptcp_clear_xmit(sk);
- release_sock(sk);
- mptcp_cancel_work(sk);
- return tcp_disconnect(sk, flags);
+ /* Should never be called.
+ * inet_stream_connect() calls ->disconnect, but that
+ * refers to the subflow socket, not the mptcp one.
+ */
+ WARN_ON_ONCE(1);
+ return 0;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -1329,7 +1421,9 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
}
#endif
-struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
+struct sock *mptcp_sk_clone(const struct sock *sk,
+ const struct mptcp_options_received *mp_opt,
+ struct request_sock *req)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
@@ -1352,26 +1446,30 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
msk->subflow = NULL;
if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) {
+ nsk->sk_state = TCP_CLOSE;
bh_unlock_sock(nsk);
/* we can't call into mptcp_close() here - possible BH context
- * free the sock directly
+ * free the sock directly.
+ * sk_clone_lock() sets nsk refcnt to two, hence call sk_free()
+ * too.
*/
- nsk->sk_prot->destroy(nsk);
+ sk_common_release(nsk);
sk_free(nsk);
return NULL;
}
msk->write_seq = subflow_req->idsn + 1;
atomic64_set(&msk->snd_una, msk->write_seq);
- if (subflow_req->remote_key_valid) {
+ if (mp_opt->mp_capable) {
msk->can_ack = true;
- msk->remote_key = subflow_req->remote_key;
+ msk->remote_key = mp_opt->sndr_key;
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
ack_seq++;
msk->ack_seq = ack_seq;
}
+ sock_reset_flag(nsk, SOCK_RCU_FREE);
/* will be fully established after successful MPC subflow creation */
inet_sk_state_store(nsk, TCP_SYN_RECV);
bh_unlock_sock(nsk);
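
The ack_seq setup leans on the RFC 8684 key derivation: SHA-256 of the peer's key yields both the connection token (the most significant 32 bits of the digest) and the peer's initial data sequence number (the least significant 64 bits), so the first expected data sequence is IDSN + 1. A sketch of that split, assuming the 32-byte digest is already computed:

	#include <stdint.h>

	/* mirrors what mptcp_crypto_key_sha() extracts from the digest */
	static void key_digest_split(const uint8_t d[32],
				     uint32_t *token, uint64_t *idsn)
	{
		int i;

		*token = ((uint32_t)d[0] << 24) | (d[1] << 16) | (d[2] << 8) | d[3];
		*idsn = 0;
		for (i = 24; i < 32; i++)
			*idsn = (*idsn << 8) | d[i];
	}
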
@@ -1428,6 +1526,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
newsk = new_mptcp_sock;
mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list);
+ inet_sk_state_store(newsk, TCP_ESTABLISHED);
bh_unlock_sock(new_mptcp_sock);
@@ -1445,6 +1544,7 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ mptcp_token_destroy(msk->token);
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
@@ -1467,12 +1567,11 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname,
*/
lock_sock(sk);
ssock = __mptcp_tcp_fallback(msk);
+ release_sock(sk);
if (ssock)
return tcp_setsockopt(ssock->sk, level, optname, optval,
optlen);
- release_sock(sk);
-
return -EOPNOTSUPP;
}
@@ -1492,12 +1591,11 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
*/
lock_sock(sk);
ssock = __mptcp_tcp_fallback(msk);
+ release_sock(sk);
if (ssock)
return tcp_getsockopt(ssock->sk, level, optname, optval,
option);
- release_sock(sk);
-
return -EOPNOTSUPP;
}
@@ -1613,27 +1711,30 @@ bool mptcp_finish_join(struct sock *sk)
if (!msk->pm.server_side)
return true;
- /* passive connection, attach to msk socket */
+ if (!mptcp_pm_allow_new_subflow(msk))
+ return false;
+
+ /* active connections are already on conn_list, and we can't acquire
+ * msk lock here.
+ * use the join list lock as synchronization point and double-check
+ * msk status to avoid racing with mptcp_close()
+ */
+ spin_lock_bh(&msk->join_list_lock);
+ ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
+ if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+ if (!ret)
+ return false;
+
+ /* attach to the msk socket only after we are sure it will deal with us
+ * at close time
+ */
parent_sock = READ_ONCE(parent->sk_socket);
if (parent_sock && !sk->sk_socket)
mptcp_sock_graft(sk, parent_sock);
-
- ret = mptcp_pm_allow_new_subflow(msk);
- if (ret) {
- /* active connections are already on conn_list */
- spin_lock_bh(&msk->join_list_lock);
- if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
- list_add_tail(&subflow->node, &msk->join_list);
- spin_unlock_bh(&msk->join_list_lock);
- }
- return ret;
-}
-
-bool mptcp_sk_is_subflow(const struct sock *sk)
-{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-
- return subflow->mp_join == 1;
+ subflow->map_seq = msk->ack_seq;
+ return true;
}
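
The close/join race is resolved entirely through join_list_lock: mptcp_close() splices the join list into conn_list while holding it, and mptcp_finish_join() re-checks the msk state under the same lock before linking the subflow, so no subflow can be added after close has swept the lists. The two critical sections, condensed from the hunks above:

	/* mptcp_close() side */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);

	/* mptcp_finish_join() side */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret)
		list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);
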
static bool mptcp_memory_free(const struct sock *sk, int wake)
@@ -1700,6 +1801,14 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
lock_sock(sock->sk);
+ if (sock->state != SS_UNCONNECTED && msk->subflow) {
+ /* pending connection or invalid state, let existing subflow
+ * cope with that
+ */
+ ssock = msk->subflow;
+ goto do_connect;
+ }
+
ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
if (IS_ERR(ssock)) {
err = PTR_ERR(ssock);
@@ -1714,9 +1823,17 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
#endif
+do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
- inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ sock->state = ssock->state;
+
+ /* on successful connect, the msk state will be moved to established by
+ * subflow_finish_connect()
+ */
+ if (!err || err == EINPROGRESS)
+ mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ else
+ inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
unlock:
release_sock(sock->sk);
@@ -1774,6 +1891,8 @@ static int mptcp_listen(struct socket *sock, int backlog)
goto unlock;
}
+ sock_set_flag(sock->sk, SOCK_RCU_FREE);
+
err = ssock->ops->listen(ssock, backlog);
inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
if (!err)
@@ -1995,6 +2114,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
+ .compat_ioctl = inet6_compat_ioctl,
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 67448002a2d7..809687d3f410 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -81,7 +81,6 @@
/* MPTCP ADD_ADDR flags */
#define MPTCP_ADDR_ECHO BIT(0)
-#define MPTCP_ADDR_HMAC_LEN 20
#define MPTCP_ADDR_IPVERSION_4 4
#define MPTCP_ADDR_IPVERSION_6 6
@@ -91,6 +90,45 @@
#define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3
+struct mptcp_options_received {
+ u64 sndr_key;
+ u64 rcvr_key;
+ u64 data_ack;
+ u64 data_seq;
+ u32 subflow_seq;
+ u16 data_len;
+ u16 mp_capable : 1,
+ mp_join : 1,
+ dss : 1,
+ add_addr : 1,
+ rm_addr : 1,
+ family : 4,
+ echo : 1,
+ backup : 1;
+ u32 token;
+ u32 nonce;
+ u64 thmac;
+ u8 hmac[20];
+ u8 join_id;
+ u8 use_map:1,
+ dsn64:1,
+ data_fin:1,
+ use_ack:1,
+ ack64:1,
+ mpc_map:1,
+ __unused:2;
+ u8 addr_id;
+ u8 rm_id;
+ union {
+ struct in_addr addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ struct in6_addr addr6;
+#endif
+ };
+ u64 ahmac;
+ u16 port;
+};
+
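
With the parsed option state pulled out of tcp_options_received, callers now parse into a short-lived on-stack struct, and mptcp_get_options() clears the per-option presence bits itself, so no memset is needed at the call sites. Typical usage after this change, as seen in options.c and subflow.c:

	struct mptcp_options_received mp_opt;

	mptcp_get_options(skb, &mp_opt);
	if (mp_opt.mp_capable) {
		/* e.g. record mp_opt.sndr_key */
	} else if (mp_opt.mp_join) {
		/* e.g. record mp_opt.token and mp_opt.nonce */
	}
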
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
@@ -206,12 +244,10 @@ struct mptcp_subflow_request_sock {
struct tcp_request_sock sk;
u16 mp_capable : 1,
mp_join : 1,
- backup : 1,
- remote_key_valid : 1;
+ backup : 1;
u8 local_id;
u8 remote_id;
u64 local_key;
- u64 remote_key;
u64 idsn;
u32 token;
u32 ssn_offset;
@@ -253,6 +289,7 @@ struct mptcp_subflow_context {
data_avail : 1,
rx_eof : 1,
data_fin_tx_enable : 1,
+ use_64bit_ack : 1, /* Set when we received a 64-bit DSN */
can_ack : 1; /* only after processing the remote a key */
u64 data_fin_tx_seq;
u32 remote_nonce;
@@ -332,9 +369,11 @@ void mptcp_proto_init(void);
int mptcp_proto_v6_init(void);
#endif
-struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req);
+struct sock *mptcp_sk_clone(const struct sock *sk,
+ const struct mptcp_options_received *mp_opt,
+ struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
- struct tcp_options_received *opt_rx);
+ struct mptcp_options_received *mp_opt);
void mptcp_finish_connect(struct sock *sk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 50a8bea987c6..493b98a0825c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
+#include <crypto/sha.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -89,7 +90,7 @@ static bool subflow_token_join_request(struct request_sock *req,
const struct sk_buff *skb)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
- u8 hmac[MPTCPOPT_HMAC_LEN];
+ u8 hmac[SHA256_DIGEST_SIZE];
struct mptcp_sock *msk;
int local_id;
@@ -124,16 +125,14 @@ static void subflow_init_req(struct request_sock *req,
{
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
- struct tcp_options_received rx_opt;
+ struct mptcp_options_received mp_opt;
pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
- memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
- mptcp_get_options(skb, &rx_opt);
+ mptcp_get_options(skb, &mp_opt);
subflow_req->mp_capable = 0;
subflow_req->mp_join = 0;
- subflow_req->remote_key_valid = 0;
#ifdef CONFIG_TCP_MD5SIG
/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
@@ -143,16 +142,16 @@ static void subflow_init_req(struct request_sock *req,
return;
#endif
- if (rx_opt.mptcp.mp_capable) {
+ if (mp_opt.mp_capable) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
- if (rx_opt.mptcp.mp_join)
+ if (mp_opt.mp_join)
return;
- } else if (rx_opt.mptcp.mp_join) {
+ } else if (mp_opt.mp_join) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
}
- if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
+ if (mp_opt.mp_capable && listener->request_mptcp) {
int err;
err = mptcp_token_new_request(req);
@@ -160,13 +159,13 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->mp_capable = 1;
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
- } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
+ } else if (mp_opt.mp_join && listener->request_mptcp) {
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
subflow_req->mp_join = 1;
- subflow_req->backup = rx_opt.mptcp.backup;
- subflow_req->remote_id = rx_opt.mptcp.join_id;
- subflow_req->token = rx_opt.mptcp.token;
- subflow_req->remote_nonce = rx_opt.mptcp.nonce;
+ subflow_req->backup = mp_opt.backup;
+ subflow_req->remote_id = mp_opt.join_id;
+ subflow_req->token = mp_opt.token;
+ subflow_req->remote_nonce = mp_opt.nonce;
pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
subflow_req->remote_nonce);
if (!subflow_token_join_request(req, skb)) {
@@ -203,7 +202,7 @@ static void subflow_v6_init_req(struct request_sock *req,
/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
- u8 hmac[MPTCPOPT_HMAC_LEN];
+ u8 hmac[SHA256_DIGEST_SIZE];
u64 thmac;
subflow_generate_hmac(subflow->remote_key, subflow->local_key,
@@ -222,29 +221,55 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_options_received mp_opt;
struct sock *parent = subflow->conn;
+ struct tcp_sock *tp = tcp_sk(sk);
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
- if (inet_sk_state_load(parent) != TCP_ESTABLISHED) {
+ if (inet_sk_state_load(parent) == TCP_SYN_SENT) {
inet_sk_state_store(parent, TCP_ESTABLISHED);
parent->sk_state_change(parent);
}
- if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
+ /* be sure no special action on any packet other than syn-ack */
+ if (subflow->conn_finished)
+ return;
+
+ subflow->conn_finished = 1;
+
+ mptcp_get_options(skb, &mp_opt);
+ if (subflow->request_mptcp && mp_opt.mp_capable) {
+ subflow->mp_capable = 1;
+ subflow->can_ack = 1;
+ subflow->remote_key = mp_opt.sndr_key;
+ pr_debug("subflow=%p, remote_key=%llu", subflow,
+ subflow->remote_key);
+ } else if (subflow->request_join && mp_opt.mp_join) {
+ subflow->mp_join = 1;
+ subflow->thmac = mp_opt.thmac;
+ subflow->remote_nonce = mp_opt.nonce;
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
+ subflow->thmac, subflow->remote_nonce);
+ } else if (subflow->request_mptcp) {
+ tp->is_mptcp = 0;
+ }
+
+ if (!tp->is_mptcp)
return;
if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key);
mptcp_finish_connect(sk);
- subflow->conn_finished = 1;
if (skb) {
pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
}
} else if (subflow->mp_join) {
+ u8 hmac[SHA256_DIGEST_SIZE];
+
pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
subflow, subflow->thmac,
subflow->remote_nonce);
@@ -257,7 +282,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow_generate_hmac(subflow->local_key, subflow->remote_key,
subflow->local_nonce,
subflow->remote_nonce,
- subflow->hmac);
+ hmac);
+
+ memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);
if (skb)
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
@@ -265,7 +292,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (!mptcp_finish_join(sk))
goto do_reset;
- subflow->conn_finished = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
} else {
do_reset:
@@ -323,10 +349,10 @@ drop:
/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
- const struct tcp_options_received *rx_opt)
+ const struct mptcp_options_received *mp_opt)
{
const struct mptcp_subflow_request_sock *subflow_req;
- u8 hmac[MPTCPOPT_HMAC_LEN];
+ u8 hmac[SHA256_DIGEST_SIZE];
struct mptcp_sock *msk;
bool ret;
@@ -340,13 +366,67 @@ static bool subflow_hmac_valid(const struct request_sock *req,
subflow_req->local_nonce, hmac);
ret = true;
- if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
+ if (crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN))
ret = false;
sock_put((struct sock *)msk);
return ret;
}
+static void mptcp_sock_destruct(struct sock *sk)
+{
+ /* if the new mptcp socket isn't accepted, it is freed
+ * from the tcp listener socket's request queue, linked
+ * from req->sk. The tcp socket is released.
+ * This calls the ULP release function which will
+ * also remove the mptcp socket, via
+ * sock_put(ctx->conn).
+ *
+ * Problem is that the mptcp socket will not be in
+ * SYN_RECV state and doesn't have SOCK_DEAD flag.
+ * Both result in warnings from inet_sock_destruct.
+ */
+
+ if (sk->sk_state == TCP_SYN_RECV) {
+ sk->sk_state = TCP_CLOSE;
+ WARN_ON_ONCE(sk->sk_socket);
+ sock_orphan(sk);
+ }
+
+ inet_sock_destruct(sk);
+}
+
+static void mptcp_force_close(struct sock *sk)
+{
+ inet_sk_state_store(sk, TCP_CLOSE);
+ sk_common_release(sk);
+}
+
+static void subflow_ulp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *old_ctx)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ mptcp_subflow_tcp_fallback(sk, old_ctx);
+ icsk->icsk_ulp_ops = NULL;
+ rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+ tcp_sk(sk)->is_mptcp = 0;
+}
+
+static void subflow_drop_ctx(struct sock *ssk)
+{
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
+
+ if (!ctx)
+ return;
+
+ subflow_ulp_fallback(ssk, ctx);
+ if (ctx->conn)
+ sock_put(ctx->conn);
+
+ kfree_rcu(ctx, rcu);
+}
+
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
@@ -356,13 +436,18 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
{
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
struct mptcp_subflow_request_sock *subflow_req;
- struct tcp_options_received opt_rx;
+ struct mptcp_options_received mp_opt;
bool fallback_is_fatal = false;
struct sock *new_msk = NULL;
+ bool fallback = false;
struct sock *child;
pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+ /* we later need a valid 'mp_capable' value even when the options are
+ * not parsed
+ */
+ mp_opt.mp_capable = 0;
if (tcp_rsk(req)->is_mptcp == 0)
goto create_child;
@@ -377,26 +462,21 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
goto create_msk;
}
- opt_rx.mptcp.mp_capable = 0;
- mptcp_get_options(skb, &opt_rx);
- if (opt_rx.mptcp.mp_capable) {
- subflow_req->remote_key = opt_rx.mptcp.sndr_key;
- subflow_req->remote_key_valid = 1;
- } else {
- subflow_req->mp_capable = 0;
+ mptcp_get_options(skb, &mp_opt);
+ if (!mp_opt.mp_capable) {
+ fallback = true;
goto create_child;
}
create_msk:
- new_msk = mptcp_sk_clone(listener->conn, req);
+ new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req);
if (!new_msk)
- subflow_req->mp_capable = 0;
+ fallback = true;
} else if (subflow_req->mp_join) {
fallback_is_fatal = true;
- opt_rx.mptcp.mp_join = 0;
- mptcp_get_options(skb, &opt_rx);
- if (!opt_rx.mptcp.mp_join ||
- !subflow_hmac_valid(req, &opt_rx)) {
+ mptcp_get_options(skb, &mp_opt);
+ if (!mp_opt.mp_join ||
+ !subflow_hmac_valid(req, &mp_opt)) {
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
return NULL;
}
@@ -409,12 +489,17 @@ create_child:
if (child && *own_req) {
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
- /* we have null ctx on TCP fallback, which is fatal on
- * MPJ handshake
+ tcp_rsk(req)->drop_req = false;
+
+ /* we need to fall back on ctx allocation failure and on the pre-reqs
+ * checks above. In the latter scenario we additionally need
+ * to reset the context to non MPTCP status.
*/
- if (!ctx) {
+ if (!ctx || fallback) {
if (fallback_is_fatal)
- goto close_child;
+ goto dispose_child;
+
+ subflow_drop_ctx(child);
goto out;
}
@@ -422,36 +507,55 @@ create_child:
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
- inet_sk_state_store(new_msk, TCP_ESTABLISHED);
+ new_msk->sk_destruct = mptcp_sock_destruct;
mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
ctx->conn = new_msk;
new_msk = NULL;
+
+ /* with OoO packets we can reach here without ingress
+ * mpc option
+ */
+ ctx->remote_key = mp_opt.sndr_key;
+ ctx->fully_established = mp_opt.mp_capable;
+ ctx->can_ack = mp_opt.mp_capable;
} else if (ctx->mp_join) {
struct mptcp_sock *owner;
owner = mptcp_token_get_sock(ctx->token);
if (!owner)
- goto close_child;
+ goto dispose_child;
ctx->conn = (struct sock *)owner;
if (!mptcp_finish_join(child))
- goto close_child;
+ goto dispose_child;
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
+ tcp_rsk(req)->drop_req = true;
}
}
out:
/* dispose of the left over mptcp master, if any */
if (unlikely(new_msk))
- sock_put(new_msk);
+ mptcp_force_close(new_msk);
+
+ /* check for expected invariant - should never trigger, just to help
+ * catch subtle bugs earlier
+ */
+ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
+ (!mptcp_subflow_ctx(child) ||
+ !mptcp_subflow_ctx(child)->conn));
return child;
-close_child:
+dispose_child:
+ subflow_drop_ctx(child);
+ tcp_rsk(req)->drop_req = true;
tcp_send_active_reset(child, GFP_ATOMIC);
- inet_csk_prepare_forced_close(child);
+ inet_csk_prepare_for_destroy_sock(child);
tcp_done(child);
- return NULL;
+
+ /* The last child reference will be released by the caller */
+ return child;
}
static struct inet_connection_sock_af_ops subflow_specific;
@@ -580,9 +684,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
if (!mpext->dsn64) {
map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
mpext->data_seq);
+ subflow->use_64bit_ack = 0;
pr_debug("expanded seq=%llu", subflow->map_seq);
} else {
map_seq = mpext->data_seq;
+ subflow->use_64bit_ack = 1;
}
if (subflow->map_valid) {
@@ -764,6 +870,24 @@ bool mptcp_subflow_data_available(struct sock *sk)
return subflow->data_avail;
}
+/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
+ * not the ssk one.
+ *
+ * In mptcp, rwin is about the mptcp-level connection data.
+ *
+ * Data that is still on the ssk rx queue can thus be ignored,
+ * as far as the mptcp peer is concerned, that data is still in flight.
+ * DSS ACK is updated when skb is moved to the mptcp rx queue.
+ */
+void mptcp_space(const struct sock *ssk, int *space, int *full_space)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ const struct sock *sk = subflow->conn;
+
+ *space = tcp_space(sk);
+ *full_space = tcp_full_space(sk);
+}
+
static void subflow_data_ready(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -931,6 +1055,16 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
if (err)
return err;
+ /* the newly created socket really belongs to the owning MPTCP master
+ * socket, even if for additional subflows the allocation is performed
+ * by a kernel workqueue. Adjust inode references, so that the
+ * procfs/diag interfaces really show this one belonging to the correct
+ * user.
+ */
+ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
+ SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
+ SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;
+
subflow = mptcp_subflow_ctx(sf->sk);
pr_debug("subflow=%p", subflow);
@@ -1047,17 +1181,6 @@ static void subflow_ulp_release(struct sock *sk)
kfree_rcu(ctx, rcu);
}
-static void subflow_ulp_fallback(struct sock *sk,
- struct mptcp_subflow_context *old_ctx)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- mptcp_subflow_tcp_fallback(sk, old_ctx);
- icsk->icsk_ulp_ops = NULL;
- rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
- tcp_sk(sk)->is_mptcp = 0;
-}
-
static void subflow_ulp_clone(const struct request_sock *req,
struct sock *newsk,
const gfp_t priority)
@@ -1091,9 +1214,6 @@ static void subflow_ulp_clone(const struct request_sock *req,
* is fully established only after we receive the remote key
*/
new_ctx->mp_capable = 1;
- new_ctx->fully_established = subflow_req->remote_key_valid;
- new_ctx->can_ack = subflow_req->remote_key_valid;
- new_ctx->remote_key = subflow_req->remote_key;
new_ctx->local_key = subflow_req->local_key;
new_ctx->token = subflow_req->token;
new_ctx->ssn_offset = subflow_req->ssn_offset;