Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--  net/mptcp/protocol.c  233
1 file changed, 171 insertions(+), 62 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e1f23016ed3f..14b253d10ccf 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -144,12 +144,29 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
unsigned int offset, size_t copy_len)
{
struct sock *sk = (struct sock *)msk;
+ struct sk_buff *tail;
__skb_unlink(skb, &ssk->sk_receive_queue);
- skb_set_owner_r(skb, sk);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_ext_reset(skb);
+ skb_orphan(skb);
msk->ack_seq += copy_len;
+
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ if (offset == 0 && tail) {
+ bool fragstolen;
+ int delta;
+
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ kfree_skb_partial(skb, fragstolen);
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ return;
+ }
+ }
+
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
MPTCP_SKB_CB(skb)->offset = offset;
}
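
Annotation (not part of the patch): the hunk above adopts the same coalesce-then-account pattern the plain TCP receive path uses around skb_try_coalesce() (cf. net/ipv4/tcp_input.c). A minimal standalone sketch of that pattern; rx_queue_coalesce() is a hypothetical helper name:

	/* Illustration only: try to glue new data onto the tail skb
	 * before queueing a fresh one.
	 */
	static bool rx_queue_coalesce(struct sock *sk, struct sk_buff *skb)
	{
		struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
		bool fragstolen;
		int delta;

		if (!tail || !skb_try_coalesce(tail, skb, &fragstolen, &delta))
			return false;	/* caller queues skb as usual */

		/* the payload now lives in tail: free just the skb head
		 * (its storage may have been stolen into a page frag) and
		 * charge only the truesize growth to the receive budget
		 */
		kfree_skb_partial(skb, fragstolen);
		atomic_add(delta, &sk->sk_rmem_alloc);
		sk_mem_charge(sk, delta);
		return true;
	}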
@@ -367,8 +384,10 @@ static void mptcp_stop_timer(struct sock *sk)
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
+ const struct sock *sk = (const struct sock *)msk;
+
if (!msk->cached_ext)
- msk->cached_ext = __skb_ext_alloc();
+ msk->cached_ext = __skb_ext_alloc(sk->sk_allocation);
return !!msk->cached_ext;
}
@@ -510,20 +529,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* fooled into a warning if we don't init here
*/
pfrag = sk_page_frag(sk);
- while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
- !mptcp_ext_cache_refill(msk)) {
- ret = sk_stream_wait_memory(ssk, timeo);
- if (ret)
- return ret;
-
- /* if sk_stream_wait_memory() sleeps snd_una can change
- * significantly, refresh the rtx queue
- */
- mptcp_clean_una(sk);
-
- if (unlikely(__mptcp_needs_tcp_fallback(msk)))
- return 0;
- }
if (!retransmission) {
write_seq = &msk->write_seq;
page = pfrag->page;
@@ -590,7 +595,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* access the skb after the sendpages call
*/
ret = do_tcp_sendpages(ssk, page, offset, psize,
- msg->msg_flags | MSG_SENDPAGE_NOTLAST);
+ msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT);
if (ret <= 0)
return ret;
@@ -653,6 +658,15 @@ out:
return ret;
}
+static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
+{
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
+
+ /* enables sk->write_space() callbacks */
+ set_bit(SOCK_NOSPACE, &sock->flags);
+}
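
Annotation: mptcp_nospace() depends on store ordering: MPTCP_SEND_SPACE must be visibly cleared before SOCK_NOSPACE is set, otherwise a concurrent write-space callback could re-enable the sender and its wakeup could be missed. A loose, runnable userspace analogue of that subscribe/publish pattern (C11 atomics; every name below is illustrative, not kernel API):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_bool send_space = true;	/* ~ MPTCP_SEND_SPACE */
	static atomic_bool nospace;		/* ~ SOCK_NOSPACE */

	static void sender_out_of_memory(void)
	{
		atomic_store(&send_space, false);
		/* ~ smp_mb__after_atomic(): order the two stores */
		atomic_thread_fence(memory_order_seq_cst);
		atomic_store(&nospace, true);	/* subscribe to wakeups */
	}

	static void write_space_callback(void)
	{
		/* memory freed: re-enable the sender, then honour NOSPACE */
		atomic_store(&send_space, true);
		atomic_thread_fence(memory_order_seq_cst);
		if (atomic_load(&nospace))
			printf("wake blocked sender\n");
	}

	int main(void)
	{
		sender_out_of_memory();
		write_space_callback();
		return 0;
	}

Whichever side runs second is guaranteed to observe the other's first store, so either the sender sees space again or the callback sees NOSPACE and issues the wakeup.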
+
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
@@ -660,19 +674,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
sock_owned_by_me((const struct sock *)msk);
+ if (!mptcp_ext_cache_refill(msk))
+ return NULL;
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!sk_stream_memory_free(ssk)) {
struct socket *sock = ssk->sk_socket;
- if (sock) {
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic();
-
- /* enables sk->write_space() callbacks */
- set_bit(SOCK_NOSPACE, &sock->flags);
- }
+ if (sock)
+ mptcp_nospace(msk, sock);
return NULL;
}
@@ -698,22 +710,19 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
return;
sock = READ_ONCE(ssk->sk_socket);
-
- if (sock) {
- clear_bit(MPTCP_SEND_SPACE, &msk->flags);
- smp_mb__after_atomic();
- /* set NOSPACE only after clearing SEND_SPACE flag */
- set_bit(SOCK_NOSPACE, &sock->flags);
- }
+ if (sock)
+ mptcp_nospace(msk, sock);
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
int mss_now = 0, size_goal = 0, ret = 0;
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct page_frag *pfrag;
struct socket *ssock;
size_t copied = 0;
struct sock *ssk;
+ bool tx_ok;
long timeo;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
@@ -738,11 +747,29 @@ fallback:
return ret >= 0 ? ret + copied : (copied ? copied : ret);
}
+ pfrag = sk_page_frag(sk);
+restart:
mptcp_clean_una(sk);
+wait_for_sndbuf:
__mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
- while (!sk_stream_memory_free(sk) || !ssk) {
+ while (!sk_stream_memory_free(sk) ||
+ !ssk ||
+ !mptcp_page_frag_refill(ssk, pfrag)) {
+ if (ssk) {
+ /* make sure retransmit timer is
+ * running before we wait for memory.
+ *
+ * The retransmit timer might be needed
+ * to make the peer send an up-to-date
+ * MPTCP Ack.
+ */
+ mptcp_set_timeout(sk, ssk);
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+ }
+
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
goto out;
@@ -759,11 +786,18 @@ fallback:
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
- while (msg_data_left(msg)) {
+ tx_ok = msg_data_left(msg);
+ while (tx_ok) {
ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -EAGAIN && timeo > 0) {
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto restart;
+ }
break;
+ }
if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
/* Can happen for passive sockets:
* 3WHS negotiated MPTCP, but first packet after is
@@ -777,6 +811,50 @@ fallback:
}
copied += ret;
+
+ tx_ok = msg_data_left(msg);
+ if (!tx_ok)
+ break;
+
+ if (!sk_stream_memory_free(ssk) ||
+ !mptcp_page_frag_refill(ssk, pfrag) ||
+ !mptcp_ext_cache_refill(msk)) {
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ tcp_push(ssk, msg->msg_flags, mss_now,
+ tcp_sk(ssk)->nonagle, size_goal);
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto restart;
+ }
+
+ /* memory is charged to mptcp level socket as well, i.e.
+ * if msg is very large, mptcp socket may run out of buffer
+ * space. mptcp_clean_una() will release data that has
+ * been acked at mptcp level in the meantime, so there is
+ * a good chance we can continue sending data right away.
+ *
+ * Normally, when the tcp subflow can accept more data, then
+ * so can the MPTCP socket. However, we need to cope with
+ * peers that might lag behind in their MPTCP-level
+ * acknowledgements, i.e. data might have been acked at
+ * tcp level only. So, we must also check the MPTCP socket
+ * limits before we send more data.
+ */
+ if (unlikely(!sk_stream_memory_free(sk))) {
+ tcp_push(ssk, msg->msg_flags, mss_now,
+ tcp_sk(ssk)->nonagle, size_goal);
+ mptcp_clean_una(sk);
+ if (!sk_stream_memory_free(sk)) {
+ /* can't send more for now, need to wait for
+ * MPTCP-level ACKs from peer.
+ *
+ * Wakeup will happen via mptcp_clean_una().
+ */
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+ goto wait_for_sndbuf;
+ }
+ }
}
mptcp_set_timeout(sk, ssk);
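
Annotation: taken together, the sendmsg hunks above replace sleeping inside mptcp_sendmsg_frag() with explicit restart points. A comment-only recap of the resulting control flow (labels as in the patch):

	/*
	 * restart:          reclaim MPTCP-level acked data (mptcp_clean_una)
	 * wait_for_sndbuf:  pick a subflow; while the msk sndbuf, the
	 *                   subflow or the page frag is exhausted, arm the
	 *                   retransmit timer and sleep in
	 *                   sk_stream_wait_memory()
	 * transmit loop:    mptcp_sendmsg_frag() on the chosen subflow:
	 *                   -EAGAIN (subflow would block) -> goto restart
	 *                   subflow/frag/ext cache low    -> tcp_push(),
	 *                                                    goto restart
	 *                   msk-level sndbuf exhausted    -> tcp_push(),
	 *                                                    goto wait_for_sndbuf
	 */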
@@ -954,7 +1032,8 @@ fallback:
pr_debug("block timeout %ld", timeo);
mptcp_wait_data(sk, &timeo);
- if (unlikely(__mptcp_tcp_fallback(msk)))
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock))
goto fallback;
}
@@ -1094,7 +1173,7 @@ static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
- int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+ int orig_len, orig_offset, mss_now = 0, size_goal = 0;
struct mptcp_data_frag *dfrag;
u64 orig_write_seq;
size_t copied = 0;
@@ -1116,6 +1195,9 @@ static void mptcp_worker(struct work_struct *work)
if (!dfrag)
goto unlock;
+ if (!mptcp_ext_cache_refill(msk))
+ goto reset_unlock;
+
ssk = mptcp_subflow_get_retrans(msk);
if (!ssk)
goto reset_unlock;
@@ -1127,8 +1209,8 @@ static void mptcp_worker(struct work_struct *work)
orig_offset = dfrag->offset;
orig_write_seq = dfrag->data_seq;
while (dfrag->data_len > 0) {
- ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
- &size_goal);
+ int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo,
+ &mss_now, &size_goal);
if (ret < 0)
break;
@@ -1136,6 +1218,9 @@ static void mptcp_worker(struct work_struct *work)
copied += ret;
dfrag->data_len -= ret;
dfrag->offset += ret;
+
+ if (!mptcp_ext_cache_refill(msk))
+ break;
}
if (copied)
tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
@@ -1262,11 +1347,14 @@ static void mptcp_close(struct sock *sk, long timeout)
lock_sock(sk);
- mptcp_token_destroy(msk->token);
inet_sk_state_store(sk, TCP_CLOSE);
- __mptcp_flush_join_list(msk);
-
+ /* be sure to always acquire the join list lock, to sync vs
+ * mptcp_finish_join().
+ */
+ spin_lock_bh(&msk->join_list_lock);
+ list_splice_tail_init(&msk->join_list, &msk->conn_list);
+ spin_unlock_bh(&msk->join_list_lock);
list_splice_init(&msk->conn_list, &conn_list);
data_fin_tx_seq = msk->write_seq;
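
Annotation: the splice under join_list_lock pairs with the mptcp_finish_join() rework further down. Both critical sections take the same spinlock, so a joining subflow either lands on join_list before the splice (and is closed here) or observes the parent no longer TCP_ESTABLISHED and backs out. Condensed from the two hunks:

	/* close side (this hunk) */
	spin_lock_bh(&msk->join_list_lock);
	list_splice_tail_init(&msk->join_list, &msk->conn_list);
	spin_unlock_bh(&msk->join_list_lock);

	/* join side (mptcp_finish_join(), below) */
	spin_lock_bh(&msk->join_list_lock);
	ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
	if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
		list_add_tail(&subflow->node, &msk->join_list);
	spin_unlock_bh(&msk->join_list_lock);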
@@ -1456,6 +1544,7 @@ static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ mptcp_token_destroy(msk->token);
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
@@ -1622,27 +1711,30 @@ bool mptcp_finish_join(struct sock *sk)
if (!msk->pm.server_side)
return true;
- /* passive connection, attach to msk socket */
+ if (!mptcp_pm_allow_new_subflow(msk))
+ return false;
+
+ /* active connections are already on conn_list, and we can't acquire
+ * msk lock here.
+ * use the join list lock as synchronization point and double-check
+ * msk status to avoid racing with mptcp_close()
+ */
+ spin_lock_bh(&msk->join_list_lock);
+ ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
+ if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node)))
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+ if (!ret)
+ return false;
+
+ /* attach to msk socket only after we are sure it will deal with us
+ * at close time
+ */
parent_sock = READ_ONCE(parent->sk_socket);
if (parent_sock && !sk->sk_socket)
mptcp_sock_graft(sk, parent_sock);
-
- ret = mptcp_pm_allow_new_subflow(msk);
- if (ret) {
- /* active connections are already on conn_list */
- spin_lock_bh(&msk->join_list_lock);
- if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
- list_add_tail(&subflow->node, &msk->join_list);
- spin_unlock_bh(&msk->join_list_lock);
- }
- return ret;
-}
-
-bool mptcp_sk_is_subflow(const struct sock *sk)
-{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-
- return subflow->mp_join == 1;
+ subflow->map_seq = msk->ack_seq;
+ return true;
}
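
Annotation: a comment-only recap of the passive MP_JOIN acceptance order after this change:

	/*
	 * 1. path manager check: mptcp_pm_allow_new_subflow(msk)
	 * 2. under join_list_lock: parent still TCP_ESTABLISHED?
	 *      yes -> add subflow to msk->join_list
	 *      no  -> fail the join (msk is closing, see mptcp_close())
	 * 3. graft the subflow onto the parent socket
	 * 4. subflow->map_seq = msk->ack_seq: the first data mapping
	 *    starts at the current MPTCP-level ack
	 */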
static bool mptcp_memory_free(const struct sock *sk, int wake)
@@ -1709,6 +1801,14 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
lock_sock(sock->sk);
+ if (sock->state != SS_UNCONNECTED && msk->subflow) {
+ /* pending connection or invalid state, let existing subflow
+ * cope with that
+ */
+ ssock = msk->subflow;
+ goto do_connect;
+ }
+
ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
if (IS_ERR(ssock)) {
err = PTR_ERR(ssock);
@@ -1723,9 +1823,17 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
#endif
+do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
- inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ sock->state = ssock->state;
+
+ /* on successful connect, the msk state will be moved to established by
+ * subflow_finish_connect()
+ */
+ if (!err || err == -EINPROGRESS)
+ mptcp_copy_inaddrs(sock->sk, ssock->sk);
+ else
+ inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
unlock:
release_sock(sock->sk);
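
Annotation: in-kernel proto_ops return negative errnos directly, which is why the check above must compare against -EINPROGRESS (a still-pending non-blocking connect is a success path for copying addresses). A runnable userspace contrast, where the same condition surfaces as -1 plus a positive errno:

	#include <arpa/inet.h>
	#include <errno.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		struct sockaddr_in sa = {
			.sin_family = AF_INET,
			.sin_port = htons(80),
			.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
		};
		int fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);

		if (fd < 0)
			return 1;
		if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
			printf("connected immediately\n");
		else if (errno == EINPROGRESS)	/* positive in userspace */
			printf("connect pending\n");
		else
			perror("connect");
		close(fd);
		return 0;
	}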
@@ -2006,6 +2114,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
+ .compat_ioctl = inet6_compat_ioctl,
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif