aboutsummaryrefslogtreecommitdiffstats
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c298
1 files changed, 204 insertions, 94 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index a88924947815..2602f1386160 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -411,35 +411,51 @@ static void mptcp_set_datafin_timeout(const struct sock *sk)
TCP_RTO_MIN << icsk->icsk_retransmits);
}
-static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
+static void __mptcp_set_timeout(struct sock *sk, long tout)
{
- long tout = ssk && inet_csk(ssk)->icsk_pending ?
- inet_csk(ssk)->icsk_timeout - jiffies : 0;
-
- if (tout <= 0)
- tout = mptcp_sk(sk)->timer_ival;
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
+static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow)
+{
+ const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
+ inet_csk(ssk)->icsk_timeout - jiffies : 0;
+}
+
+static void mptcp_set_timeout(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ long tout = 0;
+
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow)
+ tout = max(tout, mptcp_timeout_from_subflow(subflow));
+ __mptcp_set_timeout(sk, tout);
+}
+
static bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
(TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
}
+void mptcp_subflow_send_ack(struct sock *ssk)
+{
+ bool slow;
+
+ slow = lock_sock_fast(ssk);
+ if (tcp_can_send_ack(ssk))
+ tcp_send_ack(ssk);
+ unlock_sock_fast(ssk, slow);
+}
+
static void mptcp_send_ack(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- bool slow;
-
- slow = lock_sock_fast(ssk);
- if (tcp_can_send_ack(ssk))
- tcp_send_ack(ssk);
- unlock_sock_fast(ssk, slow);
- }
+ mptcp_for_each_subflow(msk, subflow)
+ mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow));
}
static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
@@ -531,7 +547,6 @@ static bool mptcp_check_data_fin(struct sock *sk)
}
ret = true;
- mptcp_set_timeout(sk, NULL);
mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
@@ -791,10 +806,7 @@ static void mptcp_reset_timer(struct sock *sk)
if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
return;
- /* should never be called with mptcp level timer cleared */
- tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
- if (WARN_ON_ONCE(!tout))
- tout = TCP_RTO_MIN;
+ tout = mptcp_sk(sk)->timer_ival;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
}
@@ -994,6 +1006,13 @@ static void mptcp_wmem_uncharge(struct sock *sk, int size)
msk->wmem_reserved += size;
}
+static void __mptcp_mem_reclaim_partial(struct sock *sk)
+{
+ lockdep_assert_held_once(&sk->sk_lock.slock);
+ __mptcp_update_wmem(sk);
+ sk_mem_reclaim_partial(sk);
+}
+
static void mptcp_mem_reclaim_partial(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1046,8 +1065,14 @@ static void __mptcp_clean_una(struct sock *sk)
if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
break;
- if (WARN_ON_ONCE(dfrag == msk->first_pending))
- break;
+ if (unlikely(dfrag == msk->first_pending)) {
+ /* in recovery mode can see ack after the current snd head */
+ if (WARN_ON_ONCE(!msk->recovery))
+ break;
+
+ WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ }
+
dfrag_clear(sk, dfrag);
cleaned = true;
}
@@ -1056,8 +1081,14 @@ static void __mptcp_clean_una(struct sock *sk)
if (dfrag && after64(snd_una, dfrag->data_seq)) {
u64 delta = snd_una - dfrag->data_seq;
- if (WARN_ON_ONCE(delta > dfrag->already_sent))
- goto out;
+ /* prevent wrap around in recovery mode */
+ if (unlikely(delta > dfrag->already_sent)) {
+ if (WARN_ON_ONCE(!msk->recovery))
+ goto out;
+ if (WARN_ON_ONCE(delta > dfrag->data_len))
+ goto out;
+ dfrag->already_sent += delta - dfrag->already_sent;
+ }
dfrag->data_seq += delta;
dfrag->offset += delta;
@@ -1068,16 +1099,16 @@ static void __mptcp_clean_una(struct sock *sk)
cleaned = true;
}
+ /* all retransmitted data acked, recovery completed */
+ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt))
+ msk->recovery = false;
+
out:
- if (cleaned) {
- if (tcp_under_memory_pressure(sk)) {
- __mptcp_update_wmem(sk);
- sk_mem_reclaim_partial(sk);
- }
- }
+ if (cleaned && tcp_under_memory_pressure(sk))
+ __mptcp_mem_reclaim_partial(sk);
- if (snd_una == READ_ONCE(msk->snd_nxt)) {
- if (msk->timer_ival && !mptcp_data_fin_enabled(msk))
+ if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) {
+ if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
mptcp_stop_timer(sk);
} else {
mptcp_reset_timer(sk);
@@ -1154,6 +1185,7 @@ struct mptcp_sendmsg_info {
u16 limit;
u16 sent;
unsigned int flags;
+ bool data_lock_held;
};
static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
@@ -1225,17 +1257,17 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
return false;
}
-static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk)
+static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
{
- return !ssk->sk_tx_skb_cache &&
- tcp_under_memory_pressure(sk);
-}
+ gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
-static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk)
-{
- if (unlikely(mptcp_must_reclaim_memory(sk, ssk)))
- mptcp_mem_reclaim_partial(sk);
- return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation);
+ if (unlikely(tcp_under_memory_pressure(sk))) {
+ if (data_lock_held)
+ __mptcp_mem_reclaim_partial(sk);
+ else
+ mptcp_mem_reclaim_partial(sk);
+ }
+ return __mptcp_alloc_tx_skb(sk, ssk, gfp);
}
/* note: this always recompute the csum on the whole skb, even
@@ -1259,7 +1291,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
bool zero_window_probe = false;
struct mptcp_ext *mpext = NULL;
struct sk_buff *skb, *tail;
- bool can_collapse = false;
+ bool must_collapse = false;
int size_bias = 0;
int avail_size;
size_t ret = 0;
@@ -1279,16 +1311,24 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* SSN association set here
*/
mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
- can_collapse = (info->size_goal - skb->len > 0) &&
- mptcp_skb_can_collapse_to(data_seq, skb, mpext);
- if (!can_collapse) {
+ if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
TCP_SKB_CB(skb)->eor = 1;
- } else {
+ goto alloc_skb;
+ }
+
+ must_collapse = (info->size_goal - skb->len > 0) &&
+ (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
+ if (must_collapse) {
size_bias = skb->len;
avail_size = info->size_goal - skb->len;
}
}
+alloc_skb:
+ if (!must_collapse && !ssk->sk_tx_skb_cache &&
+ !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
+ return 0;
+
/* Zero window and all data acked? Probe. */
avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
if (avail_size == 0) {
@@ -1318,7 +1358,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
if (skb == tail) {
TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
mpext->data_len += ret;
- WARN_ON_ONCE(!can_collapse);
WARN_ON_ONCE(zero_window_probe);
goto out;
}
@@ -1366,16 +1405,44 @@ struct subflow_send_info {
u64 ratio;
};
+void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
+{
+ if (!subflow->stale)
+ return;
+
+ subflow->stale = 0;
+ MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER);
+}
+
+bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ if (unlikely(subflow->stale)) {
+ u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp);
+
+ if (subflow->stale_rcv_tstamp == rcv_tstamp)
+ return false;
+
+ mptcp_subflow_set_active(subflow);
+ }
+ return __mptcp_subflow_active(subflow);
+}
+
+/* implement the mptcp packet scheduler;
+ * returns the subflow that will transmit the next DSS
+ * additionally updates the rtx timeout
+ */
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
int i, nr_active = 0;
struct sock *ssk;
+ long tout = 0;
u64 ratio;
u32 pace;
- sock_owned_by_me((struct sock *)msk);
+ sock_owned_by_me(sk);
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
@@ -1386,8 +1453,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
/* re-use last subflow, if the burst allow that */
if (msk->last_snd && msk->snd_burst > 0 &&
sk_stream_memory_free(msk->last_snd) &&
- mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+ mptcp_set_timeout(sk);
return msk->last_snd;
+ }
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < 2; ++i) {
@@ -1400,6 +1469,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
if (!mptcp_subflow_active(subflow))
continue;
+ tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !subflow->backup;
if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
continue;
@@ -1415,6 +1485,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
send_info[subflow->backup].ratio = ratio;
}
}
+ __mptcp_set_timeout(sk, tout);
/* pick the best backup if no other subflow is active */
if (!nr_active)
@@ -1433,12 +1504,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
static void mptcp_push_release(struct sock *sk, struct sock *ssk,
struct mptcp_sendmsg_info *info)
{
- mptcp_set_timeout(sk, ssk);
tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
release_sock(ssk);
}
-static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+void __mptcp_push_pending(struct sock *sk, unsigned int flags)
{
struct sock *prev_ssk = NULL, *ssk = NULL;
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1459,25 +1529,20 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
- /* try to keep the subflow socket lock across
- * consecutive xmit on the same socket
+ /* First check. If the ssk has changed since
+ * the last round, release prev_ssk
*/
if (ssk != prev_ssk && prev_ssk)
mptcp_push_release(sk, prev_ssk, &info);
if (!ssk)
goto out;
- if (ssk != prev_ssk || !prev_ssk)
- lock_sock(ssk);
-
- /* keep it simple and always provide a new skb for the
- * subflow, even if we will not use it when collapsing
- * on the pending one
+ /* Need to lock the new subflow only if different
+ * from the previous one, otherwise we are still
+ * helding the relevant lock
*/
- if (!mptcp_alloc_tx_skb(sk, ssk)) {
- mptcp_push_release(sk, ssk, &info);
- goto out;
- }
+ if (ssk != prev_ssk)
+ lock_sock(ssk);
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0) {
@@ -1501,18 +1566,19 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
mptcp_push_release(sk, ssk, &info);
out:
- if (copied) {
- /* start the timer, if it's not pending */
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
+ /* ensure the rtx timer is running */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+ if (copied)
__mptcp_check_send_data_fin(sk);
- }
}
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_sendmsg_info info;
+ struct mptcp_sendmsg_info info = {
+ .data_lock_held = true,
+ };
struct mptcp_data_frag *dfrag;
struct sock *xmit_ssk;
int len, copied = 0;
@@ -1538,13 +1604,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
goto out;
}
- if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
- __mptcp_update_wmem(sk);
- sk_mem_reclaim_partial(sk);
- }
- if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC))
- goto out;
-
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0)
goto out;
@@ -1567,7 +1626,6 @@ out:
*/
__mptcp_update_wmem(sk);
if (copied) {
- mptcp_set_timeout(sk, ssk);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
if (!mptcp_timer_pending(sk))
@@ -2083,10 +2141,11 @@ static void mptcp_timeout_timer(struct timer_list *t)
*
* A backup subflow is returned only if that is the only kind available.
*/
-static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
+static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{
+ struct sock *backup = NULL, *pick = NULL;
struct mptcp_subflow_context *subflow;
- struct sock *backup = NULL;
+ int min_stale_count = INT_MAX;
sock_owned_by_me((const struct sock *)msk);
@@ -2096,14 +2155,14 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- if (!mptcp_subflow_active(subflow))
+ if (!__mptcp_subflow_active(subflow))
continue;
- /* still data outstanding at TCP level? Don't retransmit. */
- if (!tcp_write_queue_empty(ssk)) {
- if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss)
- continue;
- return NULL;
+ /* still data outstanding at TCP level? skip this */
+ if (!tcp_rtx_and_write_queues_empty(ssk)) {
+ mptcp_pm_subflow_chk_stale(msk, ssk);
+ min_stale_count = min_t(int, min_stale_count, subflow->stale_count);
+ continue;
}
if (subflow->backup) {
@@ -2112,10 +2171,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
continue;
}
- return ssk;
+ if (!pick)
+ pick = ssk;
}
- return backup;
+ if (pick)
+ return pick;
+
+ /* use backup only if there are no progresses anywhere */
+ return min_stale_count > 1 ? backup : NULL;
}
static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
@@ -2126,6 +2190,50 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk)
}
}
+bool __mptcp_retransmit_pending_data(struct sock *sk)
+{
+ struct mptcp_data_frag *cur, *rtx_head;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (__mptcp_check_fallback(mptcp_sk(sk)))
+ return false;
+
+ if (tcp_rtx_and_write_queues_empty(sk))
+ return false;
+
+ /* the closing socket has some data untransmitted and/or unacked:
+ * some data in the mptcp rtx queue has not really xmitted yet.
+ * keep it simple and re-inject the whole mptcp level rtx queue
+ */
+ mptcp_data_lock(sk);
+ __mptcp_clean_una_wakeup(sk);
+ rtx_head = mptcp_rtx_head(sk);
+ if (!rtx_head) {
+ mptcp_data_unlock(sk);
+ return false;
+ }
+
+ /* will accept ack for reijected data before re-sending them */
+ if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
+ msk->recovery_snd_nxt = msk->snd_nxt;
+ msk->recovery = true;
+ mptcp_data_unlock(sk);
+
+ msk->first_pending = rtx_head;
+ msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
+ msk->snd_nxt = rtx_head->data_seq;
+ msk->snd_burst = 0;
+
+ /* be sure to clear the "sent status" on all re-injected fragments */
+ list_for_each_entry(cur, &msk->rtx_queue, list) {
+ if (!cur->already_sent)
+ break;
+ cur->already_sent = 0;
+ }
+
+ return true;
+}
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -2138,6 +2246,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ bool need_push;
list_del(&subflow->node);
@@ -2149,6 +2258,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (ssk->sk_socket)
sock_orphan(ssk);
+ need_push = __mptcp_retransmit_pending_data(sk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2176,6 +2286,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
if (msk->subflow && ssk == msk->subflow->sk)
mptcp_dispose_initial_subflow(msk);
+
+ if (need_push)
+ __mptcp_push_pending(sk, 0);
}
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
@@ -2296,9 +2409,6 @@ static void __mptcp_retrans(struct sock *sk)
info.sent = 0;
info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
while (info.sent < info.limit) {
- if (!mptcp_alloc_tx_skb(sk, ssk))
- break;
-
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0)
break;
@@ -2313,7 +2423,6 @@ static void __mptcp_retrans(struct sock *sk)
info.size_goal);
}
- mptcp_set_timeout(sk, ssk);
release_sock(ssk);
reset_timer:
@@ -2384,10 +2493,12 @@ static int __mptcp_init_sock(struct sock *sk)
msk->wmem_reserved = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->tx_pending_data = 0;
+ msk->timer_ival = TCP_RTO_MIN;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ msk->recovery = false;
mptcp_pm_data_init(msk);
@@ -2472,7 +2583,6 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
tcp_shutdown(ssk, how);
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
- mptcp_set_timeout(sk, ssk);
tcp_send_ack(ssk);
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
@@ -2723,7 +2833,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->token = subflow_req->token;
msk->subflow = NULL;
WRITE_ONCE(msk->fully_established, false);
- if (mp_opt->csum_reqd)
+ if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
WRITE_ONCE(msk->csum_enabled, true);
msk->write_seq = subflow_req->idsn + 1;
@@ -2732,7 +2842,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
- if (mp_opt->mp_capable) {
+ if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) {
msk->can_ack = true;
msk->remote_key = mp_opt->sndr_key;
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);