aboutsummaryrefslogtreecommitdiffstats
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--net/mptcp/protocol.c1045
1 files changed, 695 insertions, 350 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c82a76d2d0bf..b6dc6e260334 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -22,6 +22,7 @@
#endif
#include <net/mptcp.h>
#include <net/xfrm.h>
+#include <asm/ioctls.h>
#include "protocol.h"
#include "mib.h"
@@ -46,9 +47,10 @@ struct mptcp_skb_cb {
enum {
MPTCP_CMSG_TS = BIT(0),
+ MPTCP_CMSG_INQ = BIT(1),
};
-static struct percpu_counter mptcp_sockets_allocated;
+static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);
@@ -115,6 +117,9 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
list_add(&subflow->node, &msk->conn_list);
sock_hold(ssock->sk);
subflow->request_mptcp = 1;
+
+ /* This is the first subflow, always with id 0 */
+ subflow->local_id_valid = 1;
mptcp_sock_graft(msk->first, sk->sk_socket);
return 0;
@@ -145,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
to->len, MPTCP_SKB_CB(from)->end_seq);
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
- kfree_skb_partial(from, fragstolen);
+
+ /* note the fwd memory can reach a negative value after accounting
+ * for the delta, but the later skb free will restore a non
+ * negative one
+ */
atomic_add(delta, &sk->sk_rmem_alloc);
mptcp_rmem_charge(sk, delta);
+ kfree_skb_partial(from, fragstolen);
+
return true;
}
@@ -162,8 +173,8 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
- mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ amount >>= PAGE_SHIFT;
+ mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
@@ -176,8 +187,8 @@ static void mptcp_rmem_uncharge(struct sock *sk, int size)
reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
/* see sk_mem_uncharge() for the rationale behind the following schema */
- if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
- __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK);
+ if (unlikely(reclaimable >= PAGE_SIZE))
+ __mptcp_rmem_reclaim(sk, reclaimable);
}
static void mptcp_rfree(struct sk_buff *skb)
@@ -211,7 +222,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
seq = MPTCP_SKB_CB(skb)->map_seq;
end_seq = MPTCP_SKB_CB(skb)->end_seq;
- max_seq = READ_ONCE(msk->rcv_wnd_sent);
+ max_seq = atomic64_read(&msk->rcv_wnd_sent);
pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
RB_EMPTY_ROOT(&msk->out_of_order_queue));
@@ -220,7 +231,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
mptcp_drop(sk, skb);
pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
(unsigned long long)end_seq - (unsigned long)max_seq,
- (unsigned long long)msk->rcv_wnd_sent);
+ (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
return;
}
@@ -318,20 +329,16 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
struct mptcp_sock *msk = mptcp_sk(sk);
int amt, amount;
- if (size < msk->rmem_fwd_alloc)
+ if (size <= msk->rmem_fwd_alloc)
return true;
+ size -= msk->rmem_fwd_alloc;
amt = sk_mem_pages(size);
- amount = amt << SK_MEM_QUANTUM_SHIFT;
- msk->rmem_fwd_alloc += amount;
- if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
- if (ssk->sk_forward_alloc < amount) {
- msk->rmem_fwd_alloc -= amount;
- return false;
- }
+ amount = amt << PAGE_SHIFT;
+ if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV))
+ return false;
- ssk->sk_forward_alloc -= amount;
- }
+ msk->rmem_fwd_alloc += amount;
return true;
}
@@ -464,9 +471,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
static void mptcp_set_datafin_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 retransmits;
+
+ retransmits = min_t(u32, icsk->icsk_retransmits,
+ ilog2(TCP_RTO_MAX / TCP_RTO_MIN));
- mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX,
- TCP_RTO_MIN << icsk->icsk_retransmits);
+ mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits;
}
static void __mptcp_set_timeout(struct sock *sk, long tout)
@@ -492,19 +502,24 @@ static void mptcp_set_timeout(struct sock *sk)
__mptcp_set_timeout(sk, tout);
}
-static bool tcp_can_send_ack(const struct sock *ssk)
+static inline bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
(TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
}
-void mptcp_subflow_send_ack(struct sock *ssk)
+void __mptcp_subflow_send_ack(struct sock *ssk)
+{
+ if (tcp_can_send_ack(ssk))
+ tcp_send_ack(ssk);
+}
+
+static void mptcp_subflow_send_ack(struct sock *ssk)
{
bool slow;
slow = lock_sock_fast(ssk);
- if (tcp_can_send_ack(ssk))
- tcp_send_ack(ssk);
+ __mptcp_subflow_send_ack(ssk);
unlock_sock_fast(ssk, slow);
}
@@ -647,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
skb = skb_peek(&ssk->sk_receive_queue);
if (!skb) {
- /* if no data is found, a racing workqueue/recvmsg
- * already processed the new data, stop here or we
- * can enter an infinite loop
+ /* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
+ * a different CPU can have already processed the pending
+ * data, stop here or we can enter an infinite loop
*/
if (!moved)
done = true;
@@ -657,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
}
if (__mptcp_check_fallback(msk)) {
- /* if we are running under the workqueue, TCP could have
- * collapsed skbs between dummy map creation and now
- * be sure to adjust the size
+ /* Under fallback skbs have no MPTCP extension and TCP could
+ * collapse them between the dummy map creation and the
+ * current dequeue. Be sure to adjust the map size.
*/
map_remaining = skb->len;
subflow->map_data_len = skb->len;
@@ -738,6 +753,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
delta);
MPTCP_SKB_CB(skb)->offset += delta;
+ MPTCP_SKB_CB(skb)->map_seq += delta;
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
msk->ack_seq = end_seq;
@@ -760,7 +776,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
if (!sock_owned_by_user(sk))
__mptcp_error_report(sk);
else
- set_bit(MPTCP_ERROR_REPORT, &msk->flags);
+ __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
}
/* If the moves have caught up with the DATA_FIN sequence number
@@ -805,47 +821,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
mptcp_data_unlock(sk);
}
-static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
+static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
{
- struct mptcp_subflow_context *subflow;
- bool ret = false;
+ struct sock *sk = (struct sock *)msk;
- if (likely(list_empty(&msk->join_list)))
+ if (sk->sk_state != TCP_ESTABLISHED)
return false;
- spin_lock_bh(&msk->join_list_lock);
- list_for_each_entry(subflow, &msk->join_list, node) {
- u32 sseq = READ_ONCE(subflow->setsockopt_seq);
-
- mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
- if (READ_ONCE(msk->setsockopt_seq) != sseq)
- ret = true;
- }
- list_splice_tail_init(&msk->join_list, &msk->conn_list);
- spin_unlock_bh(&msk->join_list_lock);
-
- return ret;
-}
-
-void __mptcp_flush_join_list(struct mptcp_sock *msk)
-{
- if (likely(!mptcp_do_flush_join_list(msk)))
- return;
+ /* attach to msk socket only after we are sure we will deal with it
+ * at close time
+ */
+ if (sk->sk_socket && !ssk->sk_socket)
+ mptcp_sock_graft(ssk, sk->sk_socket);
- if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
- mptcp_schedule_work((struct sock *)msk);
+ mptcp_propagate_sndbuf((struct sock *)msk, ssk);
+ mptcp_sockopt_sync_locked(msk, ssk);
+ return true;
}
-static void mptcp_flush_join_list(struct mptcp_sock *msk)
+static void __mptcp_flush_join_list(struct sock *sk)
{
- bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags);
-
- might_sleep();
+ struct mptcp_subflow_context *tmp, *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
- if (!mptcp_do_flush_join_list(msk) && !sync_needed)
- return;
+ list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ bool slow = lock_sock_fast(ssk);
- mptcp_sockopt_sync_all(msk);
+ list_move_tail(&subflow->node, &msk->conn_list);
+ if (!__mptcp_finish_join(msk, ssk))
+ mptcp_subflow_reset(ssk);
+ unlock_sock_fast(ssk, slow);
+ }
}
static bool mptcp_timer_pending(struct sock *sk)
@@ -966,23 +973,6 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
df->data_seq + df->data_len == msk->write_seq;
}
-static void __mptcp_mem_reclaim_partial(struct sock *sk)
-{
- int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
-
- lockdep_assert_held_once(&sk->sk_lock.slock);
-
- __mptcp_rmem_reclaim(sk, reclaimable - 1);
- sk_mem_reclaim_partial(sk);
-}
-
-static void mptcp_mem_reclaim_partial(struct sock *sk)
-{
- mptcp_data_lock(sk);
- __mptcp_mem_reclaim_partial(sk);
- mptcp_data_unlock(sk);
-}
-
static void dfrag_uncharge(struct sock *sk, int len)
{
sk_mem_uncharge(sk, len);
@@ -1002,7 +992,6 @@ static void __mptcp_clean_una(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- bool cleaned = false;
u64 snd_una;
/* on fallback we just need to ignore snd_una, as this is really
@@ -1025,7 +1014,6 @@ static void __mptcp_clean_una(struct sock *sk)
}
dfrag_clear(sk, dfrag);
- cleaned = true;
}
dfrag = mptcp_rtx_head(sk);
@@ -1047,7 +1035,6 @@ static void __mptcp_clean_una(struct sock *sk)
dfrag->already_sent -= delta;
dfrag_uncharge(sk, delta);
- cleaned = true;
}
/* all retransmitted data acked, recovery completed */
@@ -1055,9 +1042,6 @@ static void __mptcp_clean_una(struct sock *sk)
msk->recovery = false;
out:
- if (cleaned && tcp_under_memory_pressure(sk))
- __mptcp_mem_reclaim_partial(sk);
-
if (snd_una == READ_ONCE(msk->snd_nxt) &&
snd_una == READ_ONCE(msk->write_seq)) {
if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
@@ -1139,18 +1123,21 @@ struct mptcp_sendmsg_info {
bool data_lock_held;
};
-static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq,
- int avail_size)
+static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
+ u64 data_seq, int avail_size)
{
u64 window_end = mptcp_wnd_end(msk);
+ u64 mptcp_snd_wnd;
if (__mptcp_check_fallback(msk))
return avail_size;
- if (!before64(data_seq + avail_size, window_end)) {
- u64 allowed_size = window_end - data_seq;
+ mptcp_snd_wnd = window_end - data_seq;
+ avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size);
- return min_t(unsigned int, allowed_size, avail_size);
+ if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) {
+ tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd);
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED);
}
return avail_size;
@@ -1197,6 +1184,7 @@ static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, g
tcp_skb_entail(ssk, skb);
return skb;
}
+ tcp_skb_tsorted_anchor_cleanup(skb);
kfree_skb(skb);
return NULL;
}
@@ -1205,12 +1193,6 @@ static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, boo
{
gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
- if (unlikely(tcp_under_memory_pressure(sk))) {
- if (data_lock_held)
- __mptcp_mem_reclaim_partial(sk);
- else
- mptcp_mem_reclaim_partial(sk);
- }
return __mptcp_alloc_tx_skb(sk, ssk, gfp);
}
@@ -1226,6 +1208,22 @@ static void mptcp_update_data_checksum(struct sk_buff *skb, int added)
mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset));
}
+static void mptcp_update_infinite_map(struct mptcp_sock *msk,
+ struct sock *ssk,
+ struct mptcp_ext *mpext)
+{
+ if (!mpext)
+ return;
+
+ mpext->infinite_map = 1;
+ mpext->data_len = 0;
+
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
+ mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
+ pr_fallback(msk);
+ mptcp_do_fallback(ssk);
+}
+
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_data_frag *dfrag,
struct mptcp_sendmsg_info *info)
@@ -1248,6 +1246,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
info->limit > dfrag->data_len))
return 0;
+ if (unlikely(!__tcp_can_send(ssk)))
+ return -EAGAIN;
+
/* compute send limit */
info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
copy = info->size_goal;
@@ -1268,7 +1269,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
- if (!can_coalesce && i >= sysctl_max_skb_frags) {
+ if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
tcp_mark_push(tcp_sk(ssk), skb);
goto alloc_skb;
}
@@ -1286,7 +1287,7 @@ alloc_skb:
}
/* Zero window and all data acked? Probe. */
- copy = mptcp_check_allowed_size(msk, data_seq, copy);
+ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy);
if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
@@ -1357,6 +1358,9 @@ alloc_skb:
out:
if (READ_ONCE(msk->csum_enabled))
mptcp_update_data_checksum(skb, copy);
+ if (mptcp_subflow_ctx(ssk)->send_infinite_map)
+ mptcp_update_infinite_map(msk, ssk, mpext);
+ trace_mptcp_sendmsg_frag(mpext);
mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
return copy;
}
@@ -1369,7 +1373,7 @@ out:
struct subflow_send_info {
struct sock *ssk;
- u64 ratio;
+ u64 linger_time;
};
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
@@ -1394,27 +1398,32 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
return __mptcp_subflow_active(subflow);
}
+#define SSK_MODE_ACTIVE 0
+#define SSK_MODE_BACKUP 1
+#define SSK_MODE_MAX 2
+
/* implement the mptcp packet scheduler;
* returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout
*/
static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
- struct subflow_send_info send_info[2];
+ struct subflow_send_info send_info[SSK_MODE_MAX];
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
+ u32 pace, burst, wmem;
int i, nr_active = 0;
struct sock *ssk;
+ u64 linger_time;
long tout = 0;
- u64 ratio;
- u32 pace;
sock_owned_by_me(sk);
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
- return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+ return __tcp_can_send(msk->first) &&
+ sk_stream_memory_free(msk->first) ? msk->first : NULL;
}
/* re-use last subflow, if the burst allow that */
@@ -1426,10 +1435,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
/* pick the subflow with the lower wmem/wspace ratio */
- for (i = 0; i < 2; ++i) {
+ for (i = 0; i < SSK_MODE_MAX; ++i) {
send_info[i].ssk = NULL;
- send_info[i].ratio = -1;
+ send_info[i].linger_time = -1;
}
+
mptcp_for_each_subflow(msk, subflow) {
trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow);
@@ -1438,34 +1448,56 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
tout = max(tout, mptcp_timeout_from_subflow(subflow));
nr_active += !subflow->backup;
- if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
- continue;
-
- pace = READ_ONCE(ssk->sk_pacing_rate);
- if (!pace)
- continue;
+ pace = subflow->avg_pacing_rate;
+ if (unlikely(!pace)) {
+ /* init pacing rate from socket */
+ subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
+ pace = subflow->avg_pacing_rate;
+ if (!pace)
+ continue;
+ }
- ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
- pace);
- if (ratio < send_info[subflow->backup].ratio) {
+ linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
+ if (linger_time < send_info[subflow->backup].linger_time) {
send_info[subflow->backup].ssk = ssk;
- send_info[subflow->backup].ratio = ratio;
+ send_info[subflow->backup].linger_time = linger_time;
}
}
__mptcp_set_timeout(sk, tout);
/* pick the best backup if no other subflow is active */
if (!nr_active)
- send_info[0].ssk = send_info[1].ssk;
+ send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;
+
+ /* According to the blest algorithm, to avoid HoL blocking for the
+ * faster flow, we need to:
+ * - estimate the faster flow linger time
+ * - use the above to estimate the amount of byte transferred
+ * by the faster flow
+ * - check that the amount of queued data is greter than the above,
+ * otherwise do not use the picked, slower, subflow
+ * We select the subflow with the shorter estimated time to flush
+ * the queued mem, which basically ensure the above. We just need
+ * to check that subflow has a non empty cwin.
+ */
+ ssk = send_info[SSK_MODE_ACTIVE].ssk;
+ if (!ssk || !sk_stream_memory_free(ssk))
+ return NULL;
- if (send_info[0].ssk) {
- msk->last_snd = send_info[0].ssk;
- msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
- tcp_sk(msk->last_snd)->snd_wnd);
- return msk->last_snd;
+ burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+ wmem = READ_ONCE(ssk->sk_wmem_queued);
+ if (!burst) {
+ msk->last_snd = NULL;
+ return ssk;
}
- return NULL;
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
+ READ_ONCE(ssk->sk_pacing_rate) * burst,
+ burst + wmem);
+ msk->last_snd = ssk;
+ msk->snd_burst = burst;
+ return ssk;
}
static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
@@ -1499,11 +1531,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
msk->snd_nxt = snd_nxt_new;
}
-static void mptcp_check_and_set_pending(struct sock *sk)
+void mptcp_check_and_set_pending(struct sock *sk)
{
- if (mptcp_send_head(sk) &&
- !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
- set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ if (mptcp_send_head(sk))
+ mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
}
void __mptcp_push_pending(struct sock *sk, unsigned int flags)
@@ -1513,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
struct mptcp_sendmsg_info info = {
.flags = flags,
};
+ bool do_check_data_fin = false;
struct mptcp_data_frag *dfrag;
- int len, copied = 0;
+ int len;
while ((dfrag = mptcp_send_head(sk))) {
info.sent = dfrag->already_sent;
@@ -1524,7 +1556,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
int ret = 0;
prev_ssk = ssk;
- mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
/* First check. If the ssk has changed since
@@ -1544,12 +1575,14 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
if (ret <= 0) {
+ if (ret == -EAGAIN)
+ continue;
mptcp_push_release(ssk, &info);
goto out;
}
+ do_check_data_fin = true;
info.sent += ret;
- copied += ret;
len -= ret;
mptcp_update_post_push(msk, dfrag, ret);
@@ -1565,7 +1598,7 @@ out:
/* ensure the rtx timer is running */
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
- if (copied)
+ if (do_check_data_fin)
__mptcp_check_send_data_fin(sk);
}
@@ -1640,10 +1673,42 @@ static void mptcp_set_nospace(struct sock *sk)
set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
}
+static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg,
+ size_t len, int *copied_syn)
+{
+ unsigned int saved_flags = msg->msg_flags;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int ret;
+
+ lock_sock(ssk);
+ msg->msg_flags |= MSG_DONTWAIT;
+ msk->connect_flags = O_NONBLOCK;
+ msk->is_sendmsg = 1;
+ ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
+ msk->is_sendmsg = 0;
+ msg->msg_flags = saved_flags;
+ release_sock(ssk);
+
+ /* do the blocking bits of inet_stream_connect outside the ssk socket lock */
+ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) {
+ ret = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, msg->msg_flags, 1);
+
+ /* Keep the same behaviour of plain TCP: zero the copied bytes in
+ * case of any error, except timeout or signal
+ */
+ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
+ *copied_syn = 0;
+ }
+
+ return ret;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct page_frag *pfrag;
+ struct socket *ssock;
size_t copied = 0;
int ret = 0;
long timeo;
@@ -1657,14 +1722,30 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) {
+ int copied_syn = 0;
+
+ ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn);
+ copied += copied_syn;
+ if (ret == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (ret)
+ goto do_error;
+ }
+
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
ret = sk_stream_wait_connect(sk, &timeo);
if (ret)
- goto out;
+ goto do_error;
}
+ ret = -EPIPE;
+ if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
+ goto do_error;
+
pfrag = sk_page_frag(sk);
while (msg_data_left(msg)) {
@@ -1673,11 +1754,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
bool dfrag_collapsed;
size_t psize, offset;
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
- ret = -EPIPE;
- goto out;
- }
-
/* reuse tail pfrag, if possible, or carve a new one from the
* page allocator
*/
@@ -1709,7 +1785,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
ret = -EFAULT;
- goto out;
+ goto do_error;
}
/* data successfully copied into the write queue */
@@ -1741,7 +1817,7 @@ wait_for_memory:
__mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
- goto out;
+ goto do_error;
}
if (copied)
@@ -1749,7 +1825,14 @@ wait_for_memory:
out:
release_sock(sk);
- return copied ? : ret;
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+
+ copied = sk_stream_error(sk, msg->msg_flags, ret);
+ goto out;
}
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
@@ -1784,8 +1867,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
copied += count;
if (count < data_len) {
- if (!(flags & MSG_PEEK))
+ if (!(flags & MSG_PEEK)) {
MPTCP_SKB_CB(skb)->offset += count;
+ MPTCP_SKB_CB(skb)->map_seq += count;
+ }
break;
}
@@ -1851,7 +1936,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvmem, rcvbuf;
u64 rcvwin, grow;
@@ -1869,7 +1954,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
do_div(rcvwin, advmss);
rcvbuf = min_t(u64, rcvwin * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp;
@@ -1927,7 +2012,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool ret, done;
- mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath;
@@ -1965,8 +2049,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
return !skb_queue_empty(&msk->receive_queue);
}
+static unsigned int mptcp_inq_hint(const struct sock *sk)
+{
+ const struct mptcp_sock *msk = mptcp_sk(sk);
+ const struct sk_buff *skb;
+
+ skb = skb_peek(&msk->receive_queue);
+ if (skb) {
+ u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;
+
+ if (hint_val >= INT_MAX)
+ return INT_MAX;
+
+ return (unsigned int)hint_val;
+ }
+
+ if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+ return 1;
+
+ return 0;
+}
+
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct scm_timestamping_internal tss;
@@ -1984,11 +2089,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out_err;
}
- timeo = sock_rcvtimeo(sk, nonblock);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ if (unlikely(msk->recvmsg_inq))
+ cmsg_flags = MPTCP_CMSG_INQ;
+
while (copied < len) {
int bytes_read;
@@ -2062,6 +2170,12 @@ out_err:
if (cmsg_flags && copied >= 0) {
if (cmsg_flags & MPTCP_CMSG_TS)
tcp_recv_timestamp(msg, sk, &tss);
+
+ if (cmsg_flags & MPTCP_CMSG_INQ) {
+ unsigned int inq = mptcp_inq_hint(sk);
+
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ }
}
pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
@@ -2088,7 +2202,7 @@ static void mptcp_retransmit_timer(struct timer_list *t)
mptcp_schedule_work(sk);
} else {
/* delegate our work to tcp_release_cb() */
- set_bit(MPTCP_RETRANSMIT, &msk->flags);
+ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);
}
bh_unlock_sock(sk);
sock_put(sk);
@@ -2196,6 +2310,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
return true;
}
+/* flags for __mptcp_close_ssk() */
+#define MPTCP_CF_PUSH BIT(1)
+#define MPTCP_CF_FASTCLOSE BIT(2)
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -2205,22 +2323,43 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
* parent socket.
*/
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
- struct mptcp_subflow_context *subflow)
+ struct mptcp_subflow_context *subflow,
+ unsigned int flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- bool need_push;
+ bool need_push, dispose_it;
- list_del(&subflow->node);
+ dispose_it = !msk->subflow || ssk != msk->subflow->sk;
+ if (dispose_it)
+ list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
+ if (flags & MPTCP_CF_FASTCLOSE) {
+ /* be sure to force the tcp_disconnect() path,
+ * to generate the egress reset
+ */
+ ssk->sk_lingertime = 0;
+ sock_set_flag(ssk, SOCK_LINGER);
+ subflow->send_fastclose = 1;
+ }
+
+ need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
+ if (!dispose_it) {
+ tcp_disconnect(ssk, 0);
+ msk->subflow->state = SS_UNCONNECTED;
+ mptcp_subflow_ctx_reset(subflow);
+ release_sock(ssk);
+
+ goto out;
+ }
+
/* if we are invoked by the msk cleanup code, the subflow is
* already orphaned
*/
if (ssk->sk_socket)
sock_orphan(ssk);
- need_push = __mptcp_retransmit_pending_data(sk);
subflow->disposable = 1;
/* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
@@ -2231,6 +2370,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
kfree_rcu(subflow, rcu);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
+ if (ssk->sk_state == TCP_LISTEN) {
+ tcp_set_state(ssk, TCP_CLOSE);
+ mptcp_subflow_queue_clean(ssk);
+ inet_csk_listen_stop(ssk);
+ }
__tcp_close(ssk, 0);
/* close acquired an extra ref */
@@ -2240,14 +2384,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
sock_put(ssk);
- if (ssk == msk->last_snd)
- msk->last_snd = NULL;
-
if (ssk == msk->first)
msk->first = NULL;
- if (msk->subflow && ssk == msk->subflow->sk)
- mptcp_dispose_initial_subflow(msk);
+out:
+ if (ssk == msk->last_snd)
+ msk->last_snd = NULL;
if (need_push)
__mptcp_push_pending(sk, 0);
@@ -2258,7 +2400,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
{
if (sk->sk_state == TCP_ESTABLISHED)
mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);
- __mptcp_close_ssk(sk, ssk, subflow);
+
+ /* subflow aborted before reaching the fully_established status
+ * attempt the creation of the next subflow
+ */
+ mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow);
+
+ __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
}
static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
@@ -2272,7 +2420,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk)
might_sleep();
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (inet_sk_state_load(ssk) != TCP_CLOSE)
@@ -2315,7 +2463,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
mptcp_token_destroy(msk);
- list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
bool slow;
@@ -2327,12 +2475,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
unlock_sock_fast(tcp_sk, slow);
}
+ /* Mirror the tcp_reset() error propagation */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->sk_err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ sk->sk_err = ECONNRESET;
+ }
+
inet_sk_state_store(sk, TCP_CLOSE);
sk->sk_shutdown = SHUTDOWN_MASK;
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
- mptcp_close_wake_up(sk);
+ /* the calling mptcp_worker will properly destroy the socket */
+ if (sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk->sk_state_change(sk);
+ sk_error_report(sk);
}
static void __mptcp_retrans(struct sock *sk)
@@ -2387,6 +2554,7 @@ static void __mptcp_retrans(struct sock *sk)
dfrag->already_sent = max(dfrag->already_sent, info.sent);
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
}
release_sock(ssk);
@@ -2398,10 +2566,60 @@ reset_timer:
mptcp_reset_timer(sk);
}
+/* schedule the timeout timer for the relevant event: either close timeout
+ * or mp_fail timeout. The close timeout takes precedence on the mp_fail one
+ */
+void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned long timeout, close_timeout;
+
+ if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
+ return;
+
+ close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
+
+ /* the close timeout takes precedence on the fail one, and here at least one of
+ * them is active
+ */
+ timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
+
+ sk_reset_timer(sk, &sk->sk_timer, timeout);
+}
+
+static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
+{
+ struct sock *ssk = msk->first;
+ bool slow;
+
+ if (!ssk)
+ return;
+
+ pr_debug("MP_FAIL doesn't respond, reset the subflow");
+
+ slow = lock_sock_fast(ssk);
+ mptcp_subflow_reset(ssk);
+ WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
+ unlock_sock_fast(ssk, slow);
+
+ mptcp_reset_timeout(msk, 0);
+}
+
+static void mptcp_do_fastclose(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
+ subflow, MPTCP_CF_FASTCLOSE);
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *sk = &msk->sk.icsk_inet.sk;
+ unsigned long fail_tout;
int state;
lock_sock(sk);
@@ -2410,12 +2628,10 @@ static void mptcp_worker(struct work_struct *work)
goto unlock;
mptcp_check_data_fin_ack(sk);
- mptcp_flush_join_list(msk);
mptcp_check_fastclose(msk);
- if (msk->pm.status)
- mptcp_pm_nl_work(msk);
+ mptcp_pm_nl_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
@@ -2427,11 +2643,15 @@ static void mptcp_worker(struct work_struct *work)
* closed, but we need the msk around to reply to incoming DATA_FIN,
* even if it is orphaned and in FIN_WAIT2 state
*/
- if (sock_flag(sk, SOCK_DEAD) &&
- (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) {
- inet_sk_state_store(sk, TCP_CLOSE);
- __mptcp_destroy_sock(sk);
- goto unlock;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ if (mptcp_check_close_timeout(sk)) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_do_fastclose(sk);
+ }
+ if (sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ goto unlock;
+ }
}
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@@ -2440,6 +2660,10 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
+ fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0;
+ if (fail_tout && time_after(jiffies, fail_tout))
+ mptcp_mp_fail_no_response(msk);
+
unlock:
release_sock(sk);
sock_put(sk);
@@ -2449,8 +2673,6 @@ static int __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- spin_lock_init(&msk->join_list_lock);
-
INIT_LIST_HEAD(&msk->conn_list);
INIT_LIST_HEAD(&msk->join_list);
INIT_LIST_HEAD(&msk->rtx_queue);
@@ -2465,6 +2687,7 @@ static int __mptcp_init_sock(struct sock *sk)
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ WRITE_ONCE(msk->allow_infinite_fallback, true);
msk->recovery = false;
mptcp_pm_data_init(msk);
@@ -2476,9 +2699,20 @@ static int __mptcp_init_sock(struct sock *sk)
return 0;
}
-static int mptcp_init_sock(struct sock *sk)
+static void mptcp_ca_reset(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_assign_congestion_control(sk);
+ strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
+
+ /* no need to keep a reference to the ops, the name will suffice */
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = NULL;
+}
+
+static int mptcp_init_sock(struct sock *sk)
+{
struct net *net = sock_net(sk);
int ret;
@@ -2499,16 +2733,11 @@ static int mptcp_init_sock(struct sock *sk)
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
* propagate the correct value
*/
- tcp_assign_congestion_control(sk);
- strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name);
-
- /* no need to keep a reference to the ops, the name will suffice */
- tcp_cleanup_congestion_control(sk);
- icsk->icsk_ca_ops = NULL;
+ mptcp_ca_reset(sk);
sk_sockets_allocated_inc(sk);
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
+ sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
return 0;
}
@@ -2523,7 +2752,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
dfrag_clear(sk, dfrag);
}
-static void mptcp_cancel_work(struct sock *sk)
+void mptcp_cancel_work(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -2609,6 +2838,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
* state now
*/
if (__mptcp_check_fallback(msk)) {
+ WRITE_ONCE(msk->snd_una, msk->write_seq);
if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_close_wake_up(sk);
@@ -2617,7 +2847,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
}
}
- mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@@ -2642,31 +2871,16 @@ static void __mptcp_wr_shutdown(struct sock *sk)
static void __mptcp_destroy_sock(struct sock *sk)
{
- struct mptcp_subflow_context *subflow, *tmp;
struct mptcp_sock *msk = mptcp_sk(sk);
- LIST_HEAD(conn_list);
pr_debug("msk=%p", msk);
might_sleep();
- /* be sure to always acquire the join list lock, to sync vs
- * mptcp_finish_join().
- */
- spin_lock_bh(&msk->join_list_lock);
- list_splice_tail_init(&msk->join_list, &msk->conn_list);
- spin_unlock_bh(&msk->join_list_lock);
- list_splice_init(&msk->conn_list, &conn_list);
-
- sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+ mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
msk->pm.status = 0;
- list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- __mptcp_close_ssk(sk, ssk, subflow);
- }
-
sk->sk_prot->destroy(sk);
WARN_ON_ONCE(msk->rmem_fwd_alloc);
@@ -2675,16 +2889,27 @@ static void __mptcp_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
- mptcp_dispose_initial_subflow(msk);
sock_put(sk);
}
-static void mptcp_close(struct sock *sk, long timeout)
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+ /* Concurrent splices from sk_receive_queue into receive_queue will
+ * always show at least one non-empty queue when checked in this order.
+ */
+ if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
+ skb_queue_empty_lockless(&msk->receive_queue))
+ return 0;
+
+ return EPOLLIN | EPOLLRDNORM;
+}
+
+bool __mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
- lock_sock(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
@@ -2692,18 +2917,29 @@ static void mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
- if (mptcp_close_state(sk))
+ if (mptcp_check_readable(msk)) {
+ /* the msk has read data, do the MPTCP equivalent of TCP reset */
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_do_fastclose(sk);
+ } else if (mptcp_close_state(sk)) {
__mptcp_wr_shutdown(sk);
+ }
sk_stream_wait_close(sk, timeout);
cleanup:
/* orphan all the subflows */
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
- mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+ mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
+ /* since the close timeout takes precedence on the fail one,
+ * cancel the latter
+ */
+ if (ssk == msk->first)
+ subflow->fail_tout = 0;
+
sock_orphan(ssk);
unlock_sock_fast(ssk, slow);
}
@@ -2711,23 +2947,34 @@ cleanup:
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
+ if (mptcp_sk(sk)->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
+
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
- sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
+ mptcp_reset_timeout(msk, 0);
}
+
+ return do_cancel_work;
+}
+
+static void mptcp_close(struct sock *sk, long timeout)
+{
+ bool do_cancel_work;
+
+ lock_sock(sk);
+
+ do_cancel_work = __mptcp_close(sk, timeout);
release_sock(sk);
if (do_cancel_work)
mptcp_cancel_work(sk);
- if (mptcp_sk(sk)->token)
- mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
-
sock_put(sk);
}
-static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
+void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
@@ -2752,18 +2999,37 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
static int mptcp_disconnect(struct sock *sk, int flags)
{
- struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_do_flush_join_list(msk);
+ inet_sk_state_store(sk, TCP_CLOSE);
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ mptcp_stop_timer(sk);
+ sk_stop_timer(sk, &sk->sk_timer);
- lock_sock(ssk);
- tcp_disconnect(ssk, flags);
- release_sock(ssk);
- }
+ if (mptcp_sk(sk)->token)
+ mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
+
+ /* msk->subflow is still intact, the following will not free the first
+ * subflow
+ */
+ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
+ msk->last_snd = NULL;
+ WRITE_ONCE(msk->flags, 0);
+ msk->cb_flags = 0;
+ msk->push_pending = 0;
+ msk->recovery = false;
+ msk->can_ack = false;
+ msk->fully_established = false;
+ msk->rcv_data_fin = false;
+ msk->snd_data_fin_enable = false;
+ msk->rcv_fastclose = false;
+ msk->use_64bit_ack = false;
+ WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
+ mptcp_pm_data_reset(msk);
+ mptcp_ca_reset(sk);
+
+ sk->sk_shutdown = 0;
+ sk_error_report(sk);
return 0;
}
@@ -2815,7 +3081,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
ack_seq++;
WRITE_ONCE(msk->ack_seq, ack_seq);
- WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+ atomic64_set(&msk->rcv_wnd_sent, ack_seq);
}
sock_reset_flag(nsk, SOCK_RCU_FREE);
@@ -2879,7 +3145,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
*/
if (WARN_ON_ONCE(!new_mptcp_sock)) {
tcp_sk(newsk)->is_mptcp = 0;
- return newsk;
+ goto out;
}
/* acquire the 2nd reference for the owning socket */
@@ -2891,19 +3157,28 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
}
+out:
+ newsk->sk_kern_sock = kern;
return newsk;
}
-void mptcp_destroy_common(struct mptcp_sock *msk)
+void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
{
+ struct mptcp_subflow_context *subflow, *tmp;
struct sock *sk = (struct sock *)msk;
__mptcp_clear_xmit(sk);
+ /* join list will be eventually flushed (with rst) at sock lock release time */
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
+
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+ mptcp_data_lock(sk);
skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_receive_queue);
skb_rbtree_purge(&msk->out_of_order_queue);
+ mptcp_data_unlock(sk);
/* move all the rx fwd alloc into the sk_mem_reclaim_final in
* inet_sock_destruct() will dispose it
@@ -2912,13 +3187,18 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
msk->rmem_fwd_alloc = 0;
mptcp_token_destroy(msk);
mptcp_pm_free_anno_list(msk);
+ mptcp_free_local_addr_list(msk);
}
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- mptcp_destroy_common(msk);
+ /* clears msk->subflow, allowing the following to close
+ * even the initial subflow
+ */
+ mptcp_dispose_initial_subflow(msk);
+ mptcp_destroy_common(msk, 0);
sk_sockets_allocated_dec(sk);
}
@@ -2927,7 +3207,7 @@ void __mptcp_data_acked(struct sock *sk)
if (!sock_owned_by_user(sk))
__mptcp_clean_una(sk);
else
- set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
if (mptcp_pending_data_fin_ack(sk))
mptcp_schedule_work(sk);
@@ -2946,20 +3226,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
else if (xmit_ssk)
mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND);
} else {
- set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
}
}
+#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
+ BIT(MPTCP_RETRANSMIT) | \
+ BIT(MPTCP_FLUSH_JOIN_LIST))
+
/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
+ __must_hold(&sk->sk_lock.slock)
{
- for (;;) {
- unsigned long flags = 0;
+ struct mptcp_sock *msk = mptcp_sk(sk);
- if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
- flags |= BIT(MPTCP_PUSH_PENDING);
- if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags))
- flags |= BIT(MPTCP_RETRANSMIT);
+ for (;;) {
+ unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
+ msk->push_pending;
if (!flags)
break;
@@ -2970,8 +3253,11 @@ static void mptcp_release_cb(struct sock *sk)
* datapath acquires the msk socket spinlock while helding
* the subflow socket lock
*/
-
+ msk->push_pending = 0;
+ msk->cb_flags &= ~flags;
spin_unlock_bh(&sk->sk_lock.slock);
+ if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
+ __mptcp_flush_join_list(sk);
if (flags & BIT(MPTCP_PUSH_PENDING))
__mptcp_push_pending(sk, 0);
if (flags & BIT(MPTCP_RETRANSMIT))
@@ -2981,15 +3267,19 @@ static void mptcp_release_cb(struct sock *sk)
spin_lock_bh(&sk->sk_lock.slock);
}
- /* be sure to set the current sk state before tacking actions
- * depending on sk_state
- */
- if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags))
- __mptcp_set_connected(sk);
- if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags))
+ if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
__mptcp_clean_una_wakeup(sk);
- if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
- __mptcp_error_report(sk);
+ if (unlikely(&msk->cb_flags)) {
+ /* be sure to set the current sk state before tacking actions
+ * depending on sk_state, that is processing MPTCP_ERROR_REPORT
+ */
+ if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
+ __mptcp_set_connected(sk);
+ if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
+ __mptcp_error_report(sk);
+ if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
+ msk->last_snd = NULL;
+ }
__mptcp_update_rmem(sk);
}
@@ -3030,7 +3320,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk);
else
- set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
mptcp_data_unlock(sk);
mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND);
}
@@ -3093,9 +3383,9 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
WRITE_ONCE(msk->ack_seq, ack_seq);
- WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
WRITE_ONCE(msk->snd_una, msk->write_seq);
+ atomic64_set(&msk->rcv_wnd_sent, ack_seq);
mptcp_pm_new_connection(msk, ssk, 0);
@@ -3116,8 +3406,7 @@ bool mptcp_finish_join(struct sock *ssk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct sock *parent = (void *)msk;
- struct socket *parent_sock;
- bool ret;
+ bool ret = true;
pr_debug("msk=%p, subflow=%p", msk, subflow);
@@ -3127,38 +3416,39 @@ bool mptcp_finish_join(struct sock *ssk)
return false;
}
- if (!msk->pm.server_side)
+ if (!list_empty(&subflow->node))
goto out;
- if (!mptcp_pm_allow_new_subflow(msk)) {
- subflow->reset_reason = MPTCP_RST_EPROHIBIT;
- return false;
- }
+ if (!mptcp_pm_allow_new_subflow(msk))
+ goto err_prohibited;
- /* active connections are already on conn_list, and we can't acquire
- * msk lock here.
- * use the join list lock as synchronization point and double-check
- * msk status to avoid racing with __mptcp_destroy_sock()
+ /* active connections are already on conn_list.
+ * If we can't acquire msk socket lock here, let the release callback
+ * handle it
*/
- spin_lock_bh(&msk->join_list_lock);
- ret = inet_sk_state_load(parent) == TCP_ESTABLISHED;
- if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) {
- list_add_tail(&subflow->node, &msk->join_list);
+ mptcp_data_lock(parent);
+ if (!sock_owned_by_user(parent)) {
+ ret = __mptcp_finish_join(msk, ssk);
+ if (ret) {
+ sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->conn_list);
+ }
+ } else {
sock_hold(ssk);
+ list_add_tail(&subflow->node, &msk->join_list);
+ __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
}
- spin_unlock_bh(&msk->join_list_lock);
+ mptcp_data_unlock(parent);
+
if (!ret) {
+err_prohibited:
subflow->reset_reason = MPTCP_RST_EPROHIBIT;
return false;
}
- /* attach to msk socket only after we are sure he will deal with us
- * at close time
- */
- parent_sock = READ_ONCE(parent->sk_socket);
- if (parent_sock && !ssk->sk_socket)
- mptcp_sock_graft(ssk, parent_sock);
subflow->map_seq = READ_ONCE(msk->ack_seq);
+ WRITE_ONCE(msk->allow_infinite_fallback, false);
+
out:
mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
return true;
@@ -3177,10 +3467,135 @@ static int mptcp_forward_alloc_get(const struct sock *sk)
return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
}
+static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
+{
+ const struct sock *sk = (void *)msk;
+ u64 delta;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ return 0;
+
+ delta = msk->write_seq - v;
+ if (__mptcp_check_fallback(msk) && msk->first) {
+ struct tcp_sock *tp = tcp_sk(msk->first);
+
+ /* the first subflow is disconnected after close - see
+ * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq
+ * so ignore that status, too.
+ */
+ if (!((1 << msk->first->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)))
+ delta += READ_ONCE(tp->write_seq) - tp->snd_una;
+ }
+ if (delta > INT_MAX)
+ delta = INT_MAX;
+
+ return (int)delta;
+}
+
+static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ bool slow;
+ int answ;
+
+ switch (cmd) {
+ case SIOCINQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ lock_sock(sk);
+ __mptcp_move_skbs(msk);
+ answ = mptcp_inq_hint(sk);
+ release_sock(sk);
+ break;
+ case SIOCOUTQ:
+ slow = lock_sock_fast(sk);
+ answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
+ unlock_sock_fast(sk, slow);
+ break;
+ case SIOCOUTQNSD:
+ slow = lock_sock_fast(sk);
+ answ = mptcp_ioctl_outq(msk, msk->snd_nxt);
+ unlock_sock_fast(sk, slow);
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(answ, (int __user *)arg);
+}
+
+static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ subflow->request_mptcp = 0;
+ __mptcp_do_fallback(msk);
+}
+
+static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+ int err = -EINVAL;
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock)
+ return -EINVAL;
+
+ mptcp_token_destroy(msk);
+ inet_sk_state_store(sk, TCP_SYN_SENT);
+ subflow = mptcp_subflow_ctx(ssock->sk);
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
+ mptcp_subflow_early_fallback(msk, subflow);
+#endif
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
+ MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
+ mptcp_subflow_early_fallback(msk, subflow);
+ }
+ if (likely(!__mptcp_check_fallback(msk)))
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
+
+ /* if reaching here via the fastopen/sendmsg path, the caller already
+ * acquired the subflow socket lock, too.
+ */
+ if (msk->is_sendmsg)
+ err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1);
+ else
+ err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags);
+ inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect;
+
+ /* on successful connect, the msk state will be moved to established by
+ * subflow_finish_connect()
+ */
+ if (unlikely(err && err != -EINPROGRESS)) {
+ inet_sk_state_store(sk, inet_sk_state_load(ssock->sk));
+ return err;
+ }
+
+ mptcp_copy_inaddrs(sk, ssock->sk);
+
+ /* unblocking connect, mptcp-level inet_stream_connect will error out
+ * without changing the socket state, update it here.
+ */
+ if (err == -EINPROGRESS)
+ sk->sk_socket->state = ssock->state;
+ return err;
+}
+
static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
.init = mptcp_init_sock,
+ .connect = mptcp_connect,
.disconnect = mptcp_disconnect,
.close = mptcp_close,
.accept = mptcp_accept,
@@ -3189,6 +3604,7 @@ static struct proto mptcp_prot = {
.shutdown = mptcp_shutdown,
.destroy = mptcp_destroy,
.sendmsg = mptcp_sendmsg,
+ .ioctl = mptcp_ioctl,
.recvmsg = mptcp_recvmsg,
.release_cb = mptcp_release_cb,
.hash = mptcp_hash,
@@ -3196,7 +3612,10 @@ static struct proto mptcp_prot = {
.get_port = mptcp_get_port,
.forward_alloc_get = mptcp_forward_alloc_get,
.sockets_allocated = &mptcp_sockets_allocated,
+
.memory_allocated = &tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
@@ -3228,68 +3647,16 @@ unlock:
return err;
}
-static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
- struct mptcp_subflow_context *subflow)
-{
- subflow->request_mptcp = 0;
- __mptcp_do_fallback(msk);
-}
-
static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
- struct mptcp_sock *msk = mptcp_sk(sock->sk);
- struct mptcp_subflow_context *subflow;
- struct socket *ssock;
- int err;
+ int ret;
lock_sock(sock->sk);
- if (sock->state != SS_UNCONNECTED && msk->subflow) {
- /* pending connection or invalid state, let existing subflow
- * cope with that
- */
- ssock = msk->subflow;
- goto do_connect;
- }
-
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
- goto unlock;
- }
-
- mptcp_token_destroy(msk);
- inet_sk_state_store(sock->sk, TCP_SYN_SENT);
- subflow = mptcp_subflow_ctx(ssock->sk);
-#ifdef CONFIG_TCP_MD5SIG
- /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
- * TCP option space.
- */
- if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
- mptcp_subflow_early_fallback(msk, subflow);
-#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
- MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
- mptcp_subflow_early_fallback(msk, subflow);
- }
- if (likely(!__mptcp_check_fallback(msk)))
- MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE);
-
-do_connect:
- err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
- sock->state = ssock->state;
-
- /* on successful connect, the msk state will be moved to established by
- * subflow_finish_connect()
- */
- if (!err || err == -EINPROGRESS)
- mptcp_copy_inaddrs(sock->sk, ssock->sk);
- else
- inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
-
-unlock:
+ mptcp_sk(sock->sk)->connect_flags = flags;
+ ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
release_sock(sock->sk);
- return err;
+ return ret;
}
static int mptcp_listen(struct socket *sock, int backlog)
@@ -3330,17 +3697,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
pr_debug("msk=%p", msk);
- lock_sock(sock->sk);
- if (sock->sk->sk_state != TCP_LISTEN)
- goto unlock_fail;
-
ssock = __mptcp_nmpc_socket(msk);
if (!ssock)
- goto unlock_fail;
-
- clear_bit(MPTCP_DATA_READY, &msk->flags);
- sock_hold(ssock->sk);
- release_sock(sock->sk);
+ return -EINVAL;
err = ssock->ops->accept(sock, newsock, flags, kern);
if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) {
@@ -3363,14 +3722,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
if (mptcp_is_fully_established(newsk))
mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL);
- mptcp_copy_inaddrs(newsk, msk->first);
mptcp_rcv_space_init(msk, msk->first);
mptcp_propagate_sndbuf(newsk, msk->first);
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
- mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -3380,26 +3737,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
release_sock(newsk);
}
- if (inet_csk_listen_poll(ssock->sk))
- set_bit(MPTCP_DATA_READY, &msk->flags);
- sock_put(ssock->sk);
return err;
-
-unlock_fail:
- release_sock(sock->sk);
- return -EINVAL;
-}
-
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
-{
- /* Concurrent splices from sk_receive_queue into receive_queue will
- * always show at least one non-empty queue when checked in this order.
- */
- if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
- skb_queue_empty_lockless(&msk->receive_queue))
- return 0;
-
- return EPOLLIN | EPOLLRDNORM;
}
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
@@ -3433,19 +3771,26 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
state = inet_sk_state_load(sk);
pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags);
- if (state == TCP_LISTEN)
- return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0;
+ if (state == TCP_LISTEN) {
+ if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk))
+ return 0;
+
+ return inet_csk_listen_poll(msk->subflow->sk);
+ }
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
mask |= mptcp_check_readable(msk);
mask |= mptcp_check_writeable(msk);
+ } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ /* cf tcp_poll() note about TFO */
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= EPOLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
- /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
smp_rmb();
if (sk->sk_err)
mask |= EPOLLERR;
@@ -3530,8 +3875,8 @@ void __init mptcp_proto_init(void)
for_each_possible_cpu(cpu) {
delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
INIT_LIST_HEAD(&delegated->head);
- netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll,
- NAPI_POLL_WEIGHT);
+ netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
+ mptcp_napi_poll);
napi_enable(&delegated->napi);
}