diff options
Diffstat (limited to '')
-rw-r--r-- | net/mptcp/protocol.c | 1045 |
1 files changed, 695 insertions, 350 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c82a76d2d0bf..b6dc6e260334 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -22,6 +22,7 @@ #endif #include <net/mptcp.h> #include <net/xfrm.h> +#include <asm/ioctls.h> #include "protocol.h" #include "mib.h" @@ -46,9 +47,10 @@ struct mptcp_skb_cb { enum { MPTCP_CMSG_TS = BIT(0), + MPTCP_CMSG_INQ = BIT(1), }; -static struct percpu_counter mptcp_sockets_allocated; +static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); static void __mptcp_check_send_data_fin(struct sock *sk); @@ -115,6 +117,9 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; + + /* This is the first subflow, always with id 0 */ + subflow->local_id_valid = 1; mptcp_sock_graft(msk->first, sk->sk_socket); return 0; @@ -145,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; - kfree_skb_partial(from, fragstolen); + + /* note the fwd memory can reach a negative value after accounting + * for the delta, but the later skb free will restore a non + * negative one + */ atomic_add(delta, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, delta); + kfree_skb_partial(from, fragstolen); + return true; } @@ -162,8 +173,8 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, static void __mptcp_rmem_reclaim(struct sock *sk, int amount) { - amount >>= SK_MEM_QUANTUM_SHIFT; - mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + amount >>= PAGE_SHIFT; + mptcp_sk(sk)->rmem_fwd_alloc -= amount << PAGE_SHIFT; __sk_mem_reduce_allocated(sk, amount); } @@ -176,8 +187,8 @@ static void mptcp_rmem_uncharge(struct sock *sk, int size) reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); /* see sk_mem_uncharge() for the rationale behind the following schema */ - if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD)) - __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK); + if (unlikely(reclaimable >= PAGE_SIZE)) + __mptcp_rmem_reclaim(sk, reclaimable); } static void mptcp_rfree(struct sk_buff *skb) @@ -211,7 +222,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - max_seq = READ_ONCE(msk->rcv_wnd_sent); + max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); @@ -220,7 +231,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, - (unsigned long long)msk->rcv_wnd_sent); + (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -318,20 +329,16 @@ static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) struct mptcp_sock *msk = mptcp_sk(sk); int amt, amount; - if (size < msk->rmem_fwd_alloc) + if (size <= msk->rmem_fwd_alloc) return true; + size -= msk->rmem_fwd_alloc; amt = sk_mem_pages(size); - amount = amt << SK_MEM_QUANTUM_SHIFT; - msk->rmem_fwd_alloc += amount; - if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) { - if (ssk->sk_forward_alloc < amount) { - msk->rmem_fwd_alloc -= amount; - return false; - } + amount = amt << PAGE_SHIFT; + if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) + return false; - ssk->sk_forward_alloc -= amount; - } + msk->rmem_fwd_alloc += amount; return true; } @@ -464,9 +471,12 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) static void mptcp_set_datafin_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + u32 retransmits; + + retransmits = min_t(u32, icsk->icsk_retransmits, + ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); - mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX, - TCP_RTO_MIN << icsk->icsk_retransmits); + mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) @@ -492,19 +502,24 @@ static void mptcp_set_timeout(struct sock *sk) __mptcp_set_timeout(sk, tout); } -static bool tcp_can_send_ack(const struct sock *ssk) +static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } -void mptcp_subflow_send_ack(struct sock *ssk) +void __mptcp_subflow_send_ack(struct sock *ssk) +{ + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); +} + +static void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); - if (tcp_can_send_ack(ssk)) - tcp_send_ack(ssk); + __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } @@ -647,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { - /* if no data is found, a racing workqueue/recvmsg - * already processed the new data, stop here or we - * can enter an infinite loop + /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), + * a different CPU can have already processed the pending + * data, stop here or we can enter an infinite loop */ if (!moved) done = true; @@ -657,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } if (__mptcp_check_fallback(msk)) { - /* if we are running under the workqueue, TCP could have - * collapsed skbs between dummy map creation and now - * be sure to adjust the size + /* Under fallback skbs have no MPTCP extension and TCP could + * collapse them between the dummy map creation and the + * current dequeue. Be sure to adjust the map size. */ map_remaining = skb->len; subflow->map_data_len = skb->len; @@ -738,6 +753,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; + MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->ack_seq = end_seq; @@ -760,7 +776,7 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else - set_bit(MPTCP_ERROR_REPORT, &msk->flags); + __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); } /* If the moves have caught up with the DATA_FIN sequence number @@ -805,47 +821,38 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) mptcp_data_unlock(sk); } -static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) +static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { - struct mptcp_subflow_context *subflow; - bool ret = false; + struct sock *sk = (struct sock *)msk; - if (likely(list_empty(&msk->join_list))) + if (sk->sk_state != TCP_ESTABLISHED) return false; - spin_lock_bh(&msk->join_list_lock); - list_for_each_entry(subflow, &msk->join_list, node) { - u32 sseq = READ_ONCE(subflow->setsockopt_seq); - - mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); - if (READ_ONCE(msk->setsockopt_seq) != sseq) - ret = true; - } - list_splice_tail_init(&msk->join_list, &msk->conn_list); - spin_unlock_bh(&msk->join_list_lock); - - return ret; -} - -void __mptcp_flush_join_list(struct mptcp_sock *msk) -{ - if (likely(!mptcp_do_flush_join_list(msk))) - return; + /* attach to msk socket only after we are sure we will deal with it + * at close time + */ + if (sk->sk_socket && !ssk->sk_socket) + mptcp_sock_graft(ssk, sk->sk_socket); - if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) - mptcp_schedule_work((struct sock *)msk); + mptcp_propagate_sndbuf((struct sock *)msk, ssk); + mptcp_sockopt_sync_locked(msk, ssk); + return true; } -static void mptcp_flush_join_list(struct mptcp_sock *msk) +static void __mptcp_flush_join_list(struct sock *sk) { - bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); - - might_sleep(); + struct mptcp_subflow_context *tmp, *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); - if (!mptcp_do_flush_join_list(msk) && !sync_needed) - return; + list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); - mptcp_sockopt_sync_all(msk); + list_move_tail(&subflow->node, &msk->conn_list); + if (!__mptcp_finish_join(msk, ssk)) + mptcp_subflow_reset(ssk); + unlock_sock_fast(ssk, slow); + } } static bool mptcp_timer_pending(struct sock *sk) @@ -966,23 +973,6 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static void __mptcp_mem_reclaim_partial(struct sock *sk) -{ - int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk); - - lockdep_assert_held_once(&sk->sk_lock.slock); - - __mptcp_rmem_reclaim(sk, reclaimable - 1); - sk_mem_reclaim_partial(sk); -} - -static void mptcp_mem_reclaim_partial(struct sock *sk) -{ - mptcp_data_lock(sk); - __mptcp_mem_reclaim_partial(sk); - mptcp_data_unlock(sk); -} - static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -1002,7 +992,6 @@ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - bool cleaned = false; u64 snd_una; /* on fallback we just need to ignore snd_una, as this is really @@ -1025,7 +1014,6 @@ static void __mptcp_clean_una(struct sock *sk) } dfrag_clear(sk, dfrag); - cleaned = true; } dfrag = mptcp_rtx_head(sk); @@ -1047,7 +1035,6 @@ static void __mptcp_clean_una(struct sock *sk) dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); - cleaned = true; } /* all retransmitted data acked, recovery completed */ @@ -1055,9 +1042,6 @@ static void __mptcp_clean_una(struct sock *sk) msk->recovery = false; out: - if (cleaned && tcp_under_memory_pressure(sk)) - __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt) && snd_una == READ_ONCE(msk->write_seq)) { if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) @@ -1139,18 +1123,21 @@ struct mptcp_sendmsg_info { bool data_lock_held; }; -static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, - int avail_size) +static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, + u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); + u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; - if (!before64(data_seq + avail_size, window_end)) { - u64 allowed_size = window_end - data_seq; + mptcp_snd_wnd = window_end - data_seq; + avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); - return min_t(unsigned int, allowed_size, avail_size); + if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { + tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; @@ -1197,6 +1184,7 @@ static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, g tcp_skb_entail(ssk, skb); return skb; } + tcp_skb_tsorted_anchor_cleanup(skb); kfree_skb(skb); return NULL; } @@ -1205,12 +1193,6 @@ static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, boo { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; - if (unlikely(tcp_under_memory_pressure(sk))) { - if (data_lock_held) - __mptcp_mem_reclaim_partial(sk); - else - mptcp_mem_reclaim_partial(sk); - } return __mptcp_alloc_tx_skb(sk, ssk, gfp); } @@ -1226,6 +1208,22 @@ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } +static void mptcp_update_infinite_map(struct mptcp_sock *msk, + struct sock *ssk, + struct mptcp_ext *mpext) +{ + if (!mpext) + return; + + mpext->infinite_map = 1; + mpext->data_len = 0; + + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); + mptcp_subflow_ctx(ssk)->send_infinite_map = 0; + pr_fallback(msk); + mptcp_do_fallback(ssk); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1248,6 +1246,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, info->limit > dfrag->data_len)) return 0; + if (unlikely(!__tcp_can_send(ssk))) + return -EAGAIN; + /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; @@ -1268,7 +1269,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } @@ -1286,7 +1287,7 @@ alloc_skb: } /* Zero window and all data acked? Probe. */ - copy = mptcp_check_allowed_size(msk, data_seq, copy); + copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); @@ -1357,6 +1358,9 @@ alloc_skb: out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); + if (mptcp_subflow_ctx(ssk)->send_infinite_map) + mptcp_update_infinite_map(msk, ssk, mpext); + trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } @@ -1369,7 +1373,7 @@ out: struct subflow_send_info { struct sock *ssk; - u64 ratio; + u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) @@ -1394,27 +1398,32 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) return __mptcp_subflow_active(subflow); } +#define SSK_MODE_ACTIVE 0 +#define SSK_MODE_BACKUP 1 +#define SSK_MODE_MAX 2 + /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { - struct subflow_send_info send_info[2]; + struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; + u64 linger_time; long tout = 0; - u64 ratio; - u32 pace; sock_owned_by_me(sk); if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; - return sk_stream_memory_free(msk->first) ? msk->first : NULL; + return __tcp_can_send(msk->first) && + sk_stream_memory_free(msk->first) ? msk->first : NULL; } /* re-use last subflow, if the burst allow that */ @@ -1426,10 +1435,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } /* pick the subflow with the lower wmem/wspace ratio */ - for (i = 0; i < 2; ++i) { + for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; - send_info[i].ratio = -1; + send_info[i].linger_time = -1; } + mptcp_for_each_subflow(msk, subflow) { trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); @@ -1438,34 +1448,56 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !subflow->backup; - if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) - continue; - - pace = READ_ONCE(ssk->sk_pacing_rate); - if (!pace) - continue; + pace = subflow->avg_pacing_rate; + if (unlikely(!pace)) { + /* init pacing rate from socket */ + subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); + pace = subflow->avg_pacing_rate; + if (!pace) + continue; + } - ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, - pace); - if (ratio < send_info[subflow->backup].ratio) { + linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); + if (linger_time < send_info[subflow->backup].linger_time) { send_info[subflow->backup].ssk = ssk; - send_info[subflow->backup].ratio = ratio; + send_info[subflow->backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) - send_info[0].ssk = send_info[1].ssk; + send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; + + /* According to the blest algorithm, to avoid HoL blocking for the + * faster flow, we need to: + * - estimate the faster flow linger time + * - use the above to estimate the amount of byte transferred + * by the faster flow + * - check that the amount of queued data is greter than the above, + * otherwise do not use the picked, slower, subflow + * We select the subflow with the shorter estimated time to flush + * the queued mem, which basically ensure the above. We just need + * to check that subflow has a non empty cwin. + */ + ssk = send_info[SSK_MODE_ACTIVE].ssk; + if (!ssk || !sk_stream_memory_free(ssk)) + return NULL; - if (send_info[0].ssk) { - msk->last_snd = send_info[0].ssk; - msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, - tcp_sk(msk->last_snd)->snd_wnd); - return msk->last_snd; + burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); + wmem = READ_ONCE(ssk->sk_wmem_queued); + if (!burst) { + msk->last_snd = NULL; + return ssk; } - return NULL; + subflow = mptcp_subflow_ctx(ssk); + subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + + READ_ONCE(ssk->sk_pacing_rate) * burst, + burst + wmem); + msk->last_snd = ssk; + msk->snd_burst = burst; + return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) @@ -1499,11 +1531,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, msk->snd_nxt = snd_nxt_new; } -static void mptcp_check_and_set_pending(struct sock *sk) +void mptcp_check_and_set_pending(struct sock *sk) { - if (mptcp_send_head(sk) && - !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) - set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + if (mptcp_send_head(sk)) + mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); } void __mptcp_push_pending(struct sock *sk, unsigned int flags) @@ -1513,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) struct mptcp_sendmsg_info info = { .flags = flags, }; + bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len, copied = 0; + int len; while ((dfrag = mptcp_send_head(sk))) { info.sent = dfrag->already_sent; @@ -1524,7 +1556,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) int ret = 0; prev_ssk = ssk; - mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); /* First check. If the ssk has changed since @@ -1544,12 +1575,14 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { + if (ret == -EAGAIN) + continue; mptcp_push_release(ssk, &info); goto out; } + do_check_data_fin = true; info.sent += ret; - copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); @@ -1565,7 +1598,7 @@ out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - if (copied) + if (do_check_data_fin) __mptcp_check_send_data_fin(sk); } @@ -1640,10 +1673,42 @@ static void mptcp_set_nospace(struct sock *sk) set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } +static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, + size_t len, int *copied_syn) +{ + unsigned int saved_flags = msg->msg_flags; + struct mptcp_sock *msk = mptcp_sk(sk); + int ret; + + lock_sock(ssk); + msg->msg_flags |= MSG_DONTWAIT; + msk->connect_flags = O_NONBLOCK; + msk->is_sendmsg = 1; + ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); + msk->is_sendmsg = 0; + msg->msg_flags = saved_flags; + release_sock(ssk); + + /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ + if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { + ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, msg->msg_flags, 1); + + /* Keep the same behaviour of plain TCP: zero the copied bytes in + * case of any error, except timeout or signal + */ + if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) + *copied_syn = 0; + } + + return ret; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; + struct socket *ssock; size_t copied = 0; int ret = 0; long timeo; @@ -1657,14 +1722,30 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) { + int copied_syn = 0; + + ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); + copied += copied_syn; + if (ret == -EINPROGRESS && copied_syn > 0) + goto out; + else if (ret) + goto do_error; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) - goto out; + goto do_error; } + ret = -EPIPE; + if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) + goto do_error; + pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { @@ -1673,11 +1754,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) bool dfrag_collapsed; size_t psize, offset; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ @@ -1709,7 +1785,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { ret = -EFAULT; - goto out; + goto do_error; } /* data successfully copied into the write queue */ @@ -1741,7 +1817,7 @@ wait_for_memory: __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) - goto out; + goto do_error; } if (copied) @@ -1749,7 +1825,14 @@ wait_for_memory: out: release_sock(sk); - return copied ? : ret; + return copied; + +do_error: + if (copied) + goto out; + + copied = sk_stream_error(sk, msg->msg_flags, ret); + goto out; } static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, @@ -1784,8 +1867,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, copied += count; if (count < data_len) { - if (!(flags & MSG_PEEK)) + if (!(flags & MSG_PEEK)) { MPTCP_SKB_CB(skb)->offset += count; + MPTCP_SKB_CB(skb)->map_seq += count; + } break; } @@ -1851,7 +1936,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvmem, rcvbuf; u64 rcvwin, grow; @@ -1869,7 +1954,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(rcvwin, advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; @@ -1927,7 +2012,6 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) unsigned int moved = 0; bool ret, done; - mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; @@ -1965,8 +2049,29 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) return !skb_queue_empty(&msk->receive_queue); } +static unsigned int mptcp_inq_hint(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + const struct sk_buff *skb; + + skb = skb_peek(&msk->receive_queue); + if (skb) { + u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + + if (hint_val >= INT_MAX) + return INT_MAX; + + return (unsigned int)hint_val; + } + + if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) + return 1; + + return 0; +} + static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) + int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; @@ -1984,11 +2089,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out_err; } - timeo = sock_rcvtimeo(sk, nonblock); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + if (unlikely(msk->recvmsg_inq)) + cmsg_flags = MPTCP_CMSG_INQ; + while (copied < len) { int bytes_read; @@ -2062,6 +2170,12 @@ out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); + + if (cmsg_flags & MPTCP_CMSG_INQ) { + unsigned int inq = mptcp_inq_hint(sk); + + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + } } pr_debug("msk=%p rx queue empty=%d:%d copied=%d", @@ -2088,7 +2202,7 @@ static void mptcp_retransmit_timer(struct timer_list *t) mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ - set_bit(MPTCP_RETRANSMIT, &msk->flags); + __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); } bh_unlock_sock(sk); sock_put(sk); @@ -2196,6 +2310,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) return true; } +/* flags for __mptcp_close_ssk() */ +#define MPTCP_CF_PUSH BIT(1) +#define MPTCP_CF_FASTCLOSE BIT(2) + /* subflow sockets can be either outgoing (connect) or incoming * (accept). * @@ -2205,22 +2323,43 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) * parent socket. */ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow) + struct mptcp_subflow_context *subflow, + unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push; + bool need_push, dispose_it; - list_del(&subflow->node); + dispose_it = !msk->subflow || ssk != msk->subflow->sk; + if (dispose_it) + list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + if (flags & MPTCP_CF_FASTCLOSE) { + /* be sure to force the tcp_disconnect() path, + * to generate the egress reset + */ + ssk->sk_lingertime = 0; + sock_set_flag(ssk, SOCK_LINGER); + subflow->send_fastclose = 1; + } + + need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); + if (!dispose_it) { + tcp_disconnect(ssk, 0); + msk->subflow->state = SS_UNCONNECTED; + mptcp_subflow_ctx_reset(subflow); + release_sock(ssk); + + goto out; + } + /* if we are invoked by the msk cleanup code, the subflow is * already orphaned */ if (ssk->sk_socket) sock_orphan(ssk); - need_push = __mptcp_retransmit_pending_data(sk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2231,6 +2370,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(ssk); + inet_csk_listen_stop(ssk); + } __tcp_close(ssk, 0); /* close acquired an extra ref */ @@ -2240,14 +2384,12 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, sock_put(ssk); - if (ssk == msk->last_snd) - msk->last_snd = NULL; - if (ssk == msk->first) msk->first = NULL; - if (msk->subflow && ssk == msk->subflow->sk) - mptcp_dispose_initial_subflow(msk); +out: + if (ssk == msk->last_snd) + msk->last_snd = NULL; if (need_push) __mptcp_push_pending(sk, 0); @@ -2258,7 +2400,13 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, { if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); - __mptcp_close_ssk(sk, ssk, subflow); + + /* subflow aborted before reaching the fully_established status + * attempt the creation of the next subflow + */ + mptcp_pm_subflow_check_next(mptcp_sk(sk), ssk, subflow); + + __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) @@ -2272,7 +2420,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) might_sleep(); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (inet_sk_state_load(ssk) != TCP_CLOSE) @@ -2315,7 +2463,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) mptcp_token_destroy(msk); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; @@ -2327,12 +2475,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) unlock_sock_fast(tcp_sk, slow); } + /* Mirror the tcp_reset() error propagation */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); - mptcp_close_wake_up(sk); + /* the calling mptcp_worker will properly destroy the socket */ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) @@ -2387,6 +2554,7 @@ static void __mptcp_retrans(struct sock *sk) dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); @@ -2398,10 +2566,60 @@ reset_timer: mptcp_reset_timer(sk); } +/* schedule the timeout timer for the relevant event: either close timeout + * or mp_fail timeout. The close timeout takes precedence on the mp_fail one + */ +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout) +{ + struct sock *sk = (struct sock *)msk; + unsigned long timeout, close_timeout; + + if (!fail_tout && !sock_flag(sk, SOCK_DEAD)) + return; + + close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN; + + /* the close timeout takes precedence on the fail one, and here at least one of + * them is active + */ + timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout; + + sk_reset_timer(sk, &sk->sk_timer, timeout); +} + +static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) +{ + struct sock *ssk = msk->first; + bool slow; + + if (!ssk) + return; + + pr_debug("MP_FAIL doesn't respond, reset the subflow"); + + slow = lock_sock_fast(ssk); + mptcp_subflow_reset(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); + unlock_sock_fast(ssk, slow); + + mptcp_reset_timeout(msk, 0); +} + +static void mptcp_do_fastclose(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), + subflow, MPTCP_CF_FASTCLOSE); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = &msk->sk.icsk_inet.sk; + unsigned long fail_tout; int state; lock_sock(sk); @@ -2410,12 +2628,10 @@ static void mptcp_worker(struct work_struct *work) goto unlock; mptcp_check_data_fin_ack(sk); - mptcp_flush_join_list(msk); mptcp_check_fastclose(msk); - if (msk->pm.status) - mptcp_pm_nl_work(msk); + mptcp_pm_nl_work(msk); if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); @@ -2427,11 +2643,15 @@ static void mptcp_worker(struct work_struct *work) * closed, but we need the msk around to reply to incoming DATA_FIN, * even if it is orphaned and in FIN_WAIT2 state */ - if (sock_flag(sk, SOCK_DEAD) && - (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { - inet_sk_state_store(sk, TCP_CLOSE); - __mptcp_destroy_sock(sk); - goto unlock; + if (sock_flag(sk, SOCK_DEAD)) { + if (mptcp_check_close_timeout(sk)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + goto unlock; + } } if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2440,6 +2660,10 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); + fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; + if (fail_tout && time_after(jiffies, fail_tout)) + mptcp_mp_fail_no_response(msk); + unlock: release_sock(sk); sock_put(sk); @@ -2449,8 +2673,6 @@ static int __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - spin_lock_init(&msk->join_list_lock); - INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); @@ -2465,6 +2687,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; mptcp_pm_data_init(msk); @@ -2476,9 +2699,20 @@ static int __mptcp_init_sock(struct sock *sk) return 0; } -static int mptcp_init_sock(struct sock *sk) +static void mptcp_ca_reset(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_assign_congestion_control(sk); + strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); + + /* no need to keep a reference to the ops, the name will suffice */ + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = NULL; +} + +static int mptcp_init_sock(struct sock *sk) +{ struct net *net = sock_net(sk); int ret; @@ -2499,16 +2733,11 @@ static int mptcp_init_sock(struct sock *sk) /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ - tcp_assign_congestion_control(sk); - strcpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name); - - /* no need to keep a reference to the ops, the name will suffice */ - tcp_cleanup_congestion_control(sk); - icsk->icsk_ca_ops = NULL; + mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); return 0; } @@ -2523,7 +2752,7 @@ static void __mptcp_clear_xmit(struct sock *sk) dfrag_clear(sk, dfrag); } -static void mptcp_cancel_work(struct sock *sk) +void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -2609,6 +2838,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk) * state now */ if (__mptcp_check_fallback(msk)) { + WRITE_ONCE(msk->snd_una, msk->write_seq); if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_close_wake_up(sk); @@ -2617,7 +2847,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk) } } - mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2642,31 +2871,16 @@ static void __mptcp_wr_shutdown(struct sock *sk) static void __mptcp_destroy_sock(struct sock *sk) { - struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); - LIST_HEAD(conn_list); pr_debug("msk=%p", msk); might_sleep(); - /* be sure to always acquire the join list lock, to sync vs - * mptcp_finish_join(). - */ - spin_lock_bh(&msk->join_list_lock); - list_splice_tail_init(&msk->join_list, &msk->conn_list); - spin_unlock_bh(&msk->join_list_lock); - list_splice_init(&msk->conn_list, &conn_list); - - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; - list_for_each_entry_safe(subflow, tmp, &conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - __mptcp_close_ssk(sk, ssk, subflow); - } - sk->sk_prot->destroy(sk); WARN_ON_ONCE(msk->rmem_fwd_alloc); @@ -2675,16 +2889,27 @@ static void __mptcp_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); - mptcp_dispose_initial_subflow(msk); sock_put(sk); } -static void mptcp_close(struct sock *sk, long timeout) +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; +} + +bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; - lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { @@ -2692,18 +2917,29 @@ static void mptcp_close(struct sock *sk, long timeout) goto cleanup; } - if (mptcp_close_state(sk)) + if (mptcp_check_readable(msk)) { + /* the msk has read data, do the MPTCP equivalent of TCP reset */ + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); + } sk_stream_wait_close(sk, timeout); cleanup: /* orphan all the subflows */ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; - mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + /* since the close timeout takes precedence on the fail one, + * cancel the latter + */ + if (ssk == msk->first) + subflow->fail_tout = 0; + sock_orphan(ssk); unlock_sock_fast(ssk, slow); } @@ -2711,23 +2947,34 @@ cleanup: sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); + if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { - sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + mptcp_reset_timeout(msk, 0); } + + return do_cancel_work; +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + bool do_cancel_work; + + lock_sock(sk); + + do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); - if (mptcp_sk(sk)->token) - mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); - sock_put(sk); } -static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); @@ -2752,18 +2999,37 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) static int mptcp_disconnect(struct sock *sk, int flags) { - struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_do_flush_join_list(msk); + inet_sk_state_store(sk, TCP_CLOSE); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_stop_timer(sk); + sk_stop_timer(sk, &sk->sk_timer); - lock_sock(ssk); - tcp_disconnect(ssk, flags); - release_sock(ssk); - } + if (mptcp_sk(sk)->token) + mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + + /* msk->subflow is still intact, the following will not free the first + * subflow + */ + mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); + msk->last_snd = NULL; + WRITE_ONCE(msk->flags, 0); + msk->cb_flags = 0; + msk->push_pending = 0; + msk->recovery = false; + msk->can_ack = false; + msk->fully_established = false; + msk->rcv_data_fin = false; + msk->snd_data_fin_enable = false; + msk->rcv_fastclose = false; + msk->use_64bit_ack = false; + WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + mptcp_pm_data_reset(msk); + mptcp_ca_reset(sk); + + sk->sk_shutdown = 0; + sk_error_report(sk); return 0; } @@ -2815,7 +3081,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -2879,7 +3145,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, */ if (WARN_ON_ONCE(!new_mptcp_sock)) { tcp_sk(newsk)->is_mptcp = 0; - return newsk; + goto out; } /* acquire the 2nd reference for the owning socket */ @@ -2891,19 +3157,28 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } +out: + newsk->sk_kern_sock = kern; return newsk; } -void mptcp_destroy_common(struct mptcp_sock *msk) +void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { + struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); + /* join list will be eventually flushed (with rst) at sock lock release time */ + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ + mptcp_data_lock(sk); skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); + mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it @@ -2912,13 +3187,18 @@ void mptcp_destroy_common(struct mptcp_sock *msk) msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); + mptcp_free_local_addr_list(msk); } static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - mptcp_destroy_common(msk); + /* clears msk->subflow, allowing the following to close + * even the initial subflow + */ + mptcp_dispose_initial_subflow(msk); + mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } @@ -2927,7 +3207,7 @@ void __mptcp_data_acked(struct sock *sk) if (!sock_owned_by_user(sk)) __mptcp_clean_una(sk); else - set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); if (mptcp_pending_data_fin_ack(sk)) mptcp_schedule_work(sk); @@ -2946,20 +3226,23 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) else if (xmit_ssk) mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND); } else { - set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); } } +#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ + BIT(MPTCP_RETRANSMIT) | \ + BIT(MPTCP_FLUSH_JOIN_LIST)) + /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) + __must_hold(&sk->sk_lock.slock) { - for (;;) { - unsigned long flags = 0; + struct mptcp_sock *msk = mptcp_sk(sk); - if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) - flags |= BIT(MPTCP_PUSH_PENDING); - if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags)) - flags |= BIT(MPTCP_RETRANSMIT); + for (;;) { + unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | + msk->push_pending; if (!flags) break; @@ -2970,8 +3253,11 @@ static void mptcp_release_cb(struct sock *sk) * datapath acquires the msk socket spinlock while helding * the subflow socket lock */ - + msk->push_pending = 0; + msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); + if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) + __mptcp_flush_join_list(sk); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) @@ -2981,15 +3267,19 @@ static void mptcp_release_cb(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); } - /* be sure to set the current sk state before tacking actions - * depending on sk_state - */ - if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) - __mptcp_set_connected(sk); - if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) + if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); - if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) - __mptcp_error_report(sk); + if (unlikely(&msk->cb_flags)) { + /* be sure to set the current sk state before tacking actions + * depending on sk_state, that is processing MPTCP_ERROR_REPORT + */ + if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) + __mptcp_set_connected(sk); + if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) + __mptcp_error_report(sk); + if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) + msk->last_snd = NULL; + } __mptcp_update_rmem(sk); } @@ -3030,7 +3320,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk) if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk); else - set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); + __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); mptcp_subflow_delegated_done(subflow, MPTCP_DELEGATE_SEND); } @@ -3093,9 +3383,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); mptcp_pm_new_connection(msk, ssk, 0); @@ -3116,8 +3406,7 @@ bool mptcp_finish_join(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; - struct socket *parent_sock; - bool ret; + bool ret = true; pr_debug("msk=%p, subflow=%p", msk, subflow); @@ -3127,38 +3416,39 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - if (!msk->pm.server_side) + if (!list_empty(&subflow->node)) goto out; - if (!mptcp_pm_allow_new_subflow(msk)) { - subflow->reset_reason = MPTCP_RST_EPROHIBIT; - return false; - } + if (!mptcp_pm_allow_new_subflow(msk)) + goto err_prohibited; - /* active connections are already on conn_list, and we can't acquire - * msk lock here. - * use the join list lock as synchronization point and double-check - * msk status to avoid racing with __mptcp_destroy_sock() + /* active connections are already on conn_list. + * If we can't acquire msk socket lock here, let the release callback + * handle it */ - spin_lock_bh(&msk->join_list_lock); - ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; - if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { - list_add_tail(&subflow->node, &msk->join_list); + mptcp_data_lock(parent); + if (!sock_owned_by_user(parent)) { + ret = __mptcp_finish_join(msk, ssk); + if (ret) { + sock_hold(ssk); + list_add_tail(&subflow->node, &msk->conn_list); + } + } else { sock_hold(ssk); + list_add_tail(&subflow->node, &msk->join_list); + __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); } - spin_unlock_bh(&msk->join_list_lock); + mptcp_data_unlock(parent); + if (!ret) { +err_prohibited: subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; } - /* attach to msk socket only after we are sure he will deal with us - * at close time - */ - parent_sock = READ_ONCE(parent->sk_socket); - if (parent_sock && !ssk->sk_socket) - mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); + WRITE_ONCE(msk->allow_infinite_fallback, false); + out: mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); return true; @@ -3177,10 +3467,135 @@ static int mptcp_forward_alloc_get(const struct sock *sk) return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc; } +static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) +{ + const struct sock *sk = (void *)msk; + u64 delta; + + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + return 0; + + delta = msk->write_seq - v; + if (__mptcp_check_fallback(msk) && msk->first) { + struct tcp_sock *tp = tcp_sk(msk->first); + + /* the first subflow is disconnected after close - see + * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq + * so ignore that status, too. + */ + if (!((1 << msk->first->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) + delta += READ_ONCE(tp->write_seq) - tp->snd_una; + } + if (delta > INT_MAX) + delta = INT_MAX; + + return (int)delta; +} + +static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + bool slow; + int answ; + + switch (cmd) { + case SIOCINQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + lock_sock(sk); + __mptcp_move_skbs(msk); + answ = mptcp_inq_hint(sk); + release_sock(sk); + break; + case SIOCOUTQ: + slow = lock_sock_fast(sk); + answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); + unlock_sock_fast(sk, slow); + break; + case SIOCOUTQNSD: + slow = lock_sock_fast(sk); + answ = mptcp_ioctl_outq(msk, msk->snd_nxt); + unlock_sock_fast(sk, slow); + break; + default: + return -ENOIOCTLCMD; + } + + return put_user(answ, (int __user *)arg); +} + +static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} + +static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + int err = -EINVAL; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + return -EINVAL; + + mptcp_token_destroy(msk); + inet_sk_state_store(sk, TCP_SYN_SENT); + subflow = mptcp_subflow_ctx(ssock->sk); +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_early_fallback(msk, subflow); +#endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { + MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); + mptcp_subflow_early_fallback(msk, subflow); + } + if (likely(!__mptcp_check_fallback(msk))) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); + + /* if reaching here via the fastopen/sendmsg path, the caller already + * acquired the subflow socket lock, too. + */ + if (msk->is_sendmsg) + err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); + else + err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); + inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; + + /* on successful connect, the msk state will be moved to established by + * subflow_finish_connect() + */ + if (unlikely(err && err != -EINPROGRESS)) { + inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + return err; + } + + mptcp_copy_inaddrs(sk, ssock->sk); + + /* unblocking connect, mptcp-level inet_stream_connect will error out + * without changing the socket state, update it here. + */ + if (err == -EINPROGRESS) + sk->sk_socket->state = ssock->state; + return err; +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, + .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .accept = mptcp_accept, @@ -3189,6 +3604,7 @@ static struct proto mptcp_prot = { .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, + .ioctl = mptcp_ioctl, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, .hash = mptcp_hash, @@ -3196,7 +3612,10 @@ static struct proto mptcp_prot = { .get_port = mptcp_get_port, .forward_alloc_get = mptcp_forward_alloc_get, .sockets_allocated = &mptcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, + .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), @@ -3228,68 +3647,16 @@ unlock: return err; } -static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) -{ - subflow->request_mptcp = 0; - __mptcp_do_fallback(msk); -} - static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { - struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct mptcp_subflow_context *subflow; - struct socket *ssock; - int err; + int ret; lock_sock(sock->sk); - if (sock->state != SS_UNCONNECTED && msk->subflow) { - /* pending connection or invalid state, let existing subflow - * cope with that - */ - ssock = msk->subflow; - goto do_connect; - } - - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - err = -EINVAL; - goto unlock; - } - - mptcp_token_destroy(msk); - inet_sk_state_store(sock->sk, TCP_SYN_SENT); - subflow = mptcp_subflow_ctx(ssock->sk); -#ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_early_fallback(msk, subflow); -#endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { - MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); - mptcp_subflow_early_fallback(msk, subflow); - } - if (likely(!__mptcp_check_fallback(msk))) - MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); - -do_connect: - err = ssock->ops->connect(ssock, uaddr, addr_len, flags); - sock->state = ssock->state; - - /* on successful connect, the msk state will be moved to established by - * subflow_finish_connect() - */ - if (!err || err == -EINPROGRESS) - mptcp_copy_inaddrs(sock->sk, ssock->sk); - else - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - -unlock: + mptcp_sk(sock->sk)->connect_flags = flags; + ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); - return err; + return ret; } static int mptcp_listen(struct socket *sock, int backlog) @@ -3330,17 +3697,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, pr_debug("msk=%p", msk); - lock_sock(sock->sk); - if (sock->sk->sk_state != TCP_LISTEN) - goto unlock_fail; - ssock = __mptcp_nmpc_socket(msk); if (!ssock) - goto unlock_fail; - - clear_bit(MPTCP_DATA_READY, &msk->flags); - sock_hold(ssock->sk); - release_sock(sock->sk); + return -EINVAL; err = ssock->ops->accept(sock, newsock, flags, kern); if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { @@ -3363,14 +3722,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (mptcp_is_fully_established(newsk)) mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); - mptcp_copy_inaddrs(newsk, msk->first); mptcp_rcv_space_init(msk, msk->first); mptcp_propagate_sndbuf(newsk, msk->first); /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ - mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -3380,26 +3737,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, release_sock(newsk); } - if (inet_csk_listen_poll(ssock->sk)) - set_bit(MPTCP_DATA_READY, &msk->flags); - sock_put(ssock->sk); return err; - -unlock_fail: - release_sock(sock->sk); - return -EINVAL; -} - -static __poll_t mptcp_check_readable(struct mptcp_sock *msk) -{ - /* Concurrent splices from sk_receive_queue into receive_queue will - * always show at least one non-empty queue when checked in this order. - */ - if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && - skb_queue_empty_lockless(&msk->receive_queue)) - return 0; - - return EPOLLIN | EPOLLRDNORM; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) @@ -3433,19 +3771,26 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); - if (state == TCP_LISTEN) - return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0; + if (state == TCP_LISTEN) { + if (WARN_ON_ONCE(!msk->subflow || !msk->subflow->sk)) + return 0; + + return inet_csk_listen_poll(msk->subflow->sk); + } if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); + } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + /* cf tcp_poll() note about TFO */ + mask |= EPOLLOUT | EPOLLWRNORM; } if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (sk->sk_err) mask |= EPOLLERR; @@ -3530,8 +3875,8 @@ void __init mptcp_proto_init(void) for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); - netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, + mptcp_napi_poll); napi_enable(&delegated->napi); } |