diff options
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 568 |
1 files changed, 393 insertions, 175 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3b75836db19b..54836a6b81d6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -294,6 +294,8 @@ EXPORT_SYMBOL(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); +DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); @@ -429,7 +431,7 @@ void tcp_init_sock(struct sock *sk) * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - tp->snd_cwnd = TCP_INIT_CWND; + tcp_snd_cwnd_set(tp, TCP_INIT_CWND); /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; @@ -441,7 +443,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; - tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; + tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -452,9 +454,10 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); - WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); @@ -686,9 +689,10 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && - sock_net(sk)->ipv4.sysctl_tcp_autocorking && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && - refcount_read(&sk->sk_wmem_alloc) > skb->truesize; + refcount_read(&sk->sk_wmem_alloc) > skb->truesize && + tcp_skb_can_collapse_to(skb); } void tcp_push(struct sock *sk, int flags, int mss_now, @@ -855,9 +859,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, { struct sk_buff *skb; - if (unlikely(tcp_under_memory_pressure(sk))) - sk_mem_reclaim_partial(sk); - skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; @@ -893,8 +894,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ - new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; - new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); + new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size); /* We try hard to avoid divides here */ size_goal = tp->gso_segs * mss_now; @@ -936,6 +936,40 @@ void tcp_remove_empty_skb(struct sock *sk) } } +/* skb changing from pure zc to mixed, must charge zc */ +static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) +{ + if (unlikely(skb_zcopy_pure(skb))) { + u32 extra = skb->truesize - + SKB_TRUESIZE(skb_end_offset(skb)); + + if (!sk_wmem_schedule(sk, extra)) + return -ENOMEM; + + sk_mem_charge(sk, extra); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + return 0; +} + + +static int tcp_wmem_schedule(struct sock *sk, int copy) +{ + int left; + + if (likely(sk_wmem_schedule(sk, copy))) + return copy; + + /* We could be in trouble if we have nothing queued. + * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] + * to guarantee some progress. + */ + left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued; + if (left > 0) + sk_forced_mem_schedule(sk, min(left, copy)); + return min(copy, sk->sk_forward_alloc); +} + static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, struct page *page, int offset, size_t *size) { @@ -967,18 +1001,22 @@ new_segment: i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_wmem_schedule(sk, copy)) + if (tcp_downgrade_zcopy_pure(sk, skb)) + return NULL; + + copy = tcp_wmem_schedule(sk, copy); + if (!copy) return NULL; if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); + skb_fill_page_desc_noacc(skb, i, page, offset, copy); } if (!(flags & MSG_NO_SHARED_FRAGS)) @@ -1125,16 +1163,16 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } } -static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, - int *copied, size_t size, - struct ubuf_info *uarg) +int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, + size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -1186,17 +1224,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) flags = msg->msg_flags; - if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { + if ((flags & MSG_ZEROCOPY) && size) { skb = tcp_write_queue_tail(sk); - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); - if (!uarg) { - err = -ENOBUFS; - goto out_err; - } - zc = sk->sk_route_caps & NETIF_F_SG; - if (!zc) - uarg->zerocopy = 0; + if (msg->msg_ubuf) { + uarg = msg->msg_ubuf; + net_zcopy_get(uarg); + zc = sk->sk_route_caps & NETIF_F_SG; + } else if (sock_flag(sk, SOCK_ZEROCOPY)) { + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); + if (!uarg) { + err = -ENOBUFS; + goto out_err; + } + zc = sk->sk_route_caps & NETIF_F_SG; + if (!zc) + uarg_to_msgzc(uarg)->zerocopy = 0; + } } if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && @@ -1310,7 +1354,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= sysctl_max_skb_frags) { + if (i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1319,16 +1363,14 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); - /* skb changing from pure zc to mixed, must charge zc */ - if (unlikely(skb_zcopy_pure(skb))) { - if (!sk_wmem_schedule(sk, skb->data_len)) + if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { + if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; - - sk_mem_charge(sk, skb->data_len); - skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + skb_zcopy_downgrade_managed(skb); } - if (!sk_wmem_schedule(sk, copy)) + copy = tcp_wmem_schedule(sk, copy); + if (!copy) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, @@ -1355,7 +1397,8 @@ new_segment: skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; if (!skb_zcopy_pure(skb)) { - if (!sk_wmem_schedule(sk, copy)) + copy = tcp_wmem_schedule(sk, copy); + if (!copy) goto wait_for_space; } @@ -1524,17 +1567,11 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -void tcp_cleanup_rbuf(struct sock *sk, int copied) +static void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), - "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", - tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); - if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1580,19 +1617,16 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -void __sk_defer_free_flush(struct sock *sk) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { - struct llist_node *head; - struct sk_buff *skb, *n; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + struct tcp_sock *tp = tcp_sk(sk); - head = llist_del_all(&sk->defer_list); - llist_for_each_entry_safe(skb, n, head, ll_node) { - prefetch(n); - skb_mark_not_on_list(skb); - __kfree_skb(skb); - } + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); + __tcp_cleanup_rbuf(sk, copied); } -EXPORT_SYMBOL(__sk_defer_free_flush); static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { @@ -1601,16 +1635,12 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; - if (!skb_queue_empty(&sk->sk_receive_queue) || - !llist_empty(&sk->defer_list)) { - llist_add(&skb->ll_node, &sk->defer_list); - return; - } + return skb_attempt_defer_free(skb); } __kfree_skb(skb); } -static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; @@ -1633,6 +1663,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) } return NULL; } +EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines @@ -1675,11 +1706,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, if (!copied) copied = used; break; - } else if (used <= len) { - seq += used; - copied += used; - offset += used; } + if (WARN_ON_ONCE(used > len)) + used = len; + seq += used; + copied += used; + offset += used; + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it @@ -1717,6 +1750,90 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + struct sk_buff *skb; + int copied = 0; + u32 offset; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + u8 tcp_flags; + int used; + + __skb_unlink(skb, &sk->sk_receive_queue); + WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); + tcp_flags = TCP_SKB_CB(skb)->tcp_flags; + used = recv_actor(sk, skb); + consume_skb(skb); + if (used < 0) { + if (!copied) + copied = used; + break; + } + seq += used; + copied += used; + + if (tcp_flags & TCPHDR_FIN) { + ++seq; + break; + } + } + WRITE_ONCE(tp->copied_seq, seq); + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (copied > 0) + __tcp_cleanup_rbuf(sk, copied); + + return copied; +} +EXPORT_SYMBOL(tcp_read_skb); + +void tcp_read_done(struct sock *sk, size_t len) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + struct sk_buff *skb; + size_t left; + u32 offset; + + if (sk->sk_state == TCP_LISTEN) + return; + + left = len; + while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + int used; + + used = min_t(size_t, skb->len - offset, left); + seq += used; + left -= used; + + if (skb->len > offset + used) + break; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_eat_recv_skb(sk, skb); + ++seq; + break; + } + tcp_eat_recv_skb(sk, skb); + } + WRITE_ONCE(tp->copied_seq, seq); + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. */ + if (left != len) + tcp_cleanup_rbuf(sk, len - left); +} +EXPORT_SYMBOL(tcp_read_done); + int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); @@ -1731,7 +1848,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else - cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; + cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); @@ -1866,8 +1983,7 @@ static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, } static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, - struct scm_timestamping_internal *tss, + int flags, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, @@ -1889,7 +2005,7 @@ static int receive_fallback_to_copy(struct sock *sk, if (err) return err; - err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0, + err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, tss, &zc->msg_flags); if (err < 0) return err; @@ -2305,8 +2421,7 @@ static int tcp_inq_hint(struct sock *sk) */ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, - struct scm_timestamping_internal *tss, + int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); @@ -2324,9 +2439,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, if (sk->sk_state == TCP_LISTEN) goto out; - if (tp->recvmsg_inq) + if (tp->recvmsg_inq) { *cmsg_flags = TCP_CMSG_INQ; - timeo = sock_rcvtimeo(sk, nonblock); + msg->msg_get_inq = 1; + } + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) @@ -2444,7 +2561,6 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); - sk_defer_free_flush(sk); sk_wait_data(sk, &timeo, last); } @@ -2545,10 +2661,10 @@ recv_sndq: goto out; } -int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, - int flags, int *addr_len) +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, + int *addr_len) { - int cmsg_flags = 0, ret, inq; + int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) @@ -2557,20 +2673,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) - sk_busy_loop(sk, nonblock); + sk_busy_loop(sk, flags & MSG_DONTWAIT); lock_sock(sk); - ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss, - &cmsg_flags); + ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); - sk_defer_free_flush(sk); - if (cmsg_flags && ret >= 0) { + if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); - if (cmsg_flags & TCP_CMSG_INQ) { - inq = tcp_inq_hint(sk); - put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + if (msg->msg_get_inq) { + msg->msg_inq = tcp_inq_hint(sk); + if (cmsg_flags & TCP_CMSG_INQ) + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); } } return ret; @@ -2724,7 +2840,8 @@ static void tcp_orphan_update(struct timer_list *unused) static bool tcp_too_many_orphans(int shift) { - return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans; + return READ_ONCE(tcp_orphan_cache) << shift > + READ_ONCE(sysctl_tcp_max_orphans); } bool tcp_check_oom(struct sock *sk, int shift) @@ -2771,8 +2888,6 @@ void __tcp_close(struct sock *sk, long timeout) __kfree_skb(skb); } - sk_mem_reclaim(sk); - /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; @@ -2880,7 +2995,6 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_mem_reclaim(sk); if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); @@ -2958,7 +3072,6 @@ void tcp_write_queue_purge(struct sock *sk) } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); - sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; @@ -3022,8 +3135,10 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tp->snd_cwnd = TCP_INIT_CWND; + tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; + tp->is_cwnd_limited = 0; + tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; @@ -3088,7 +3203,6 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } - sk_defer_free_flush(sk); sk_error_report(sk); return 0; } @@ -3096,7 +3210,7 @@ EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { - return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && + return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } @@ -3373,8 +3487,8 @@ int tcp_set_window_clamp(struct sock *sk, int val) /* * Socket option code for TCP. */ -static int do_tcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -3396,11 +3510,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; name[val] = 0; - lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, - ns_capable(sock_net(sk)->user_ns, - CAP_NET_ADMIN)); - release_sock(sk); + sockopt_lock_sock(sk); + err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), + sockopt_ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); + sockopt_release_sock(sk); return err; } case TCP_ULP: { @@ -3416,9 +3530,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; name[val] = 0; - lock_sock(sk); + sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); - release_sock(sk); + sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { @@ -3451,7 +3565,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case TCP_MAXSEG: @@ -3533,7 +3647,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; - else if (sk->sk_state == TCP_ESTABLISHED) + else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; @@ -3626,7 +3740,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & + TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else @@ -3672,7 +3787,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; } - release_sock(sk); + sockopt_release_sock(sk); return err; } @@ -3682,8 +3797,9 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, + optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); @@ -3733,7 +3849,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; - info->tcpi_snd_cwnd = tp->snd_cwnd; + info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : @@ -3904,7 +4020,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); - nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); + nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); @@ -3936,15 +4052,15 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, return stats; } -static int do_tcp_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, int __user *optlen) +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int val, len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); @@ -3976,12 +4092,13 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : + READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; + val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, @@ -3993,15 +4110,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_INFO: { struct tcp_info info; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &info, len)) + if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } @@ -4011,7 +4128,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, size_t sz = 0; int attr; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; @@ -4019,9 +4136,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &info, len)) + if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } @@ -4030,27 +4147,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_CONGESTION: - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) + if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { - if (put_user(0, optlen)) + len = 0; + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len)) + if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; @@ -4058,15 +4176,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, key, len)) + if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } @@ -4092,7 +4210,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) @@ -4107,7 +4225,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; - if (copy_to_user(optval, &opt, len)) + if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } @@ -4153,35 +4271,35 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->save_syn; break; case TCP_SAVED_SYN: { - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; - lock_sock(sk); + sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { - if (put_user(tcp_saved_syn_len(tp->saved_syn), - optlen)) { - release_sock(sk); + len = tcp_saved_syn_len(tp->saved_syn); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); return -EFAULT; } - release_sock(sk); + sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); - if (put_user(len, optlen)) { - release_sock(sk); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); return -EFAULT; } - if (copy_to_user(optval, tp->saved_syn->data, len)) { - release_sock(sk); + if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { + sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); - release_sock(sk); + sockopt_release_sock(sk); } else { - release_sock(sk); + sockopt_release_sock(sk); len = 0; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; @@ -4192,32 +4310,31 @@ static int do_tcp_getsockopt(struct sock *sk, int level, struct tcp_zerocopy_receive zc = {}; int err; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { - err = check_zeroed_user(optval + sizeof(zc), - len - sizeof(zc)); + err = check_zeroed_sockptr(optval, sizeof(zc), + len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } - if (copy_from_user(&zc, optval, len)) + if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; - lock_sock(sk); + sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); - release_sock(sk); - sk_defer_free_flush(sk); + sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { @@ -4247,7 +4364,7 @@ zerocopy_rcv_sk_err: zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: - if (!err && copy_to_user(optval, &zc, len)) + if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } @@ -4256,9 +4373,9 @@ zerocopy_rcv_out: return -ENOPROTOOPT; } - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } @@ -4281,9 +4398,11 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, + optval, optlen); + return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); } EXPORT_SYMBOL(tcp_getsockopt); @@ -4329,12 +4448,16 @@ static void __tcp_alloc_md5sig_pool(void) * to memory. See smp_rmb() in tcp_get_md5sig_pool() */ smp_wmb(); - tcp_md5sig_pool_populated = true; + /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool() + * and tcp_get_md5sig_pool(). + */ + WRITE_ONCE(tcp_md5sig_pool_populated, true); } bool tcp_alloc_md5sig_pool(void) { - if (unlikely(!tcp_md5sig_pool_populated)) { + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); if (!tcp_md5sig_pool_populated) { @@ -4345,7 +4468,8 @@ bool tcp_alloc_md5sig_pool(void) mutex_unlock(&tcp_md5sig_mutex); } - return tcp_md5sig_pool_populated; + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + return READ_ONCE(tcp_md5sig_pool_populated); } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -4361,7 +4485,8 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { local_bh_disable(); - if (tcp_md5sig_pool_populated) { + /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */ + if (READ_ONCE(tcp_md5sig_pool_populated)) { /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ smp_rmb(); return this_cpu_ptr(&tcp_md5sig_pool); @@ -4423,6 +4548,82 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke } EXPORT_SYMBOL(tcp_md5_hash_key); +/* Called with rcu_read_lock() */ +enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) +{ + /* + * This gets called for each TCP segment that arrives + * so we want to be efficient. + * We have 3 drop cases: + * o No MD5 hash and one expected. + * o MD5 hash and we're not expecting one. + * o MD5 hash and its wrong. + */ + const __u8 *hash_location = NULL; + struct tcp_md5sig_key *hash_expected; + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + int genhash, l3index; + u8 newhash[16]; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family); + hash_location = tcp_parse_md5sig_option(th); + + /* We've parsed the options - do we have a hash? */ + if (!hash_expected && !hash_location) + return SKB_NOT_DROPPED_YET; + + if (hash_expected && !hash_location) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + return SKB_DROP_REASON_TCP_MD5NOTFOUND; + } + + if (!hash_expected && hash_location) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + return SKB_DROP_REASON_TCP_MD5UNEXPECTED; + } + + /* Check the signature. + * To support dual stack listeners, we need to handle + * IPv4-mapped case. + */ + if (family == AF_INET) + genhash = tcp_v4_md5_hash_skb(newhash, + hash_expected, + NULL, skb); + else + genhash = tp->af_specific->calc_md5_hash(newhash, + hash_expected, + NULL, skb); + + if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); + if (family == AF_INET) { + net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", + saddr, ntohs(th->source), + daddr, ntohs(th->dest), + genhash ? " tcp_v4_calc_md5_hash failed" + : "", l3index); + } else { + net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", + genhash ? "failed" : "mismatch", + saddr, ntohs(th->source), + daddr, ntohs(th->dest), l3index); + } + return SKB_DROP_REASON_TCP_MD5FAILURE; + } + return SKB_NOT_DROPPED_YET; +} +EXPORT_SYMBOL(tcp_inbound_md5_hash); + #endif void tcp_done(struct sock *sk) @@ -4454,16 +4655,24 @@ EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { - if (!sk_fullsock(sk)) { - if (sk->sk_state == TCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); + int state = inet_sk_state_load(sk); - local_bh_disable(); - inet_csk_reqsk_queue_drop(req->rsk_listener, req); - local_bh_enable(); - return 0; - } - return -EOPNOTSUPP; + if (state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + + local_bh_disable(); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + local_bh_enable(); + return 0; + } + if (state == TCP_TIME_WAIT) { + struct inet_timewait_sock *tw = inet_twsk(sk); + + refcount_inc(&tw->tw_refcnt); + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* Don't race with userspace socket closes such as tcp_close. */ @@ -4539,7 +4748,6 @@ void __init tcp_init(void) timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); - inet_hashinfo_init(&tcp_hashinfo); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); @@ -4549,6 +4757,12 @@ void __init tcp_init(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); + tcp_hashinfo.bind2_bucket_cachep = + kmem_cache_create("tcp_bind2_bucket", + sizeof(struct inet_bind2_bucket), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT, + NULL); /* Size and allocate the main established and bind bucket * hash tables. @@ -4572,7 +4786,7 @@ void __init tcp_init(void) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", - sizeof(struct inet_bind_hashbucket), + 2 * sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, @@ -4581,11 +4795,15 @@ void __init tcp_init(void) 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); + spin_lock_init(&tcp_hashinfo.bhash2[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } + tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; @@ -4596,11 +4814,11 @@ void __init tcp_init(void) max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); - init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); |