aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c628
1 files changed, 439 insertions, 189 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bbb3d39c69af..54836a6b81d6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,8 +292,10 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
+atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
+DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
@@ -303,7 +305,7 @@ EXPORT_SYMBOL(tcp_have_smc);
/*
* Current number of TCP sockets.
*/
-struct percpu_counter tcp_sockets_allocated;
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);
/*
@@ -429,7 +431,7 @@ void tcp_init_sock(struct sock *sk)
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
- tp->snd_cwnd = TCP_INIT_CWND;
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
@@ -441,7 +443,7 @@ void tcp_init_sock(struct sock *sk)
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
- tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
+ tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
@@ -452,11 +454,11 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
- WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
sk_sockets_allocated_inc(sk);
- sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);
@@ -546,10 +548,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (state != TCP_SYN_SENT &&
(state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
+ u16 urg_data = READ_ONCE(tp->urg_data);
- if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
- !sock_flag(sk, SOCK_URGINLINE) &&
- tp->urg_data)
+ if (unlikely(urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+ !sock_flag(sk, SOCK_URGINLINE))
target++;
if (tcp_stream_is_readable(sk, target))
@@ -574,7 +577,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
} else
mask |= EPOLLOUT | EPOLLWRNORM;
- if (tp->urg_data & TCP_URG_VALID)
+ if (urg_data & TCP_URG_VALID)
mask |= EPOLLPRI;
} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
/* Active TCP fastopen socket with defer_connect
@@ -608,7 +611,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data &&
+ answ = READ_ONCE(tp->urg_data) &&
READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
@@ -686,9 +689,10 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
int size_goal)
{
return skb->len < size_goal &&
- sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
!tcp_rtx_queue_empty(sk) &&
- refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
+ refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
+ tcp_skb_can_collapse_to(skb);
}
void tcp_push(struct sock *sk, int flags, int mss_now,
@@ -855,9 +859,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
{
struct sk_buff *skb;
- if (unlikely(tcp_under_memory_pressure(sk)))
- sk_mem_reclaim_partial(sk);
-
skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
if (likely(skb)) {
bool mem_scheduled;
@@ -893,8 +894,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return mss_now;
/* Note : tcp_tso_autosize() will eventually split this later */
- new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
- new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
/* We try hard to avoid divides here */
size_goal = tp->gso_segs * mss_now;
@@ -936,6 +936,40 @@ void tcp_remove_empty_skb(struct sock *sk)
}
}
+/* skb changing from pure zc to mixed, must charge zc */
+static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
+{
+ if (unlikely(skb_zcopy_pure(skb))) {
+ u32 extra = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
+
+ if (!sk_wmem_schedule(sk, extra))
+ return -ENOMEM;
+
+ sk_mem_charge(sk, extra);
+ skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+ }
+ return 0;
+}
+
+
+static int tcp_wmem_schedule(struct sock *sk, int copy)
+{
+ int left;
+
+ if (likely(sk_wmem_schedule(sk, copy)))
+ return copy;
+
+ /* We could be in trouble if we have nothing queued.
+ * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
+ * to guarantee some progress.
+ */
+ left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued;
+ if (left > 0)
+ sk_forced_mem_schedule(sk, min(left, copy));
+ return min(copy, sk->sk_forward_alloc);
+}
+
static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
struct page *page, int offset, size_t *size)
{
@@ -967,18 +1001,22 @@ new_segment:
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= sysctl_max_skb_frags) {
+ if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (!sk_wmem_schedule(sk, copy))
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ return NULL;
+
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
return NULL;
if (can_coalesce) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, copy);
+ skb_fill_page_desc_noacc(skb, i, page, offset, copy);
}
if (!(flags & MSG_NO_SHARED_FRAGS))
@@ -1125,16 +1163,16 @@ void tcp_free_fastopen_req(struct tcp_sock *tp)
}
}
-static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
- int *copied, size_t size,
- struct ubuf_info *uarg)
+int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
+ size_t size, struct ubuf_info *uarg)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) ||
(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
@@ -1186,17 +1224,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
flags = msg->msg_flags;
- if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
+ if ((flags & MSG_ZEROCOPY) && size) {
skb = tcp_write_queue_tail(sk);
- uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
- if (!uarg) {
- err = -ENOBUFS;
- goto out_err;
- }
- zc = sk->sk_route_caps & NETIF_F_SG;
- if (!zc)
- uarg->zerocopy = 0;
+ if (msg->msg_ubuf) {
+ uarg = msg->msg_ubuf;
+ net_zcopy_get(uarg);
+ zc = sk->sk_route_caps & NETIF_F_SG;
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
+ if (!uarg) {
+ err = -ENOBUFS;
+ goto out_err;
+ }
+ zc = sk->sk_route_caps & NETIF_F_SG;
+ if (!zc)
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+ }
}
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
@@ -1310,7 +1354,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i >= sysctl_max_skb_frags) {
+ if (i >= READ_ONCE(sysctl_max_skb_frags)) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1319,16 +1363,14 @@ new_segment:
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- /* skb changing from pure zc to mixed, must charge zc */
- if (unlikely(skb_zcopy_pure(skb))) {
- if (!sk_wmem_schedule(sk, skb->data_len))
+ if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
+ if (tcp_downgrade_zcopy_pure(sk, skb))
goto wait_for_space;
-
- sk_mem_charge(sk, skb->data_len);
- skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+ skb_zcopy_downgrade_managed(skb);
}
- if (!sk_wmem_schedule(sk, copy))
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
@@ -1355,7 +1397,8 @@ new_segment:
skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
if (!skb_zcopy_pure(skb)) {
- if (!sk_wmem_schedule(sk, copy))
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
goto wait_for_space;
}
@@ -1466,7 +1509,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- tp->urg_data = TCP_URG_READ;
+ WRITE_ONCE(tp->urg_data, TCP_URG_READ);
/* Read urgent data. */
msg->msg_flags |= MSG_OOB;
@@ -1524,17 +1567,11 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-void tcp_cleanup_rbuf(struct sock *sk, int copied)
+static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
- WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
- "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
- tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
-
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1580,7 +1617,30 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
-static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
+{
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+ __tcp_cleanup_rbuf(sk, copied);
+}
+
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ if (likely(skb->destructor == sock_rfree)) {
+ sock_rfree(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ return skb_attempt_defer_free(skb);
+ }
+ __kfree_skb(skb);
+}
+
+struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
u32 offset;
@@ -1599,10 +1659,11 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
}
return NULL;
}
+EXPORT_SYMBOL(tcp_recv_skb);
/*
* This routine provides an alternative to tcp_recvmsg() for routines
@@ -1633,7 +1694,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
len = skb->len - offset;
/* Stop reading if we hit a patch of urgent data */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - seq;
if (urg_offset < len)
len = urg_offset;
@@ -1645,11 +1706,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
if (!copied)
copied = used;
break;
- } else if (used <= len) {
- seq += used;
- copied += used;
- offset += used;
}
+ if (WARN_ON_ONCE(used > len))
+ used = len;
+ seq += used;
+ copied += used;
+ offset += used;
+
/* If recv_actor drops the lock (e.g. TCP splice
* receive) the skb pointer might be invalid when
* getting here: tcp_collapse might have deleted it
@@ -1665,11 +1728,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
continue;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
WRITE_ONCE(tp->copied_seq, seq);
@@ -1687,6 +1750,90 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
}
EXPORT_SYMBOL(tcp_read_sock);
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ int copied = 0;
+ u32 offset;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ u8 tcp_flags;
+ int used;
+
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+ tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
+ used = recv_actor(sk, skb);
+ consume_skb(skb);
+ if (used < 0) {
+ if (!copied)
+ copied = used;
+ break;
+ }
+ seq += used;
+ copied += used;
+
+ if (tcp_flags & TCPHDR_FIN) {
+ ++seq;
+ break;
+ }
+ }
+ WRITE_ONCE(tp->copied_seq, seq);
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (copied > 0)
+ __tcp_cleanup_rbuf(sk, copied);
+
+ return copied;
+}
+EXPORT_SYMBOL(tcp_read_skb);
+
+void tcp_read_done(struct sock *sk, size_t len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ size_t left;
+ u32 offset;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return;
+
+ left = len;
+ while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ int used;
+
+ used = min_t(size_t, skb->len - offset, left);
+ seq += used;
+ left -= used;
+
+ if (skb->len > offset + used)
+ break;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ tcp_eat_recv_skb(sk, skb);
+ ++seq;
+ break;
+ }
+ tcp_eat_recv_skb(sk, skb);
+ }
+ WRITE_ONCE(tp->copied_seq, seq);
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (left != len)
+ tcp_cleanup_rbuf(sk, len - left);
+}
+EXPORT_SYMBOL(tcp_read_done);
+
int tcp_peek_len(struct socket *sock)
{
return tcp_inq(sock->sk);
@@ -1701,7 +1848,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
cap = sk->sk_rcvbuf >> 1;
else
- cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
val = min(val, cap);
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
@@ -1836,8 +1983,7 @@ static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
}
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags,
- struct scm_timestamping_internal *tss,
+ int flags, struct scm_timestamping_internal *tss,
int *cmsg_flags);
static int receive_fallback_to_copy(struct sock *sk,
struct tcp_zerocopy_receive *zc, int inq,
@@ -1859,7 +2005,7 @@ static int receive_fallback_to_copy(struct sock *sk,
if (err)
return err;
- err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
+ err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
tss, &zc->msg_flags);
if (err < 0)
return err;
@@ -2275,8 +2421,7 @@ static int tcp_inq_hint(struct sock *sk)
*/
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags,
- struct scm_timestamping_internal *tss,
+ int flags, struct scm_timestamping_internal *tss,
int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2294,9 +2439,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
if (sk->sk_state == TCP_LISTEN)
goto out;
- if (tp->recvmsg_inq)
+ if (tp->recvmsg_inq) {
*cmsg_flags = TCP_CMSG_INQ;
- timeo = sock_rcvtimeo(sk, nonblock);
+ msg->msg_get_inq = 1;
+ }
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
@@ -2329,7 +2476,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
- if (tp->urg_data && tp->urg_seq == *seq) {
+ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
@@ -2372,10 +2519,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
break;
if (copied) {
- if (sk->sk_err ||
+ if (!timeo ||
+ sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
- !timeo ||
signal_pending(current))
break;
} else {
@@ -2409,13 +2556,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
}
}
- tcp_cleanup_rbuf(sk, copied);
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
- release_sock(sk);
- lock_sock(sk);
+ __sk_flush_backlog(sk);
} else {
+ tcp_cleanup_rbuf(sk, copied);
sk_wait_data(sk, &timeo, last);
}
@@ -2435,7 +2580,7 @@ found_ok_skb:
used = len;
/* Do we have urgent data here? */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
@@ -2469,8 +2614,8 @@ found_ok_skb:
tcp_rcv_space_adjust(sk);
skip_copy:
- if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
- tp->urg_data = 0;
+ if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+ WRITE_ONCE(tp->urg_data, 0);
tcp_fast_path_check(sk);
}
@@ -2485,14 +2630,14 @@ skip_copy:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
@@ -2516,10 +2661,10 @@ recv_sndq:
goto out;
}
-int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len)
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
{
- int cmsg_flags = 0, ret, inq;
+ int cmsg_flags = 0, ret;
struct scm_timestamping_internal tss;
if (unlikely(flags & MSG_ERRQUEUE))
@@ -2528,19 +2673,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk_can_busy_loop(sk) &&
skb_queue_empty_lockless(&sk->sk_receive_queue) &&
sk->sk_state == TCP_ESTABLISHED)
- sk_busy_loop(sk, nonblock);
+ sk_busy_loop(sk, flags & MSG_DONTWAIT);
lock_sock(sk);
- ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
- &cmsg_flags);
+ ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
release_sock(sk);
- if (cmsg_flags && ret >= 0) {
+ if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
tcp_recv_timestamp(msg, sk, &tss);
- if (cmsg_flags & TCP_CMSG_INQ) {
- inq = tcp_inq_hint(sk);
- put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ if (msg->msg_get_inq) {
+ msg->msg_inq = tcp_inq_hint(sk);
+ if (cmsg_flags & TCP_CMSG_INQ)
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
+ sizeof(msg->msg_inq), &msg->msg_inq);
}
}
return ret;
@@ -2694,7 +2840,8 @@ static void tcp_orphan_update(struct timer_list *unused)
static bool tcp_too_many_orphans(int shift)
{
- return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
+ return READ_ONCE(tcp_orphan_cache) << shift >
+ READ_ONCE(sysctl_tcp_max_orphans);
}
bool tcp_check_oom(struct sock *sk, int shift)
@@ -2741,8 +2888,6 @@ void __tcp_close(struct sock *sk, long timeout)
__kfree_skb(skb);
}
- sk_mem_reclaim(sk);
-
/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
if (sk->sk_state == TCP_CLOSE)
goto adjudge_to_death;
@@ -2850,7 +2995,6 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- sk_mem_reclaim(sk);
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
@@ -2928,7 +3072,6 @@ void tcp_write_queue_purge(struct sock *sk)
}
tcp_rtx_queue_purge(sk);
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
- sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
tcp_sk(sk)->packets_out = 0;
inet_csk(sk)->icsk_backoff = 0;
@@ -2964,7 +3107,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
- tp->urg_data = 0;
+ WRITE_ONCE(tp->urg_data, 0);
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -2992,8 +3135,10 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_rto_min = TCP_RTO_MIN;
icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tp->snd_cwnd = TCP_INIT_CWND;
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
tp->snd_cwnd_cnt = 0;
+ tp->is_cwnd_limited = 0;
+ tp->max_packets_out = 0;
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
@@ -3012,8 +3157,7 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
- dst_release(sk->sk_rx_dst);
- sk->sk_rx_dst = NULL;
+ dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
tp->segs_in = 0;
@@ -3059,7 +3203,6 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
}
-
sk_error_report(sk);
return 0;
}
@@ -3067,7 +3210,7 @@ EXPORT_SYMBOL(tcp_disconnect);
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
- return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
+ return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
(sk->sk_state != TCP_LISTEN);
}
@@ -3177,7 +3320,7 @@ static void tcp_enable_tx_delay(void)
* TCP_CORK can be set together with TCP_NODELAY and it is stronger than
* TCP_NODELAY.
*/
-static void __tcp_sock_set_cork(struct sock *sk, bool on)
+void __tcp_sock_set_cork(struct sock *sk, bool on)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3205,7 +3348,7 @@ EXPORT_SYMBOL(tcp_sock_set_cork);
* However, when TCP_NODELAY is set we make an explicit push, which overrides
* even TCP_CORK for currently queued segments.
*/
-static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+void __tcp_sock_set_nodelay(struct sock *sk, bool on)
{
if (on) {
tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
@@ -3344,8 +3487,8 @@ int tcp_set_window_clamp(struct sock *sk, int val)
/*
* Socket option code for TCP.
*/
-static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen)
+int do_tcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3367,11 +3510,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
name[val] = 0;
- lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true,
- ns_capable(sock_net(sk)->user_ns,
- CAP_NET_ADMIN));
- release_sock(sk);
+ sockopt_lock_sock(sk);
+ err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
+ sockopt_ns_capable(sock_net(sk)->user_ns,
+ CAP_NET_ADMIN));
+ sockopt_release_sock(sk);
return err;
}
case TCP_ULP: {
@@ -3387,9 +3530,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
name[val] = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
err = tcp_set_ulp(sk, name);
- release_sock(sk);
+ sockopt_release_sock(sk);
return err;
}
case TCP_FASTOPEN_KEY: {
@@ -3422,7 +3565,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case TCP_MAXSEG:
@@ -3504,7 +3647,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_REPAIR_OPTIONS:
if (!tp->repair)
err = -EINVAL;
- else if (sk->sk_state == TCP_ESTABLISHED)
+ else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
err = tcp_repair_options_est(sk, optval, optlen);
else
err = -EPERM;
@@ -3597,7 +3740,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_FASTOPEN_CONNECT:
if (val > 1 || val < 0) {
err = -EINVAL;
- } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) {
if (sk->sk_state == TCP_CLOSE)
tp->fastopen_connect = val;
else
@@ -3643,7 +3787,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return err;
}
@@ -3653,8 +3797,9 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
const struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP)
- return icsk->icsk_af_ops->setsockopt(sk, level, optname,
- optval, optlen);
+ /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
+ optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);
@@ -3704,7 +3849,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_max_pacing_rate = rate64;
info->tcpi_reordering = tp->reordering;
- info->tcpi_snd_cwnd = tp->snd_cwnd;
+ info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
if (info->tcpi_state == TCP_LISTEN) {
/* listeners aliased fields :
@@ -3774,10 +3919,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
tcp_get_info_chrono_stats(tp, info);
info->tcpi_segs_out = tp->segs_out;
- info->tcpi_segs_in = tp->segs_in;
+
+ /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+ info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+ info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
info->tcpi_min_rtt = tcp_min_rtt(tp);
- info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
@@ -3873,7 +4020,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
rate64 = tcp_compute_delivery_rate(tp);
nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
- nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+ nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
@@ -3905,15 +4052,15 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
return stats;
}
-static int do_tcp_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int __user *optlen)
+int do_tcp_getsockopt(struct sock *sk, int level,
+ int optname, sockptr_t optval, sockptr_t optlen)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
int val, len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, sizeof(int));
@@ -3945,12 +4092,13 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = keepalive_probes(tp);
break;
case TCP_SYNCNT:
- val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
+ val = icsk->icsk_syn_retries ? :
+ READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
break;
case TCP_LINGER2:
val = tp->linger2;
if (val >= 0)
- val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
+ val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
break;
case TCP_DEFER_ACCEPT:
val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
@@ -3962,15 +4110,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_INFO: {
struct tcp_info info;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
tcp_get_info(sk, &info);
len = min_t(unsigned int, len, sizeof(info));
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &info, len))
+ if (copy_to_sockptr(optval, &info, len))
return -EFAULT;
return 0;
}
@@ -3980,7 +4128,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
size_t sz = 0;
int attr;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
ca_ops = icsk->icsk_ca_ops;
@@ -3988,9 +4136,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
sz = ca_ops->get_info(sk, ~0U, &attr, &info);
len = min_t(unsigned int, len, sz);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &info, len))
+ if (copy_to_sockptr(optval, &info, len))
return -EFAULT;
return 0;
}
@@ -3999,27 +4147,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_CONGESTION:
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+ if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
case TCP_ULP:
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
if (!icsk->icsk_ulp_ops) {
- if (put_user(0, optlen))
+ len = 0;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
+ if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
return -EFAULT;
return 0;
@@ -4027,15 +4176,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
unsigned int key_len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
key_len = tcp_fastopen_get_cipher(net, icsk, key) *
TCP_FASTOPEN_KEY_LENGTH;
len = min_t(unsigned int, len, key_len);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, key, len))
+ if (copy_to_sockptr(optval, key, len))
return -EFAULT;
return 0;
}
@@ -4061,7 +4210,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_REPAIR_WINDOW: {
struct tcp_repair_window opt;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len != sizeof(opt))
@@ -4076,7 +4225,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
opt.rcv_wnd = tp->rcv_wnd;
opt.rcv_wup = tp->rcv_wup;
- if (copy_to_user(optval, &opt, len))
+ if (copy_to_sockptr(optval, &opt, len))
return -EFAULT;
return 0;
}
@@ -4122,35 +4271,35 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->save_syn;
break;
case TCP_SAVED_SYN: {
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
if (tp->saved_syn) {
if (len < tcp_saved_syn_len(tp->saved_syn)) {
- if (put_user(tcp_saved_syn_len(tp->saved_syn),
- optlen)) {
- release_sock(sk);
+ len = tcp_saved_syn_len(tp->saved_syn);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return -EINVAL;
}
len = tcp_saved_syn_len(tp->saved_syn);
- if (put_user(len, optlen)) {
- release_sock(sk);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
- if (copy_to_user(optval, tp->saved_syn->data, len)) {
- release_sock(sk);
+ if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
tcp_saved_syn_free(tp);
- release_sock(sk);
+ sockopt_release_sock(sk);
} else {
- release_sock(sk);
+ sockopt_release_sock(sk);
len = 0;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
}
return 0;
@@ -4161,31 +4310,31 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
struct tcp_zerocopy_receive zc = {};
int err;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0 ||
len < offsetofend(struct tcp_zerocopy_receive, length))
return -EINVAL;
if (unlikely(len > sizeof(zc))) {
- err = check_zeroed_user(optval + sizeof(zc),
- len - sizeof(zc));
+ err = check_zeroed_sockptr(optval, sizeof(zc),
+ len - sizeof(zc));
if (err < 1)
return err == 0 ? -EINVAL : err;
len = sizeof(zc);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
}
- if (copy_from_user(&zc, optval, len))
+ if (copy_from_sockptr(&zc, optval, len))
return -EFAULT;
if (zc.reserved)
return -EINVAL;
if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
return -EINVAL;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc, &tss);
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
&zc, &len, err);
- release_sock(sk);
+ sockopt_release_sock(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
goto zerocopy_rcv_cmsg;
switch (len) {
@@ -4215,7 +4364,7 @@ zerocopy_rcv_sk_err:
zerocopy_rcv_inq:
zc.inq = tcp_inq_hint(sk);
zerocopy_rcv_out:
- if (!err && copy_to_user(optval, &zc, len))
+ if (!err && copy_to_sockptr(optval, &zc, len))
err = -EFAULT;
return err;
}
@@ -4224,9 +4373,9 @@ zerocopy_rcv_out:
return -ENOPROTOOPT;
}
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
@@ -4249,9 +4398,11 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP)
- return icsk->icsk_af_ops->getsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+ /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
}
EXPORT_SYMBOL(tcp_getsockopt);
@@ -4297,12 +4448,16 @@ static void __tcp_alloc_md5sig_pool(void)
* to memory. See smp_rmb() in tcp_get_md5sig_pool()
*/
smp_wmb();
- tcp_md5sig_pool_populated = true;
+ /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool()
+ * and tcp_get_md5sig_pool().
+ */
+ WRITE_ONCE(tcp_md5sig_pool_populated, true);
}
bool tcp_alloc_md5sig_pool(void)
{
- if (unlikely(!tcp_md5sig_pool_populated)) {
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
+ if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) {
mutex_lock(&tcp_md5sig_mutex);
if (!tcp_md5sig_pool_populated) {
@@ -4313,7 +4468,8 @@ bool tcp_alloc_md5sig_pool(void)
mutex_unlock(&tcp_md5sig_mutex);
}
- return tcp_md5sig_pool_populated;
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
+ return READ_ONCE(tcp_md5sig_pool_populated);
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
@@ -4329,7 +4485,8 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
local_bh_disable();
- if (tcp_md5sig_pool_populated) {
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
+ if (READ_ONCE(tcp_md5sig_pool_populated)) {
/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
smp_rmb();
return this_cpu_ptr(&tcp_md5sig_pool);
@@ -4391,6 +4548,82 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke
}
EXPORT_SYMBOL(tcp_md5_hash_key);
+/* Called with rcu_read_lock() */
+enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int dif, int sdif)
+{
+ /*
+ * This gets called for each TCP segment that arrives
+ * so we want to be efficient.
+ * We have 3 drop cases:
+ * o No MD5 hash and one expected.
+ * o MD5 hash and we're not expecting one.
+ * o MD5 hash and its wrong.
+ */
+ const __u8 *hash_location = NULL;
+ struct tcp_md5sig_key *hash_expected;
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int genhash, l3index;
+ u8 newhash[16];
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
+
+ hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family);
+ hash_location = tcp_parse_md5sig_option(th);
+
+ /* We've parsed the options - do we have a hash? */
+ if (!hash_expected && !hash_location)
+ return SKB_NOT_DROPPED_YET;
+
+ if (hash_expected && !hash_location) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ return SKB_DROP_REASON_TCP_MD5NOTFOUND;
+ }
+
+ if (!hash_expected && hash_location) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
+ }
+
+ /* Check the signature.
+ * To support dual stack listeners, we need to handle
+ * IPv4-mapped case.
+ */
+ if (family == AF_INET)
+ genhash = tcp_v4_md5_hash_skb(newhash,
+ hash_expected,
+ NULL, skb);
+ else
+ genhash = tp->af_specific->calc_md5_hash(newhash,
+ hash_expected,
+ NULL, skb);
+
+ if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
+ if (family == AF_INET) {
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
+ saddr, ntohs(th->source),
+ daddr, ntohs(th->dest),
+ genhash ? " tcp_v4_calc_md5_hash failed"
+ : "", l3index);
+ } else {
+ net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
+ genhash ? "failed" : "mismatch",
+ saddr, ntohs(th->source),
+ daddr, ntohs(th->dest), l3index);
+ }
+ return SKB_DROP_REASON_TCP_MD5FAILURE;
+ }
+ return SKB_NOT_DROPPED_YET;
+}
+EXPORT_SYMBOL(tcp_inbound_md5_hash);
+
#endif
void tcp_done(struct sock *sk)
@@ -4422,16 +4655,24 @@ EXPORT_SYMBOL_GPL(tcp_done);
int tcp_abort(struct sock *sk, int err)
{
- if (!sk_fullsock(sk)) {
- if (sk->sk_state == TCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
+ int state = inet_sk_state_load(sk);
- local_bh_disable();
- inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- local_bh_enable();
- return 0;
- }
- return -EOPNOTSUPP;
+ if (state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ local_bh_disable();
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ local_bh_enable();
+ return 0;
+ }
+ if (state == TCP_TIME_WAIT) {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+
+ refcount_inc(&tw->tw_refcnt);
+ local_bh_disable();
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
}
/* Don't race with userspace socket closes such as tcp_close. */
@@ -4507,7 +4748,6 @@ void __init tcp_init(void)
timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
- inet_hashinfo_init(&tcp_hashinfo);
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
thash_entries, 21, /* one slot per 2 MB*/
0, 64 * 1024);
@@ -4517,6 +4757,12 @@ void __init tcp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT,
NULL);
+ tcp_hashinfo.bind2_bucket_cachep =
+ kmem_cache_create("tcp_bind2_bucket",
+ sizeof(struct inet_bind2_bucket), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT,
+ NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
@@ -4540,7 +4786,7 @@ void __init tcp_init(void)
panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
- sizeof(struct inet_bind_hashbucket),
+ 2 * sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_mask + 1,
17, /* one slot per 128 KB of memory */
0,
@@ -4549,11 +4795,15 @@ void __init tcp_init(void)
0,
64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
+ tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+ spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
}
+ tcp_hashinfo.pernet = false;
cnt = tcp_hashinfo.ehash_mask + 1;
sysctl_tcp_max_orphans = cnt / 2;
@@ -4564,11 +4814,11 @@ void __init tcp_init(void)
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);
- init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
- init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);