aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-06-17 10:39:56 -0700
committerDavid S. Miller <davem@davemloft.net>2019-06-17 10:39:56 -0700
commit4fddbf8a99ee5a65bdd31b3ebbf5a84b9395d496 (patch)
tree671cdce9ff92bdd0a99619f995bbcea9124735ad
parentlapb: fixed leak of control-blocks. (diff)
parenttcp: enforce tcp_min_snd_mss in tcp_mtu_probing() (diff)
downloadwireguard-linux-4fddbf8a99ee5a65bdd31b3ebbf5a84b9395d496.tar.xz
wireguard-linux-4fddbf8a99ee5a65bdd31b3ebbf5a84b9395d496.zip
Merge branch 'tcp-fixes'
Eric Dumazet says: ==================== tcp: make sack processing more robust Jonathan Looney brought to our attention multiple problems in TCP stack at the sender side. SACK processing can be abused by malicious peers to either cause overflows, or increase of memory usage. First two patches fix the immediate problems. Since the malicious peers abuse senders by advertizing a very small MSS in their SYN or SYNACK packet, the last two patches add a new sysctl so that admins can chose a higher limit for MSS clamping. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt8
-rw-r--r--include/linux/tcp.h4
-rw-r--r--include/net/netns/ipv4.h1
-rw-r--r--include/net/tcp.h2
-rw-r--r--include/uapi/linux/snmp.h1
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c1
-rw-r--r--net/ipv4/tcp_input.c26
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_output.c10
-rw-r--r--net/ipv4/tcp_timer.c1
12 files changed, 58 insertions, 9 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 288aa264ac26..22f6b8b1110a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -255,6 +255,14 @@ tcp_base_mss - INTEGER
Path MTU discovery (MTU probing). If MTU probing is enabled,
this is the initial MSS used by the connection.
+tcp_min_snd_mss - INTEGER
+ TCP SYN and SYNACK messages usually advertise an ADVMSS option,
+ as described in RFC 1122 and RFC 6691.
+ If this ADVMSS option is smaller than tcp_min_snd_mss,
+ it is silently capped to tcp_min_snd_mss.
+
+ Default : 48 (at least 8 bytes of payload per segment)
+
tcp_congestion_control - STRING
Set the congestion control algorithm to be used for new
connections. The algorithm "reno" is always available, but
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 711361af9ce0..9a478a0cd3a2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -484,4 +484,8 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
return (user_mss && user_mss < mss) ? user_mss : mss;
}
+
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
+ int shiftlen);
+
#endif /* _LINUX_TCP_H */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7698460a3dd1..623cfbb7b8dc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -117,6 +117,7 @@ struct netns_ipv4 {
#endif
int sysctl_tcp_mtu_probing;
int sysctl_tcp_base_mss;
+ int sysctl_tcp_min_snd_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ac2f53fbfa6b..582c0caa9811 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -51,6 +51,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define MAX_TCP_HEADER (128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
+#define TCP_MIN_SND_MSS 48
+#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)
/*
* Never offer a window over 32767 without using window scaling. Some
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 86dc24a96c90..fd42c1316d3d 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -283,6 +283,7 @@ enum
LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */
LINUX_MIB_TCPZEROWINDOWDROP, /* TCPZeroWindowDrop */
LINUX_MIB_TCPRCVQDROP, /* TCPRcvQDrop */
+ LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */
__LINUX_MIB_MAX
};
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4370f4246e86..073273b751f8 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -287,6 +287,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
+ SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fa213bd8e233..b6f14af926fa 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -39,6 +39,8 @@ static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
+static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
+static int tcp_min_snd_mss_max = 65535;
static int ip_privileged_port_min;
static int ip_privileged_port_max = 65535;
static int ip_ttl_min = 1;
@@ -770,6 +772,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "tcp_min_snd_mss",
+ .data = &init_net.ipv4.sysctl_tcp_min_snd_mss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
+ },
+ {
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f448a288d158..7dc9ab84bb69 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3873,6 +3873,7 @@ void __init tcp_init(void)
unsigned long limit;
unsigned int i;
+ BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
FIELD_SIZEOF(struct sk_buff, cb));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 38dfc308c0fb..d95ee40df6c2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1302,7 +1302,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
TCP_SKB_CB(skb)->seq += shifted;
tcp_skb_pcount_add(prev, pcount);
- BUG_ON(tcp_skb_pcount(skb) < pcount);
+ WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount);
/* When we're adding to gso_segs == 1, gso_size will be zero,
@@ -1368,6 +1368,21 @@ static int skb_can_shift(const struct sk_buff *skb)
return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+ int pcount, int shiftlen)
+{
+ /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
+ * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+ * to make sure not storing more than 65535 * 8 bytes per skb,
+ * even if current MSS is bigger.
+ */
+ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+ return 0;
+ if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+ return 0;
+ return skb_shift(to, from, shiftlen);
+}
+
/* Try collapsing SACK blocks spanning across multiple skbs to a single
* skb.
*/
@@ -1473,7 +1488,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
goto fallback;
- if (!skb_shift(prev, skb, len))
+ if (!tcp_skb_shift(prev, skb, pcount, len))
goto fallback;
if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
goto out;
@@ -1491,11 +1506,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
goto out;
len = skb->len;
- if (skb_shift(prev, skb, len)) {
- pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+ pcount = tcp_skb_pcount(skb);
+ if (tcp_skb_shift(prev, skb, pcount, len))
+ tcp_shifted_skb(sk, prev, skb, state, pcount,
len, mss, 0);
- }
out:
return prev;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index bc86f9735f45..cfa81190a1b1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2628,6 +2628,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f429e856e263..00c01a01b547 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1296,6 +1296,11 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
if (nsize < 0)
nsize = 0;
+ if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
+ return -ENOMEM;
+ }
+
if (skb_unclone(skb, gfp))
return -ENOMEM;
@@ -1454,8 +1459,7 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
- if (mss_now < 48)
- mss_now = 48;
+ mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
return mss_now;
}
@@ -2747,7 +2751,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
if (next_skb_size <= skb_availroom(skb))
skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
next_skb_size);
- else if (!skb_shift(skb, next_skb, next_skb_size))
+ else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
return false;
}
tcp_highest_sack_replace(sk, next_skb, skb);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 5bad937ce779..c801cd37cc2a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -155,6 +155,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
+ mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
}
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);