about | summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/inet_hashtables.c 184
-rw-r--r-- net/ipv4/inet_timewait_sock.c 27
-rw-r--r-- net/ipv4/ip_gre.c 172
-rw-r--r-- net/ipv4/route.c 2
-rw-r--r-- net/ipv4/tcp.c 3
-rw-r--r-- net/ipv4/tcp_fastopen.c 30
-rw-r--r-- net/ipv4/tcp_input.c 31
-rw-r--r-- net/ipv4/tcp_metrics.c 5
-rw-r--r-- net/ipv4/tcp_minisocks.c 7
-rw-r--r-- net/ipv4/tcp_output.c 9
-rw-r--r-- net/ipv4/tcp_timer.c 17
-rw-r--r-- net/ipv4/udp.c 44
-rw-r--r-- net/ipv4/xfrm4_mode_tunnel.c 2
13 files changed, 343 insertions, 190 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e7d15fb0d94d..f6f58108b4c5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -19,6 +19,7 @@
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
+#include <linux/bootmem.h>
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
@@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
+static struct inet_listen_hashbucket *
+inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+ u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ hash = ipv6_portaddr_hash(sock_net(sk),
+ &sk->sk_v6_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ else
+#endif
+ hash = ipv4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ return inet_lhash2_bucket(h, hash);
+}
+
+static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+ struct inet_listen_hashbucket *ilb2;
+
+ if (!h->lhash2)
+ return;
+
+ ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+ spin_lock(&ilb2->lock);
+ if (sk->sk_reuseport && sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+ &ilb2->head);
+ else
+ hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
+ &ilb2->head);
+ ilb2->count++;
+ spin_unlock(&ilb2->lock);
+}
+
+static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+ struct inet_listen_hashbucket *ilb2;
+
+ if (!h->lhash2 ||
+ WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+ return;
+
+ ilb2 = inet_lhash2_bucket_sk(h, sk);
+
+ spin_lock(&ilb2->lock);
+ hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+ ilb2->count--;
+ spin_unlock(&ilb2->lock);
+}
+
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
const int dif, const int sdif, bool exact_dif)
@@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net,
*/
/* called with rcu_read_lock() : No refcount taken on the socket */
+static struct sock *inet_lhash2_lookup(struct net *net,
+ struct inet_listen_hashbucket *ilb2,
+ struct sk_buff *skb, int doff,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif, const int sdif)
+{
+ bool exact_dif = inet_exact_dif_match(net, skb);
+ struct inet_connection_sock *icsk;
+ struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ u32 phash = 0;
+
+ inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
+ sk = (struct sock *)icsk;
+ score = compute_score(sk, net, hnum, daddr,
+ dif, sdif, exact_dif);
+ if (score > hiscore) {
+ if (sk->sk_reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
+ }
+ result = sk;
+ hiscore = score;
+ }
+ }
+
+ return result;
+}
+
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -216,32 +305,57 @@ struct sock *__inet_lookup_listener(struct net *net,
{
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore = 0, matches = 0, reuseport = 0;
bool exact_dif = inet_exact_dif_match(net, skb);
+ struct inet_listen_hashbucket *ilb2;
struct sock *sk, *result = NULL;
+ int score, hiscore = 0;
+ unsigned int hash2;
u32 phash = 0;
+ if (ilb->count <= 10 || !hashinfo->lhash2)
+ goto port_lookup;
+
+ /* Too many sk in the ilb bucket (which is hashed by port alone).
+ * Try lhash2 (which is hashed by port and addr) instead.
+ */
+
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ result = inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ if (result)
+ return result;
+
+ /* Lookup lhash2 with INADDR_ANY */
+
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ ilb2 = inet_lhash2_bucket(hashinfo, hash2);
+ if (ilb2->count > ilb->count)
+ goto port_lookup;
+
+ return inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+
+port_lookup:
sk_for_each_rcu(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr,
dif, sdif, exact_dif);
if (score > hiscore) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
return result;
- matches = 1;
}
result = sk;
hiscore = score;
- } else if (score == hiscore && reuseport) {
- matches++;
- if (reciprocal_scale(phash, matches) == 0)
- result = sk;
- phash = next_pseudo_random32(phash);
}
}
return result;
@@ -483,6 +597,8 @@ int __inet_hash(struct sock *sk, struct sock *osk)
hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
else
hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ inet_hash2(hashinfo, sk);
+ ilb->count++;
sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
@@ -509,28 +625,35 @@ EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_listen_hashbucket *ilb;
spinlock_t *lock;
bool listener = false;
- int done;
if (sk_unhashed(sk))
return;
if (sk->sk_state == TCP_LISTEN) {
- lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ lock = &ilb->lock;
listener = true;
} else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
}
spin_lock_bh(lock);
+ if (sk_unhashed(sk))
+ goto unlock;
+
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- if (listener)
- done = __sk_del_node_init(sk);
- else
- done = __sk_nulls_del_node_init_rcu(sk);
- if (done)
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ if (listener) {
+ inet_unhash2(hashinfo, sk);
+ __sk_del_node_init(sk);
+ ilb->count--;
+ } else {
+ __sk_nulls_del_node_init_rcu(sk);
+ }
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+unlock:
spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
@@ -665,10 +788,37 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ h->listening_hash[i].count = 0;
}
+
+ h->lhash2 = NULL;
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
+ unsigned long numentries, int scale,
+ unsigned long low_limit,
+ unsigned long high_limit)
+{
+ unsigned int i;
+
+ h->lhash2 = alloc_large_system_hash(name,
+ sizeof(*h->lhash2),
+ numentries,
+ scale,
+ 0,
+ NULL,
+ &h->lhash2_mask,
+ low_limit,
+ high_limit);
+
+ for (i = 0; i <= h->lhash2_mask; i++) {
+ spin_lock_init(&h->lhash2[i].lock);
+ INIT_HLIST_HEAD(&h->lhash2[i].head);
+ h->lhash2[i].count = 0;
+ }
+}
+
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
unsigned int locksz = sizeof(spinlock_t);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index b563e0c46bac..277ff69a312d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -97,7 +97,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
*/
-void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
struct inet_hashinfo *hashinfo)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -119,18 +119,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
- /*
- * Step 2: Hash TW into tcp ehash chain.
- * Notes :
- * - tw_refcnt is set to 4 because :
- * - We have one reference from bhash chain.
- * - We have one reference from ehash chain.
- * - We have one reference from timer.
- * - One reference for ourself (our caller will release it).
- * We can use atomic_set() because prior spin_lock()/spin_unlock()
- * committed into memory all tw fields.
- */
- refcount_set(&tw->tw_refcnt, 4);
inet_twsk_add_node_rcu(tw, &ehead->chain);
/* Step 3: Remove SK from hash chain */
@@ -138,8 +126,19 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock(lock);
+
+ /* tw_refcnt is set to 3 because we have :
+ * - one reference for bhash chain.
+ * - one reference for ehash chain.
+ * - one reference for timer.
+ * We can use refcount_set() because prior spin_lock()/spin_unlock()
+ * committed into memory all tw fields.
+ * Also note that after this point, we lost our implicit reference
+ * so we are not allowed to use tw anymore.
+ */
+ refcount_set(&tw->tw_refcnt, 3);
}
-EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
static void tw_timer_handler(struct timer_list *t)
{
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9c1735632c8c..9a80d84fc182 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
- __be32 id, u32 index, bool truncate);
+ __be32 id, u32 index,
+ bool truncate, bool is_ipv4);
static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
@@ -255,34 +256,41 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
{
struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
+ struct erspan_base_hdr *ershdr;
+ struct erspan_metadata *pkt_md;
struct ip_tunnel_net *itn;
struct ip_tunnel *tunnel;
- struct erspanhdr *ershdr;
const struct iphdr *iph;
- __be32 index;
+ int ver;
int len;
itn = net_generic(net, erspan_net_id);
len = gre_hdr_len + sizeof(*ershdr);
+ /* Check base header length */
if (unlikely(!pskb_may_pull(skb, len)))
return PACKET_REJECT;
iph = ip_hdr(skb);
- ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+ ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+ ver = (ntohs(ershdr->ver_vlan) & VER_MASK) >> VER_OFFSET;
/* The original GRE header does not have key field,
* Use ERSPAN 10-bit session ID as key.
*/
tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
- index = ershdr->md.index;
+ pkt_md = (struct erspan_metadata *)(ershdr + 1);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
tpi->flags | TUNNEL_KEY,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ len = gre_hdr_len + erspan_hdr_len(ver);
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+
if (__iptunnel_pull_header(skb,
- gre_hdr_len + sizeof(*ershdr),
+ len,
htons(ETH_P_TEB),
false, false) < 0)
goto drop;
@@ -306,12 +314,27 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!md)
return PACKET_REJECT;
- md->index = index;
+ memcpy(md, pkt_md, sizeof(*md));
+ md->version = ver;
+
info = &tun_dst->u.tun_info;
info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
info->options_len = sizeof(*md);
} else {
- tunnel->index = ntohl(index);
+ tunnel->erspan_ver = ver;
+ if (ver == 1) {
+ tunnel->index = ntohl(pkt_md->u.index);
+ } else {
+ u16 md2_flags;
+ u16 dir, hwid;
+
+ md2_flags = ntohs(pkt_md->u.md2.flags);
+ dir = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+ hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+ tunnel->dir = dir;
+ tunnel->hwid = hwid;
+ }
+
}
skb_reset_mac_header(skb);
@@ -405,7 +428,8 @@ static int gre_rcv(struct sk_buff *skb)
if (hdr_len < 0)
goto drop;
- if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+ if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
+ tpi.proto == htons(ETH_P_ERSPAN2))) {
if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
return 0;
}
@@ -560,6 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
bool truncate = false;
struct flowi4 fl;
int tunnel_hlen;
+ int version;
__be16 df;
tun_info = skb_tunnel_info(skb);
@@ -568,9 +593,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
goto err_free_skb;
key = &tun_info->key;
+ md = ip_tunnel_info_opts(tun_info);
+ if (!md)
+ goto err_free_rt;
/* ERSPAN has fixed 8 byte GRE header */
- tunnel_hlen = 8 + sizeof(struct erspanhdr);
+ version = md->version;
+ tunnel_hlen = 8 + erspan_hdr_len(version);
rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
if (!rt)
@@ -584,12 +613,23 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
truncate = true;
}
- md = ip_tunnel_info_opts(tun_info);
- if (!md)
- goto err_free_rt;
+ if (version == 1) {
+ erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
+ ntohl(md->u.index), truncate, true);
+ } else if (version == 2) {
+ u16 md2_flags;
+ u8 direction;
+ u16 hwid;
- erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
- ntohl(md->index), truncate);
+ md2_flags = ntohs(md->u.md2.flags);
+ direction = (md2_flags & DIR_MASK) >> DIR_OFFSET;
+ hwid = (md2_flags & HWID_MASK) >> HWID_OFFSET;
+
+ erspan_build_header_v2(skb, tunnel_id_to_key32(key->tun_id),
+ direction, hwid, truncate, true);
+ } else {
+ goto err_free_rt;
+ }
gre_build_header(skb, 8, TUNNEL_SEQ,
htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
@@ -668,52 +708,6 @@ free_skb:
return NETDEV_TX_OK;
}
-static inline u8 tos_to_cos(u8 tos)
-{
- u8 dscp, cos;
-
- dscp = tos >> 2;
- cos = dscp >> 3;
- return cos;
-}
-
-static void erspan_build_header(struct sk_buff *skb,
- __be32 id, u32 index, bool truncate)
-{
- struct iphdr *iphdr = ip_hdr(skb);
- struct ethhdr *eth = eth_hdr(skb);
- enum erspan_encap_type enc_type;
- struct erspanhdr *ershdr;
- struct qtag_prefix {
- __be16 eth_type;
- __be16 tci;
- } *qp;
- u16 vlan_tci = 0;
-
- enc_type = ERSPAN_ENCAP_NOVLAN;
-
- /* If mirrored packet has vlan tag, extract tci and
- * perserve vlan header in the mirrored frame.
- */
- if (eth->h_proto == htons(ETH_P_8021Q)) {
- qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
- vlan_tci = ntohs(qp->tci);
- enc_type = ERSPAN_ENCAP_INFRAME;
- }
-
- skb_push(skb, sizeof(*ershdr));
- ershdr = (struct erspanhdr *)skb->data;
- memset(ershdr, 0, sizeof(*ershdr));
-
- ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
- (ERSPAN_VERSION << VER_OFFSET));
- ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
- ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
- (enc_type << EN_OFFSET & EN_MASK) |
- ((truncate << T_OFFSET) & T_MASK));
- ershdr->md.index = htonl(index & INDEX_MASK);
-}
-
static netdev_tx_t erspan_xmit(struct sk_buff *skb,
struct net_device *dev)
{
@@ -737,7 +731,14 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
}
/* Push ERSPAN header */
- erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+ if (tunnel->erspan_ver == 1)
+ erspan_build_header(skb, tunnel->parms.o_key, tunnel->index,
+ truncate, true);
+ else
+ erspan_build_header_v2(skb, tunnel->parms.o_key,
+ tunnel->dir, tunnel->hwid,
+ truncate, true);
+
tunnel->parms.o_flags &= ~TUNNEL_KEY;
__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
return NETDEV_TX_OK;
@@ -1209,13 +1210,32 @@ static int ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_FWMARK])
*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
- if (data[IFLA_GRE_ERSPAN_INDEX]) {
- t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ if (data[IFLA_GRE_ERSPAN_VER]) {
+ t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
- if (t->index & ~INDEX_MASK)
+ if (t->erspan_ver != 1 && t->erspan_ver != 2)
return -EINVAL;
}
+ if (t->erspan_ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX]) {
+ t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ if (t->index & ~INDEX_MASK)
+ return -EINVAL;
+ }
+ } else if (t->erspan_ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR]) {
+ t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+ if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
+ return -EINVAL;
+ }
+ if (data[IFLA_GRE_ERSPAN_HWID]) {
+ t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+ if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -1282,7 +1302,7 @@ static int erspan_tunnel_init(struct net_device *dev)
tunnel->tun_hlen = 8;
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
- sizeof(struct erspanhdr);
+ erspan_hdr_len(tunnel->erspan_ver);
t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
@@ -1412,6 +1432,12 @@ static size_t ipgre_get_size(const struct net_device *dev)
nla_total_size(4) +
/* IFLA_GRE_ERSPAN_INDEX */
nla_total_size(4) +
+ /* IFLA_GRE_ERSPAN_VER */
+ nla_total_size(1) +
+ /* IFLA_GRE_ERSPAN_DIR */
+ nla_total_size(1) +
+ /* IFLA_GRE_ERSPAN_HWID */
+ nla_total_size(2) +
0;
}
@@ -1454,9 +1480,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
}
- if (t->index)
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
+ goto nla_put_failure;
+
+ if (t->erspan_ver == 1) {
if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
goto nla_put_failure;
+ } else if (t->erspan_ver == 2) {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+ goto nla_put_failure;
+ }
return 0;
@@ -1492,6 +1527,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 },
[IFLA_GRE_FWMARK] = { .type = NLA_U32 },
[IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 },
+ [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 },
};
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 43b69af242e1..f0ed031f3594 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1106,7 +1106,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
new = true;
}
- __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+ __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
if (!dst_check(&rt->dst, 0)) {
if (new)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f08eebe60446..c470fec9062f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3578,6 +3578,9 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
inet_hashinfo_init(&tcp_hashinfo);
+ inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+ thash_entries, 21, /* one slot per 2 MB*/
+ 0, 64 * 1024);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 78c192ee03a4..018a48477355 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -379,18 +379,9 @@ fastopen:
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie)
{
- unsigned long last_syn_loss = 0;
const struct dst_entry *dst;
- int syn_loss = 0;
- tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
-
- /* Recurring FO SYN losses: no cookie or data in SYN */
- if (syn_loss > 1 &&
- time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
- cookie->len = -1;
- return false;
- }
+ tcp_fastopen_cache_get(sk, mss, cookie);
/* Firewall blackhole issue check */
if (tcp_fastopen_active_should_disable(sk)) {
@@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect);
* following circumstances:
* 1. client side TFO socket receives out of order FIN
* 2. client side TFO socket receives out of order RST
+ * 3. client side TFO socket has timed out three times consecutively during
+ * or after handshake
* We disable active side TFO globally for 1hr at first. Then if it
* happens again, we disable it for 2h, then 4h, 8h, ...
* And we reset the timeout back to 1hr when we see a successful active
@@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
dst_release(dst);
}
}
+
+void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
+{
+ u32 timeouts = inet_csk(sk)->icsk_retransmits;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Broken middle-boxes may black-hole Fast Open connection during or
+ * even after the handshake. Be extremely conservative and pause
+ * Fast Open globally after hitting the third consecutive timeout or
+ * exceeding the configured timeout limit.
+ */
+ if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) &&
+ (timeouts == 2 || (timeouts < 2 && expired))) {
+ tcp_fastopen_active_disable(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
+}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 45f750e85714..4d55c4b338ee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -578,8 +578,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 copied;
int time;
- int copied;
tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
@@ -602,38 +602,31 @@ void tcp_rcv_space_adjust(struct sock *sk)
if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvwin, rcvmem, rcvbuf;
+ int rcvmem, rcvbuf;
+ u64 rcvwin, grow;
/* minimal window to cope with packet losses, assuming
* steady state. Add some cushion because of small variations.
*/
- rcvwin = (copied << 1) + 16 * tp->advmss;
+ rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
- /* If rate increased by 25%,
- * assume slow start, rcvwin = 3 * copied
- * If rate increased by 50%,
- * assume sender can use 2x growth, rcvwin = 4 * copied
- */
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
- if (copied >=
- tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
- rcvwin <<= 1;
- else
- rcvwin += (rcvwin >> 1);
- }
+ /* Accommodate for sender rate increase (eg. slow start) */
+ grow = rcvwin * (copied - tp->rcvq_space.space);
+ do_div(grow, tp->rcvq_space.space);
+ rcvwin += (grow << 1);
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
rcvmem += 128;
- rcvbuf = min(rcvwin / tp->advmss * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ do_div(rcvwin, tp->advmss);
+ rcvbuf = min_t(u64, rcvwin * rcvmem,
+ sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
sk->sk_rcvbuf = rcvbuf;
/* Make the window clamp follow along. */
- tp->window_clamp = rcvwin;
+ tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
}
}
tp->rcvq_space.space = copied;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7097f92d16e5..759e6bc8327b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
static DEFINE_SEQLOCK(fastopen_seqlock);
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
- struct tcp_fastopen_cookie *cookie,
- int *syn_loss, unsigned long *last_syn_loss)
+ struct tcp_fastopen_cookie *cookie)
{
struct tcp_metrics_block *tm;
@@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
*cookie = tfom->cookie;
if (cookie->len <= 0 && tfom->try_exp == 1)
cookie->exp = true;
- *syn_loss = tfom->syn_loss;
- *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
} while (read_seqretry(&fastopen_seqlock, seq));
}
rcu_read_unlock();
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b079b619b60c..a8384b0c11f8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -316,9 +316,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
*/
local_bh_disable();
inet_twsk_schedule(tw, timeo);
- /* Linkage updates. */
- __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
- inet_twsk_put(tw);
+ /* Linkage updates.
+ * Note that access to tw after this point is illegal.
+ */
+ inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
local_bh_enable();
} else {
/* Sorry, if we're out of memory, just CLOSE this
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c7b506..04be9f833927 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2414,15 +2414,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
/* Schedule a loss probe in 2*RTT for SACK capable connections
- * in Open state, that are either limited by cwnd or application.
+ * not in loss recovery, that are either limited by cwnd or application.
*/
if ((early_retrans != 3 && early_retrans != 4) ||
!tp->packets_out || !tcp_is_sack(tp) ||
- icsk->icsk_ca_state != TCP_CA_Open)
- return false;
-
- if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
- !tcp_write_queue_empty(sk))
+ (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_CWR))
return false;
/* Probe timeout is 2*rtt. Add minimum RTO to account
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 968fda198376..6db3124cdbda 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -183,11 +183,6 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits) {
dst_negative_advice(sk);
- if (tp->syn_fastopen || tp->syn_data)
- tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (tp->syn_data && icsk->icsk_retransmits == 1)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
} else if (!tp->syn_data && !tp->syn_fastopen) {
sk_rethink_txhash(sk);
}
@@ -195,17 +190,6 @@ static int tcp_write_timeout(struct sock *sk)
expired = icsk->icsk_retransmits >= retry_until;
} else {
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
- /* Some middle-boxes may black-hole Fast Open _after_
- * the handshake. Therefore we conservatively disable
- * Fast Open on this path on recurring timeouts after
- * successful Fast Open.
- */
- if (tp->syn_data_acked) {
- tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
- }
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -228,6 +212,7 @@ static int tcp_write_timeout(struct sock *sk)
expired = retransmits_timed_out(sk, retry_until,
icsk->icsk_user_timeout);
}
+ tcp_fastopen_active_detect_blackhole(sk, expired);
if (expired) {
/* Has it gone just too far? */
tcp_write_err(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e4ff25c947c5..e9c0d1e1772e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -357,18 +357,12 @@ fail:
}
EXPORT_SYMBOL(udp_lib_get_port);
-static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
- unsigned int port)
-{
- return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
-}
-
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
unsigned int hash2_nulladdr =
- udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+ ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
unsigned int hash2_partial =
- udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+ ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
/* precompute partial secondary hash */
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
@@ -445,7 +439,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
struct sk_buff *skb)
{
struct sock *sk, *result;
- int score, badness, matches = 0, reuseport = 0;
+ int score, badness;
u32 hash = 0;
result = NULL;
@@ -454,23 +448,16 @@ static struct sock *udp4_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (result)
return result;
- matches = 1;
}
badness = score;
result = sk;
- } else if (score == badness && reuseport) {
- matches++;
- if (reciprocal_scale(hash, matches) == 0)
- result = sk;
- hash = next_pseudo_random32(hash);
}
}
return result;
@@ -488,11 +475,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
bool exact_dif = udp_lib_exact_dif_match(net, skb);
- int score, badness, matches = 0, reuseport = 0;
+ int score, badness;
u32 hash = 0;
if (hslot->count > 10) {
- hash2 = udp4_portaddr_hash(net, daddr, hnum);
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
@@ -503,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
- hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
/* avoid searching the same slot again. */
if (unlikely(slot2 == old_slot2))
@@ -526,23 +513,16 @@ begin:
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
- reuseport = sk->sk_reuseport;
- if (reuseport) {
+ if (sk->sk_reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (result)
return result;
- matches = 1;
}
result = sk;
badness = score;
- } else if (score == badness && reuseport) {
- matches++;
- if (reciprocal_scale(hash, matches) == 0)
- result = sk;
- hash = next_pseudo_random32(hash);
}
}
return result;
@@ -1775,7 +1755,7 @@ EXPORT_SYMBOL(udp_lib_rehash);
static void udp_v4_rehash(struct sock *sk)
{
- u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+ u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
inet_sk(sk)->inet_rcv_saddr,
inet_sk(sk)->inet_num);
udp_lib_rehash(sk, new_hash);
@@ -1966,9 +1946,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct sk_buff *nskb;
if (use_hash2) {
- hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
udptable->mask;
- hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask;
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
hslot = &udptable->hash2[hash2];
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
@@ -2200,7 +2180,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+ unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index e6265e2c274e..7d885a44dc9d 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
- top_iph->ttl = ip4_dst_hoplimit(dst->child);
+ top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
top_iph->saddr = x->props.saddr.a4;
top_iph->daddr = x->id.daddr.a4;