aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c14
-rw-r--r--net/ipv4/arp.c8
-rw-r--r--net/ipv4/devinet.c4
-rw-r--r--net/ipv4/fib_frontend.c27
-rw-r--r--net/ipv4/fib_semantics.c161
-rw-r--r--net/ipv4/icmp.c27
-rw-r--r--net/ipv4/igmp.c6
-rw-r--r--net/ipv4/inet_connection_sock.c264
-rw-r--r--net/ipv4/inet_diag.c96
-rw-r--r--net/ipv4/inet_hashtables.c53
-rw-r--r--net/ipv4/ip_forward.c2
-rw-r--r--net/ipv4/ip_fragment.c13
-rw-r--r--net/ipv4/ip_gre.c46
-rw-r--r--net/ipv4/ip_input.c10
-rw-r--r--net/ipv4/ip_output.c89
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/ip_vti.c2
-rw-r--r--net/ipv4/ipconfig.c32
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter.c7
-rw-r--r--net/ipv4/netfilter/Kconfig1
-rw-r--r--net/ipv4/netfilter/arp_tables.c12
-rw-r--r--net/ipv4/netfilter/ip_tables.c20
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c8
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c2
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c26
-rw-r--r--net/ipv4/netfilter/ipt_ah.c2
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c4
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c2
-rw-r--r--net/ipv4/netfilter/iptable_nat.c4
-rw-r--r--net/ipv4/netfilter/iptable_security.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c9
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c2
-rw-r--r--net/ipv4/raw.c10
-rw-r--r--net/ipv4/route.c108
-rw-r--r--net/ipv4/syncookies.c21
-rw-r--r--net/ipv4/sysctl_net_ipv4.c14
-rw-r--r--net/ipv4/tcp.c39
-rw-r--r--net/ipv4/tcp_dctcp.c2
-rw-r--r--net/ipv4/tcp_fastopen.c44
-rw-r--r--net/ipv4/tcp_input.c237
-rw-r--r--net/ipv4/tcp_ipv4.c191
-rw-r--r--net/ipv4/tcp_minisocks.c42
-rw-r--r--net/ipv4/tcp_output.c32
-rw-r--r--net/ipv4/tcp_recovery.c109
-rw-r--r--net/ipv4/tcp_timer.c6
-rw-r--r--net/ipv4/udp.c28
-rw-r--r--net/ipv4/xfrm4_output.c8
-rw-r--r--net/ipv4/xfrm4_policy.c8
56 files changed, 1016 insertions, 867 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 89aacb630a53..c29809f765dc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a556643b874..11c4ca13ec3b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -119,7 +119,7 @@
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
-#include <net/vrf.h>
+#include <net/l3mdev.h>
/* The inetsw table contains everything that inet_create needs to
@@ -219,17 +219,13 @@ int inet_listen(struct socket *sock, int backlog)
* shutdown() (rather than close()).
*/
if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
- !inet_csk(sk)->icsk_accept_queue.fastopenq) {
+ !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
- err = fastopen_init_queue(sk, backlog);
+ fastopen_queue_tune(sk, backlog);
else if ((sysctl_tcp_fastopen &
TFO_SERVER_WO_SOCKOPT2) != 0)
- err = fastopen_init_queue(sk,
+ fastopen_queue_tune(sk,
((uint)sysctl_tcp_fastopen) >> 16);
- else
- err = 0;
- if (err)
- goto out;
tcp_fastopen_init_key_once(true);
}
@@ -450,7 +446,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id;
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
/* Not specified by any standard per-se, however it breaks too
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 01308e6e6127..59b3e0e8fd51 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -312,7 +312,7 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip,
if (!skb)
return;
- skb_dst_set(skb, dst);
+ skb_dst_set(skb, dst_clone(dst));
arp_xmit(skb);
}
@@ -384,7 +384,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
}
if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
- dst = dst_clone(skb_dst(skb));
+ dst = skb_dst(skb);
arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
dst_hw, dev->dev_addr, NULL, dst);
}
@@ -816,7 +816,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
} else {
pneigh_enqueue(&arp_tbl,
in_dev->arp_parms, skb);
- return 0;
+ goto out_free_dst;
}
goto out;
}
@@ -870,6 +870,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
out:
consume_skb(skb);
+out_free_dst:
+ dst_release(reply_dst);
return 0;
}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 735008472844..cebd9d31e65a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1644,7 +1644,8 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}
-static size_t inet_get_link_af_size(const struct net_device *dev)
+static size_t inet_get_link_af_size(const struct net_device *dev,
+ u32 ext_filter_mask)
{
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
@@ -2398,4 +2399,3 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
inet_netconf_dump_devconf, NULL);
}
-
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6fcbd215cdbc..e786873c89f2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -45,7 +45,7 @@
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
#include <trace/events/fib.h>
#ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -255,7 +255,7 @@ EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
return __inet_dev_addr_type(net, dev, addr, rt_table);
}
@@ -268,7 +268,7 @@ unsigned int inet_addr_type_dev_table(struct net *net,
const struct net_device *dev,
__be32 addr)
{
- u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+ u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
return __inet_dev_addr_type(net, NULL, addr, rt_table);
}
@@ -332,7 +332,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = vrf_master_ifindex_rcu(dev);
+ fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
if (!fl4.flowi4_iif)
fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
@@ -340,6 +340,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
+ fl4.flowi4_flags = 0;
no_addr = idev->ifa_list == NULL;
@@ -366,7 +367,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
if (nh->nh_dev == dev) {
dev_match = true;
break;
- } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+ } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
dev_match = true;
break;
}
@@ -803,7 +804,7 @@ out:
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
- u32 tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev);
+ u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -866,9 +867,10 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- prefix, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
@@ -913,9 +915,10 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
} else if (!ipv4_is_zeronet(any) &&
(any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_DELROUTE,
- dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
subnet = 1;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 064bd3caaa4f..42778d9d71e5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,8 +57,7 @@ static unsigned int fib_info_cnt;
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-static DEFINE_SPINLOCK(fib_multipath_lock);
+u32 fib_multipath_secret __read_mostly;
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
@@ -532,7 +531,67 @@ errout:
return ret;
}
-#endif
+static void fib_rebalance(struct fib_info *fi)
+{
+ int total;
+ int w;
+ struct in_device *in_dev;
+
+ if (fi->fib_nhs < 2)
+ return;
+
+ total = 0;
+ for_nexthops(fi) {
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN)
+ continue;
+
+ total += nh->nh_weight;
+ } endfor_nexthops(fi);
+
+ w = 0;
+ change_nexthops(fi) {
+ int upper_bound;
+
+ in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
+
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+ upper_bound = -1;
+ } else if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+ upper_bound = -1;
+ } else {
+ w += nexthop_nh->nh_weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+ total) - 1;
+ }
+
+ atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+ } endfor_nexthops(fi);
+
+ net_get_random_once(&fib_multipath_secret,
+ sizeof(fib_multipath_secret));
+}
+
+static inline void fib_add_weight(struct fib_info *fi,
+ const struct fib_nh *nh)
+{
+ fi->fib_weight += nh->nh_weight;
+}
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define fib_rebalance(fi) do { } while (0)
+#define fib_add_weight(fi, nh) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
@@ -1094,8 +1153,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
+ fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
+ fib_rebalance(fi);
+
link_it:
ofi = fib_find_info(fi);
if (ofi) {
@@ -1317,12 +1379,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
break;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nexthop_nh->nh_power;
- nexthop_nh->nh_power = 0;
- spin_unlock_bh(&fib_multipath_lock);
-#endif
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1345,6 +1401,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
}
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
@@ -1467,20 +1525,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
!__in_dev_get_rtnl(dev))
continue;
alive++;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- nexthop_nh->nh_power = 0;
nexthop_nh->nh_flags &= ~nh_flags;
- spin_unlock_bh(&fib_multipath_lock);
-#else
- nexthop_nh->nh_flags &= ~nh_flags;
-#endif
} endfor_nexthops(fi)
if (alive > 0) {
fi->fib_flags &= ~nh_flags;
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
@@ -1488,62 +1541,40 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
- struct in_device *in_dev;
- int w;
-
- spin_lock_bh(&fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
- if (nexthop_nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
- continue;
- power += nexthop_nh->nh_weight;
- nexthop_nh->nh_power = nexthop_nh->nh_weight;
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power <= 0) {
- spin_unlock_bh(&fib_multipath_lock);
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
- return;
- }
- }
-
- /* w should be random number [0..fi->fib_power-1],
- * it is pretty bad approximation.
- */
-
- w = jiffies % fi->fib_power;
+ for_nexthops(fi) {
+ if (hash > atomic_read(&nh->nh_upper_bound))
+ continue;
- change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
- nexthop_nh->nh_power) {
- w -= nexthop_nh->nh_power;
- if (w <= 0) {
- nexthop_nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&fib_multipath_lock);
- return;
- }
- }
+ res->nh_sel = nhsel;
+ return;
} endfor_nexthops(fi);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
- spin_unlock_bh(&fib_multipath_lock);
}
#endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+ struct flowi4 *fl4, int mp_hash)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (mp_hash < 0)
+ mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr);
+ fib_select_multipath(res, mp_hash);
+ }
+ else
+#endif
+ if (!res->prefixlen &&
+ res->table->tb_num_default > 1 &&
+ res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(fl4, res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, *res);
+}
+EXPORT_SYMBOL_GPL(fib_select_path);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e5eb8ac4089d..36e26977c908 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -96,7 +96,7 @@
#include <net/xfrm.h>
#include <net/inet_common.h>
#include <net/ip_fib.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
/*
* Build xmit assembly blocks
@@ -309,7 +309,7 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
rc = false;
if (icmp_global_allow()) {
- int vif = vrf_master_ifindex(dst->dev);
+ int vif = l3mdev_master_ifindex(dst->dev);
struct inet_peer *peer;
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
@@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
fl4.flowi4_proto = IPPROTO_ICMP;
- fl4.flowi4_oif = vrf_master_ifindex(skb->dev);
+ fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
@@ -440,6 +440,22 @@ out_unlock:
icmp_xmit_unlock(sk);
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Source and destination is swapped. See ip_multipath_icmp_hash */
+static int icmp_multipath_hash_skb(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return fib_multipath_hash(iph->daddr, iph->saddr);
+}
+
+#else
+
+#define icmp_multipath_hash_skb(skb) (-1)
+
+#endif
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -461,10 +477,11 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
- fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev);
+ fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key(net, fl4);
+ rt = __ip_route_output_key_hash(net, fl4,
+ icmp_multipath_hash_skb(skb_in));
if (IS_ERR(rt))
return rt;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d38b8b61eaee..64aaf3522a59 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return ip_local_out(skb);
+ return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
static void igmp_gq_timer_expire(unsigned long data)
@@ -2569,7 +2569,7 @@ void ip_mc_drop_socket(struct sock *sk)
}
/* called with rcu_read_lock() */
-int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
struct ip_mc_list *im;
struct ip_mc_list __rcu **mc_hash;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ba2f90d90cb5..1feb15f23de8 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -330,14 +330,12 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error)
goto out_err;
}
- req = reqsk_queue_remove(queue);
+ req = reqsk_queue_remove(queue, sk);
newsk = req->sk;
- sk_acceptq_removed(sk);
if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_rsk(req)->tfo_listener &&
- queue->fastopenq) {
- spin_lock_bh(&queue->fastopenq->lock);
+ tcp_rsk(req)->tfo_listener) {
+ spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
@@ -348,7 +346,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
req->sk = NULL;
req = NULL;
}
- spin_unlock_bh(&queue->fastopenq->lock);
+ spin_unlock_bh(&queue->fastopenq.lock);
}
out:
release_sock(sk);
@@ -439,7 +437,7 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
-struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
const struct request_sock *req)
{
@@ -478,65 +476,12 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
- const u32 rnd, const u32 synq_hsize)
-{
- return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
-}
-
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif
-/* Note: this is temporary :
- * req sock will no longer be in listener hash table
-*/
-struct request_sock *inet_csk_search_req(struct sock *sk,
- const __be16 rport,
- const __be32 raddr,
- const __be32 laddr)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req;
- u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries);
-
- spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
- for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- if (ireq->ir_rmt_port == rport &&
- ireq->ir_rmt_addr == raddr &&
- ireq->ir_loc_addr == laddr &&
- AF_INET_FAMILY(req->rsk_ops->family)) {
- atomic_inc(&req->rsk_refcnt);
- WARN_ON(req->sk);
- break;
- }
- }
- spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return req;
-}
-EXPORT_SYMBOL_GPL(inet_csk_search_req);
-
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
- inet_rsk(req)->ir_rmt_port,
- lopt->hash_rnd, lopt->nr_table_entries);
-
- reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
- inet_csk_reqsk_queue_added(sk, timeout);
-}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
-
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
@@ -573,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
-/* return true if req was found in the syn_table[] */
+/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
struct request_sock *req)
{
- struct listen_sock *lopt = queue->listen_opt;
- struct request_sock **prev;
+ struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
bool found = false;
- spin_lock(&queue->syn_wait_lock);
+ if (sk_hashed(req_to_sk(req))) {
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
- for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
- prev = &(*prev)->dl_next) {
- if (*prev == req) {
- *prev = req->dl_next;
- found = true;
- break;
- }
+ spin_lock(lock);
+ found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
+ spin_unlock(lock);
}
-
- spin_unlock(&queue->syn_wait_lock);
if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
reqsk_put(req);
return found;
@@ -607,21 +546,25 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
+{
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_put(req);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+
static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
struct sock *sk_listener = req->rsk_listener;
struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct listen_sock *lopt = queue->listen_opt;
int qlen, expire = 0, resend = 0;
int max_retries, thresh;
u8 defer_accept;
- if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
- reqsk_put(req);
- return;
- }
+ if (sk_listener->sk_state != TCP_LISTEN)
+ goto drop;
max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
thresh = max_retries;
@@ -642,9 +585,9 @@ static void reqsk_timer_handler(unsigned long data)
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
- qlen = listen_sock_qlen(lopt);
- if (qlen >> (lopt->max_qlen_log - 1)) {
- int young = listen_sock_young(lopt) << 1;
+ qlen = reqsk_queue_len(queue);
+ if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+ int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) {
if (qlen < young)
@@ -666,41 +609,40 @@ static void reqsk_timer_handler(unsigned long data)
unsigned long timeo;
if (req->num_timeout++ == 0)
- atomic_inc(&lopt->young_dec);
+ atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
return;
}
- inet_csk_reqsk_queue_drop(sk_listener, req);
- reqsk_put(req);
+drop:
+ inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}
-void reqsk_queue_hash_req(struct request_sock_queue *queue,
- u32 hash, struct request_sock *req,
- unsigned long timeout)
+static void reqsk_queue_hash_req(struct request_sock *req,
+ unsigned long timeout)
{
- struct listen_sock *lopt = queue->listen_opt;
-
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
- req->rsk_hash = hash;
+ inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
- atomic_set(&req->rsk_refcnt, 2);
+ atomic_set(&req->rsk_refcnt, 2 + 1);
+}
- spin_lock(&queue->syn_wait_lock);
- req->dl_next = lopt->syn_table[hash];
- lopt->syn_table[hash] = req;
- spin_unlock(&queue->syn_wait_lock);
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ unsigned long timeout)
+{
+ reqsk_queue_hash_req(req, timeout);
+ inet_csk_reqsk_queue_added(sk);
}
-EXPORT_SYMBOL(reqsk_queue_hash_req);
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
@@ -791,16 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+int inet_csk_listen_start(struct sock *sk, int backlog)
{
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+ struct inet_sock *inet = inet_sk(sk);
- if (rc != 0)
- return rc;
+ reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = 0;
+ sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
@@ -820,11 +760,76 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
}
sk->sk_state = TCP_CLOSE;
- __reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ }
+ inet_csk_destroy_sock(child);
+ reqsk_put(req);
+}
+
+void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+ spin_lock(&queue->rskq_lock);
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_child_forget(sk, req, child);
+ } else {
+ req->sk = child;
+ req->dl_next = NULL;
+ if (queue->rskq_accept_head == NULL)
+ queue->rskq_accept_head = req;
+ else
+ queue->rskq_accept_tail->dl_next = req;
+ queue->rskq_accept_tail = req;
+ sk_acceptq_added(sk);
+ }
+ spin_unlock(&queue->rskq_lock);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+ struct request_sock *req, bool own_req)
+{
+ if (own_req) {
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ inet_csk_reqsk_queue_add(sk, req, child);
+ /* Warning: caller must not call reqsk_put(req);
+ * child stole last reference on it.
+ */
+ return child;
+ }
+ /* Too bad, another child took ownership of the request, undo. */
+ bh_unlock_sock(child);
+ sock_put(child);
+ return NULL;
+}
+EXPORT_SYMBOL(inet_csk_complete_hashdance);
+
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted.
@@ -833,11 +838,7 @@ void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct request_sock *acc_req;
- struct request_sock *req;
-
- /* make all the listen_opt local to us */
- acc_req = reqsk_queue_yank_acceptq(queue);
+ struct request_sock *next, *req;
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
@@ -847,57 +848,34 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
- reqsk_queue_destroy(queue);
-
- while ((req = acc_req) != NULL) {
+ while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
struct sock *child = req->sk;
- acc_req = req->dl_next;
-
local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
- sk->sk_prot->disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- percpu_counter_inc(sk->sk_prot->orphan_count);
-
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
- BUG_ON(sk != req->rsk_listener);
-
- /* Paranoid, to prevent race condition if
- * an inbound pkt destined for child is
- * blocked by sock lock in tcp_v4_rcv().
- * Also to satisfy an assertion in
- * tcp_v4_destroy_sock().
- */
- tcp_sk(child)->fastopen_rsk = NULL;
- }
- inet_csk_destroy_sock(child);
-
+ inet_child_forget(sk, req, child);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
- sk_acceptq_removed(sk);
- reqsk_put(req);
+ cond_resched();
}
- if (queue->fastopenq) {
+ if (queue->fastopenq.rskq_rst_head) {
/* Free all the reqs queued in rskq_rst_head. */
- spin_lock_bh(&queue->fastopenq->lock);
- acc_req = queue->fastopenq->rskq_rst_head;
- queue->fastopenq->rskq_rst_head = NULL;
- spin_unlock_bh(&queue->fastopenq->lock);
- while ((req = acc_req) != NULL) {
- acc_req = req->dl_next;
+ spin_lock_bh(&queue->fastopenq.lock);
+ req = queue->fastopenq.rskq_rst_head;
+ queue->fastopenq.rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq.lock);
+ while (req != NULL) {
+ next = req->dl_next;
reqsk_put(req);
+ req = next;
}
}
- WARN_ON(sk->sk_ack_backlog);
+ WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c3b1f3a0f4cf..ab9f8a66615d 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -730,91 +730,21 @@ static void twsk_build_assert(void)
#endif
}
-static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_diag_entry entry;
- int j, s_j, reqnum, s_reqnum;
- struct listen_sock *lopt;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- lopt = icsk->icsk_accept_queue.listen_opt;
- if (!lopt || !listen_sock_qlen(lopt))
- goto out;
-
- if (bc) {
- entry.sport = inet->inet_num;
- entry.userlocks = sk->sk_userlocks;
- }
-
- for (j = s_j; j < lopt->nr_table_entries; j++) {
- struct request_sock *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- if (reqnum < s_reqnum)
- continue;
- if (r->id.idiag_dport != ireq->ir_rmt_port &&
- r->id.idiag_dport)
- continue;
-
- if (bc) {
- /* Note: entry.sport and entry.userlocks are already set */
- entry_fill_addrs(&entry, req_to_sk(req));
- entry.dport = ntohs(ireq->ir_rmt_port);
-
- if (!inet_diag_bc_run(bc, &entry))
- continue;
- }
-
- err = inet_req_diag_fill(req_to_sk(req), skb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI, cb->nlh);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
- }
-
- s_reqnum = 0;
- }
-
-out:
- spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
-
- return err;
-}
-
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
+ u32 idiag_states = r->idiag_states;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1];
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
- if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & TCPF_LISTEN))
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
@@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (!(r->idiag_states & TCPF_LISTEN) ||
- r->id.idiag_dport ||
+ if (r->id.idiag_dport ||
cb->args[3] > 0)
- goto syn_recv;
-
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
- spin_unlock_bh(&ilb->lock);
- goto done;
- }
-
-syn_recv:
- if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
- if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -879,7 +799,7 @@ skip_listen_ht:
s_i = num = s_num = 0;
}
- if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ if (!(idiag_states & ~TCPF_LISTEN))
goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
@@ -906,7 +826,7 @@ skip_listen_ht:
goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_substate : sk->sk_state;
- if (!(r->idiag_states & (1 << state)))
+ if (!(idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 89120196a949..ccc5980797fc 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -126,7 +126,7 @@ void inet_put_port(struct sock *sk)
}
EXPORT_SYMBOL(inet_put_port);
-int __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
unsigned short port = inet_sk(child)->inet_num;
@@ -137,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
+ if (unlikely(!tb)) {
+ spin_unlock(&head->lock);
+ return -ENOENT;
+ }
if (tb->port != port) {
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
@@ -185,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
}
return score;
}
@@ -398,14 +404,18 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport);
}
-void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
+/* insert a socket into ehash, and eventually remove another one
+ * (The another one can be a SYN_RECV or TIMEWAIT
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
+ bool ret = true;
- WARN_ON(!sk_unhashed(sk));
+ WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
@@ -413,24 +423,41 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock(lock);
- __sk_nulls_add_node_rcu(sk, list);
if (osk) {
- WARN_ON(sk->sk_hash != osk->sk_hash);
- sk_nulls_del_node_init_rcu(osk);
+ WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+ ret = sk_nulls_del_node_init_rcu(osk);
}
+ if (ret)
+ __sk_nulls_add_node_rcu(sk, list);
spin_unlock(lock);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ return ret;
+}
+
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+{
+ bool ok = inet_ehash_insert(sk, osk);
+
+ if (ok) {
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ } else {
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ sk->sk_state = TCP_CLOSE;
+ sock_set_flag(sk, SOCK_DEAD);
+ inet_csk_destroy_sock(sk);
+ }
+ return ok;
}
-EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
void __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
- if (sk->sk_state != TCP_LISTEN)
- return __inet_hash_nolisten(sk, osk);
-
+ if (sk->sk_state != TCP_LISTEN) {
+ inet_ehash_nolisten(sk, osk);
+ return;
+ }
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -551,7 +578,7 @@ ok:
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- __inet_hash_nolisten(sk, (struct sock *)tw);
+ inet_ehash_nolisten(sk, (struct sock *)tw);
}
if (tw)
inet_twsk_bind_unhash(tw, hinfo);
@@ -568,7 +595,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __inet_hash_nolisten(sk, NULL);
+ inet_ehash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index d66cfb35ba74..da0d7ce85844 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -72,7 +72,7 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
ip_forward_options(skb);
skb_sender_cpu_clear(skb);
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
int ip_forward(struct sk_buff *skb)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fa7f15305f9a..5482745d5d68 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -48,7 +48,7 @@
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
#include <net/inet_ecn.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -78,7 +78,7 @@ struct ipq {
u8 ecn; /* RFC3168 support */
u16 max_df_size; /* largest frag with DF set seen */
int iif;
- int vif; /* VRF device index */
+ int vif; /* L3 master device index */
unsigned int rid;
struct inet_peer *peer;
};
@@ -654,11 +654,10 @@ out_fail:
}
/* Process an incoming IP datagram fragment. */
-int ip_defrag(struct sk_buff *skb, u32 user)
+int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
- int vif = vrf_master_ifindex_rcu(dev);
- struct net *net = dev_net(dev);
+ int vif = l3mdev_master_ifindex_rcu(dev);
struct ipq *qp;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
@@ -683,7 +682,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_defrag);
-struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct iphdr iph;
int netoff;
@@ -712,7 +711,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
if (pskb_trim_rcsum(skb, netoff + len))
return skb;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- if (ip_defrag(skb, user))
+ if (ip_defrag(net, skb, user))
return NULL;
skb_clear_hash(skb);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index bd0679d90519..614521437e30 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -498,10 +498,26 @@ static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
+static struct rtable *gre_get_rt(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi4 *fl,
+ const struct ip_tunnel_key *key)
+{
+ struct net *net = dev_net(dev);
+
+ memset(fl, 0, sizeof(*fl));
+ fl->daddr = key->u.ipv4.dst;
+ fl->saddr = key->u.ipv4.src;
+ fl->flowi4_tos = RT_TOS(key->tos);
+ fl->flowi4_mark = skb->mark;
+ fl->flowi4_proto = IPPROTO_GRE;
+
+ return ip_route_output_key(net, fl);
+}
+
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel_info *tun_info;
- struct net *net = dev_net(dev);
const struct ip_tunnel_key *key;
struct flowi4 fl;
struct rtable *rt;
@@ -516,14 +532,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
goto err_free_skb;
key = &tun_info->key;
- memset(&fl, 0, sizeof(fl));
- fl.daddr = key->u.ipv4.dst;
- fl.saddr = key->u.ipv4.src;
- fl.flowi4_tos = RT_TOS(key->tos);
- fl.flowi4_mark = skb->mark;
- fl.flowi4_proto = IPPROTO_GRE;
-
- rt = ip_route_output_key(net, &fl);
+ rt = gre_get_rt(skb, dev, &fl, key);
if (IS_ERR(rt))
goto err_free_skb;
@@ -566,6 +575,24 @@ err_free_skb:
dev->stats.tx_dropped++;
}
+static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
+
+ rt = gre_get_rt(skb, dev, &fl4, &info->key);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ ip_rt_put(rt);
+ info->key.u.ipv4.src = fl4.saddr;
+ return 0;
+}
+
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct net_device *dev)
{
@@ -1023,6 +1050,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
.ndo_change_mtu = ip_tunnel_change_mtu,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
static void ipgre_tap_setup(struct net_device *dev)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7cc9f7bb7fb7..b1209b63381f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -157,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
u8 protocol = ip_hdr(skb)->protocol;
struct sock *last = NULL;
struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
@@ -167,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb)
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
- net_eq(sock_net(sk), dev_net(dev))) {
+ net_eq(sock_net(sk), net)) {
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+ if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true;
}
if (last) {
@@ -246,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb)
/*
* Reassemble IP fragments.
*/
+ struct net *net = dev_net(skb->dev);
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+ if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 06d2c87ed505..50e29737b584 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -83,9 +83,10 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
-static int ip_fragment(struct sock *sk, struct sk_buff *skb,
- unsigned int mtu,
- int (*output)(struct sock *, struct sk_buff *));
+static int
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *));
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
@@ -95,34 +96,28 @@ void ip_send_check(struct iphdr *iph)
}
EXPORT_SYMBOL(ip_send_check);
-static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
- dst_output_okfn);
-}
-
-int __ip_local_out(struct sk_buff *skb)
-{
- return __ip_local_out_sk(skb->sk, skb);
+ dst_output);
}
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
- err = __ip_local_out(skb);
+ err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
- err = dst_output(sk, skb);
+ err = dst_output(net, sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out_sk);
+EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
@@ -142,6 +137,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct iphdr *iph;
/* Build the IP header. */
@@ -160,7 +156,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph->id = 0;
} else {
iph->frag_off = 0;
- __ip_select_ident(sock_net(sk), iph, 1);
+ __ip_select_ident(net, iph, 1);
}
if (opt && opt->opt.optlen) {
@@ -172,16 +168,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
skb->mark = sk->sk_mark;
/* Send it out. */
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
- struct net *net = dev_net(dev);
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
u32 nexthop;
@@ -225,8 +220,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
- unsigned int mtu)
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -235,7 +230,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
skb_gso_network_seglen(skb) <= mtu)
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
*
@@ -258,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
int err;
segs->next = NULL;
- err = ip_fragment(sk, segs, mtu, ip_finish_output2);
+ err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -276,24 +271,23 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(sk, skb, mtu);
+ return ip_finish_output_gso(net, sk, skb, mtu);
if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
- return ip_fragment(sk, skb, mtu, ip_finish_output2);
+ return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
}
-int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
- struct net *net = dev_net(dev);
/*
* If the indicated interface is up and running, send the packet.
@@ -352,10 +346,9 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sock *sk, struct sk_buff *skb)
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
- struct net *net = dev_net(dev);
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
@@ -386,6 +379,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
@@ -416,7 +410,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
@@ -453,20 +447,20 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_segs(sock_net(sk), skb, sk,
+ ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- res = ip_local_out(skb);
+ res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
@@ -495,20 +489,18 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
-static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
unsigned int mtu,
- int (*output)(struct sock *, struct sk_buff *))
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph = ip_hdr(skb);
if ((iph->frag_off & htons(IP_DF)) == 0)
- return ip_do_fragment(sk, skb, output);
+ return ip_do_fragment(net, sk, skb, output);
if (unlikely(!skb->ignore_df ||
(IPCB(skb)->frag_max_size &&
IPCB(skb)->frag_max_size > mtu))) {
- struct net *net = dev_net(skb_rtable(skb)->dst.dev);
-
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
@@ -516,7 +508,7 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb,
return -EMSGSIZE;
}
- return ip_do_fragment(sk, skb, output);
+ return ip_do_fragment(net, sk, skb, output);
}
/*
@@ -526,8 +518,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb,
* single device frame, and queue such a frame for sending.
*/
-int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -537,11 +529,9 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
int offset;
__be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
- struct net *net;
int err = 0;
dev = rt->dst.dev;
- net = dev_net(dev);
/*
* Point into the IP datagram header.
@@ -631,7 +621,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
ip_send_check(iph);
}
- err = output(sk, skb);
+ err = output(net, sk, skb);
if (!err)
IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
@@ -771,7 +761,7 @@ slow_path:
ip_send_check(iph);
- err = output(sk, skb2);
+ err = output(net, sk, skb2);
if (err)
goto fail;
@@ -1444,7 +1434,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb)
{
int err;
- err = ip_local_out(skb);
+ err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
@@ -1571,7 +1561,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
}
oif = arg->bound_dev_if;
- if (!oif && netif_index_is_vrf(net, skb->skb_iif))
+ if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
oif = skb->skb_iif;
flowi4_init_output(&fl4, oif,
@@ -1606,7 +1596,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
ip_push_pending_frames(sk, &fl4);
}
out:
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 84dce6a92f93..6cb9009c3d96 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -53,6 +53,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
{
int pkt_len = skb->len - skb_inner_network_offset(skb);
+ struct net *net = dev_net(rt->dst.dev);
struct iphdr *iph;
int err;
@@ -76,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(dev_net(rt->dst.dev), iph,
- skb_shinfo(skb)->gso_segs ?: 1);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
- err = ip_local_out_sk(sk, skb);
+ err = ip_local_out(net, sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3b87ec5178f9..4d8f0b698777 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
- err = dst_output(skb->sk, skb);
+ err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index ed4ef09c2136..0bc7412d9e14 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -146,6 +146,10 @@ u8 root_server_path[256] = { 0, }; /* Path to mount as root */
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
+#if defined(CONFIG_IP_PNP_DHCP)
+static char dhcp_client_identifier[253] __initdata;
+#endif
+
/* Persistent data: */
static int ic_proto_used; /* Protocol used, if any */
@@ -728,6 +732,16 @@ ic_dhcp_init_options(u8 *options)
memcpy(e, vendor_class_identifier, len);
e += len;
}
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of identifier is 2, include 1 byte type,
+ * and can not be larger than the length of options
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
+ }
}
*e++ = 255; /* End of the list */
@@ -1557,8 +1571,24 @@ static int __init ic_proto_name(char *name)
return 0;
}
#ifdef CONFIG_IP_PNP_DHCP
- else if (!strcmp(name, "dhcp")) {
+ else if (!strncmp(name, "dhcp", 4)) {
+ char *client_id;
+
ic_proto_enabled &= ~IC_RARP;
+ client_id = strstr(name, "dhcp,");
+ if (client_id) {
+ char *v;
+
+ client_id = client_id + 5;
+ v = strchr(client_id, ',');
+ if (!v)
+ return 1;
+ *v = 0;
+ if (kstrtou8(client_id, 0, dhcp_client_identifier))
+ DBG("DHCP: Invalid client identifier type\n");
+ strncpy(dhcp_client_identifier + 1, v + 1, 251);
+ *v = ',';
+ }
return 1;
}
#endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index cfcb996ec51b..fc42525d8694 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1689,7 +1689,7 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
/*
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 61eafc9b4545..c3776ff6749f 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,9 +17,8 @@
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
+int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi4 fl4 = {};
@@ -104,7 +103,7 @@ static void nf_ip_saveroute(const struct sk_buff *skb,
}
}
-static int nf_ip_reroute(struct sk_buff *skb,
+static int nf_ip_reroute(struct net *net, struct sk_buff *skb,
const struct nf_queue_entry *entry)
{
const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
@@ -116,7 +115,7 @@ static int nf_ip_reroute(struct sk_buff *skb,
skb->mark == rt_info->mark &&
iph->daddr == rt_info->daddr &&
iph->saddr == rt_info->saddr))
- return ip_route_me_harder(skb, RTN_UNSPEC);
+ return ip_route_me_harder(net, skb, RTN_UNSPEC);
}
return 0;
}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 690d27d3f2f9..a35584176535 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -75,6 +75,7 @@ endif # NF_TABLES
config NF_DUP_IPV4
tristate "Netfilter IPv4 packet duplication to alternate destination"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
help
This option enables the nf_dup_ipv4 core, which duplicates an IPv4
packet to be rerouted to another destination.
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 2dad3e1c5f11..11dccba474b7 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -186,7 +186,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, arpinfo->iniface,
- arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : "");
return 0;
}
@@ -195,7 +195,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, arpinfo->outiface,
- arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : "");
return 0;
}
@@ -468,7 +468,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -632,7 +632,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
* newinfo).
*/
static int translate_table(struct xt_table_info *newinfo, void *entry0,
- const struct arpt_replace *repl)
+ const struct arpt_replace *repl)
{
struct arpt_entry *iter;
unsigned int i;
@@ -892,7 +892,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1069,7 +1069,7 @@ static int __do_replace(struct net *net, const char *name,
}
static int do_replace(struct net *net, const void __user *user,
- unsigned int len)
+ unsigned int len)
{
int ret;
struct arpt_replace tmp;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 42d0946956db..b99affad6ba1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -102,7 +102,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
dprintf("VIA in mismatch (%s vs %s).%s\n",
indev, ipinfo->iniface,
- ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : "");
return false;
}
@@ -111,7 +111,7 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
dprintf("VIA out mismatch (%s vs %s).%s\n",
outdev, ipinfo->outiface,
- ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+ ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : "");
return false;
}
@@ -120,7 +120,7 @@ ip_packet_match(const struct iphdr *ip,
FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
dprintf("Packet protocol %hi does not match %hi.%s\n",
ip->protocol, ipinfo->proto,
- ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+ ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : "");
return false;
}
@@ -431,8 +431,8 @@ ipt_do_table(struct sk_buff *skb,
} while (!acpar.hotdrop);
pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
- xt_write_recseq_end(addend);
- local_bh_enable();
+ xt_write_recseq_end(addend);
+ local_bh_enable();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -484,7 +484,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- XT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -549,7 +549,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
- next:
+next:
duprintf("Finished chain %u\n", hook);
}
return 1;
@@ -804,7 +804,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
newinfo) */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
- const struct ipt_replace *repl)
+ const struct ipt_replace *repl)
{
struct ipt_entry *iter;
unsigned int i;
@@ -1078,7 +1078,7 @@ static int compat_table_info(const struct xt_table_info *info,
#endif
static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+ const int *len, int compat)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1304,7 +1304,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
static int
do_add_counters(struct net *net, const void __user *user,
- unsigned int len, int compat)
+ unsigned int len, int compat)
{
unsigned int i;
struct xt_counters_info tmp;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3f32c03e8b2e..4a9e6db9df8d 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -492,14 +492,14 @@ static void arp_print(struct arp_payload *payload)
{
#define HBUFFERLEN 30
char hbuffer[HBUFFERLEN];
- int j,k;
+ int j, k;
- for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) {
hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
- hbuffer[k++]=':';
+ hbuffer[k++] = ':';
}
- hbuffer[--k]='\0';
+ hbuffer[--k] = '\0';
pr_debug("src %pI4@%s, dst %pI4\n",
&payload->src_ip, hbuffer, &payload->dst_ip);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 87907d4bd259..1d16c0f28df0 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(skb, hook);
+ nf_send_reset(par->net, skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index d7021f28c3f0..5fdc556514ba 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -39,11 +39,14 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
}
static void
-synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+synproxy_send_tcp(const struct synproxy_net *snet,
+ const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct iphdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
+ struct net *net = nf_ct_net(snet->tmpl);
+
nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
@@ -51,7 +54,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
@@ -60,7 +63,7 @@ synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
nf_conntrack_get(nfct);
}
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
free_nskb:
@@ -68,7 +71,8 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+synproxy_send_client_synack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
struct sk_buff *nskb;
@@ -104,7 +108,7 @@ synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
@@ -148,7 +152,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
@@ -188,7 +192,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
@@ -226,8 +230,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
- niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
}
static bool
@@ -287,7 +291,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(skb, th, &opts);
+ synproxy_send_client_synack(snet, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
@@ -433,14 +437,12 @@ static struct xt_target synproxy_tg4_reg __read_mostly = {
static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
},
{
.hook = ipv4_synproxy_hook,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 14a2aa8b8a14..a787d07f6cb7 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -25,7 +25,7 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
bool r;
pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
invert ? '!' : ' ', min, spi, max);
- r=(spi >= min && spi <= max) ^ invert;
+ r = (spi >= min && spi <= max) ^ invert;
pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 74dd6671b66d..78cc64eddfc1 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -60,9 +60,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
if (FIB_RES_DEV(res) == dev)
dev_match = true;
#endif
- if (dev_match || flags & XT_RPFILTER_LOOSE)
- return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
- return dev_match;
+ return dev_match || flags & XT_RPFILTER_LOOSE;
}
static bool rpfilter_is_local(const struct sk_buff *skb)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 2d6fc911866f..ba5d392a13c4 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -67,7 +67,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 3a2e4d830a0b..ae2cd2752046 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -68,7 +68,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
@@ -76,7 +75,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_out,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
@@ -84,7 +82,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* Before packet filtering, change destination */
{
.hook = iptable_nat_ipv4_local_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
@@ -92,7 +89,6 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
/* After packet filtering, change source */
{
.hook = iptable_nat_ipv4_fn,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index f534e2f05bad..c2e23d5e9cd4 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -79,7 +79,7 @@ static int __init iptable_security_init(void)
int ret;
ret = register_pernet_subsys(&iptable_security_net_ops);
- if (ret < 0)
+ if (ret < 0)
return ret;
sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 752fb40adcf8..461ca926fd39 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -166,42 +166,36 @@ static unsigned int ipv4_conntrack_local(void *priv,
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
.hook = ipv4_conntrack_in,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_local,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_helper,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_HELPER,
},
{
.hook = ipv4_confirm,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index b246346ee849..0e5591c2ee9f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,14 +22,15 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
+ u_int32_t user)
{
int err;
skb_orphan(skb);
local_bh_disable();
- err = ip_defrag(skb, user);
+ err = ip_defrag(net, skb, user);
local_bh_enable();
if (!err) {
@@ -85,7 +86,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
enum ip_defrag_users user =
nf_ct_defrag_user(state->hook, skb);
- if (nf_ct_ipv4_gather_frags(skb, user))
+ if (nf_ct_ipv4_gather_frags(state->net, skb, user))
return NF_STOLEN;
}
return NF_ACCEPT;
@@ -94,14 +95,12 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
static struct nf_hook_ops ipv4_defrag_ops[] = {
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
},
{
.hook = ipv4_conntrack_defrag,
- .owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK_DEFRAG,
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index ce2a59e5c665..ceb187308120 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -92,7 +92,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
__this_cpu_write(nf_skb_duplicated, true);
- ip_local_out(skb);
+ ip_local_out(net, skb->sk, skb);
__this_cpu_write(nf_skb_duplicated, false);
} else {
kfree_skb(skb);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index bc3b9dcbf080..5075b7ecd26d 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -431,7 +431,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 7c676671329d..ddb894ac1458 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1156,7 +1156,7 @@ static int snmp_parse_mangle(unsigned char *msg,
}
if (obj->type == SNMP_IPADDR)
- mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+ mangle_address(ctx.begin, ctx.pointer - 4, map, check);
kfree(obj->id);
kfree(obj);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 3262e41ff76f..c747b2d9eb77 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -99,7 +99,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
/* Send RST reply */
-void nf_send_reset(struct sk_buff *oldskb, int hook)
+void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
const struct iphdr *oiph;
@@ -129,7 +129,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
goto free_nskb;
/* "Never happens" */
@@ -157,7 +157,7 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 9f486b302108..2375b0a8be46 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -53,7 +53,7 @@ static unsigned int nf_route_table_hook(void *priv,
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos)
- if (ip_route_me_harder(skb, RTN_UNSPEC))
+ if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
ret = NF_DROP;
}
return ret;
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index c1582e03b628..c24f41c816b3 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -30,7 +30,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
nf_send_unreach(pkt->skb, priv->icmp_code, pkt->hook);
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(pkt->skb, pkt->hook);
+ nf_send_reset(pkt->net, pkt->skb, pkt->hook);
break;
default:
break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 28ef8a913130..8c0d0bdc2a7c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -413,7 +413,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, rt->dst.dev,
- dst_output_okfn);
+ dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -484,6 +484,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
struct flowi4 fl4;
@@ -543,7 +544,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ err = ip_cmsg_send(net, msg, &ipc, false);
if (err)
goto out;
if (ipc.opt)
@@ -598,6 +599,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
+ if (!saddr && ipc.oif)
+ l3mdev_get_saddr(net, ipc.oif, &fl4);
+
if (!inet->hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
@@ -608,7 +612,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
- rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6bab84503cd9..85f184e429c6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,7 +112,7 @@
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -847,7 +847,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
return;
}
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
- vif = vrf_master_ifindex_rcu(rt->dst.dev);
+ vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
rcu_read_unlock();
net = dev_net(rt->dst.dev);
@@ -941,7 +941,7 @@ static int ip_error(struct sk_buff *skb)
}
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
- vrf_master_ifindex(skb->dev), 1);
+ l3mdev_master_ifindex(skb->dev), 1);
send = true;
if (peer) {
@@ -1152,7 +1152,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
pr_debug("%s: %pI4 -> %pI4, %s\n",
__func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1487,9 +1487,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
skb->protocol != htons(ETH_P_IP))
goto e_inval;
- if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
- if (ipv4_is_loopback(saddr))
- goto e_inval;
+ if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
+ goto e_inval;
if (ipv4_is_zeronet(saddr)) {
if (!ipv4_is_local_multicast(daddr))
@@ -1652,6 +1651,48 @@ out:
return err;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses in reverse order.
+ */
+static int ip_multipath_icmp_hash(struct sk_buff *skb)
+{
+ const struct iphdr *outer_iph = ip_hdr(skb);
+ struct icmphdr _icmph;
+ const struct icmphdr *icmph;
+ struct iphdr _inner_iph;
+ const struct iphdr *inner_iph;
+
+ if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+ goto standard_hash;
+
+ icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+ &_icmph);
+ if (!icmph)
+ goto standard_hash;
+
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_REDIRECT &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB) {
+ goto standard_hash;
+ }
+
+ inner_iph = skb_header_pointer(skb,
+ outer_iph->ihl * 4 + sizeof(_icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto standard_hash;
+
+ return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+
+standard_hash:
+ return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
@@ -1659,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ if (res->fi && res->fi->fib_nhs > 1) {
+ int h;
+
+ if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
+ h = ip_multipath_icmp_hash(skb);
+ else
+ h = fib_multipath_hash(saddr, daddr);
+ fib_select_multipath(res, h);
+ }
#endif
/* create a routing cache entry */
@@ -1740,10 +1788,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
* Now we are ready to route packet.
*/
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex;
+ fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
err = fib_lookup(net, &fl4, &res, 0);
@@ -1760,7 +1809,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
err = fib_validate_source(skb, saddr, daddr, tos,
0, dev, in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
goto local_input;
}
@@ -1782,7 +1831,7 @@ brd_input:
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
in_dev, &itag);
if (err < 0)
- goto martian_source_keep_err;
+ goto martian_source;
}
flags |= RTCF_BROADCAST;
res.type = RTN_BROADCAST;
@@ -1858,8 +1907,6 @@ e_nobufs:
goto out;
martian_source:
- err = -EINVAL;
-martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
goto out;
}
@@ -2028,7 +2075,8 @@ add:
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ int mp_hash)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
@@ -2127,11 +2175,10 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
- if (netif_is_vrf(dev_out) &&
- !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
- rth = vrf_dev_get_rth(dev_out);
+
+ rth = l3mdev_get_rtable(dev_out, fl4);
+ if (rth)
goto out;
- }
}
if (!fl4->daddr) {
@@ -2149,7 +2196,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
if (err) {
res.fi = NULL;
res.table = NULL;
- if (fl4->flowi4_oif) {
+ if (fl4->flowi4_oif &&
+ !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2191,18 +2239,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
- else
-#endif
- if (!res.prefixlen &&
- res.table->tb_num_default > 1 &&
- res.type == RTN_UNICAST && !fl4->flowi4_oif)
- fib_select_default(fl4, &res);
-
- if (!fl4->saddr)
- fl4->saddr = FIB_RES_PREFSRC(net, res);
+ fib_select_path(net, &res, fl4, mp_hash);
dev_out = FIB_RES_DEV(res);
fl4->flowi4_oif = dev_out->ifindex;
@@ -2215,7 +2252,7 @@ out:
rcu_read_unlock();
return rth;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
@@ -2267,7 +2304,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard_sk;
+ new->output = dst_discard_out;
new->dev = ort->dst.dev;
if (new->dev)
@@ -2471,6 +2508,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ if (netif_index_is_l3_master(net, fl4.flowi4_oif))
+ fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
+
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 6595affded20..4cbe9f0a4281 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -192,15 +192,11 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
-__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
- __u16 *mssp)
+__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
{
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- tcp_synq_overflow(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
return __cookie_v4_init_sequence(iph, th, mssp);
}
@@ -225,10 +221,13 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
+ bool own_req;
- child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ NULL, &own_req);
if (child) {
atomic_set(&req->rsk_refcnt, 1);
+ sock_rps_save_rxhash(child, skb);
inet_csk_reqsk_queue_add(sk, req, child);
} else {
reqsk_free(req);
@@ -288,6 +287,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
}
EXPORT_SYMBOL(cookie_ecn_ok);
+/* On input, sk is a listener.
+ * Output is listener if incoming packet would not create a child
+ * NULL if memory could not be allocated.
+ */
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
@@ -326,7 +329,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
if (!req)
goto out;
@@ -381,10 +384,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
- &req->rcv_wnd, &req->window_clamp,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 894da3a70aff..25300c5e283b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_recovery",
+ .data = &sysctl_tcp_recovery,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_reordering",
.data = &sysctl_tcp_reordering,
.maxlen = sizeof(int),
@@ -577,6 +584,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_min_rtt_wlen",
+ .data = &sysctl_tcp_min_rtt_wlen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8b8fa184f75..0cfa7c0c1e80 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ tp->rtt_min[0].rtt = ~0U;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -900,7 +901,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto out_err;
}
@@ -967,7 +969,8 @@ new_segment:
copied += copy;
offset += copy;
- if (!(size -= copy)) {
+ size -= copy;
+ if (!size) {
tcp_tx_timestamp(sk, skb);
goto out;
}
@@ -988,7 +991,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1111,7 +1115,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto do_error;
}
@@ -1267,7 +1272,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1767,7 +1773,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* __ Restore normal policy in scheduler __ */
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
@@ -1778,7 +1785,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
do_prequeue:
tcp_prequeue_process(sk);
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
@@ -2230,7 +2238,8 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
- if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq += tp->max_window + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
@@ -2253,13 +2262,6 @@ int tcp_disconnect(struct sock *sk, int flags)
}
EXPORT_SYMBOL(tcp_disconnect);
-void tcp_sock_destruct(struct sock *sk)
-{
- inet_sock_destruct(sk);
-
- kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
-}
-
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
@@ -2581,7 +2583,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
TCPF_LISTEN))) {
tcp_fastopen_init_key_once(true);
- err = fastopen_init_queue(sk, val);
+ fastopen_queue_tune(sk, val);
} else {
err = -EINVAL;
}
@@ -2849,10 +2851,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_FASTOPEN:
- if (icsk->icsk_accept_queue.fastopenq)
- val = icsk->icsk_accept_queue.fastopenq->max_qlen;
- else
- val = 0;
+ val = icsk->icsk_accept_queue.fastopenq.max_qlen;
break;
case TCP_TIMESTAMP:
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 7092a61c4dc8..7e538f71f5fb 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -209,7 +209,7 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
/* alpha = (1 - g) * alpha + g * F */
- alpha -= alpha >> dctcp_shift_g;
+ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
if (bytes_ecn) {
/* If dctcp_shift_g == 1, a 32bit value would overflow
* after 8 Mbytes.
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index db43c6286cf7..55be6ac70cff 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -133,18 +133,20 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
u32 end_seq;
+ bool own_req;
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ NULL, &own_req);
if (!child)
return NULL;
- spin_lock(&queue->fastopenq->lock);
- queue->fastopenq->qlen++;
- spin_unlock(&queue->fastopenq->lock);
+ spin_lock(&queue->fastopenq.lock);
+ queue->fastopenq.qlen++;
+ spin_unlock(&queue->fastopenq.lock);
/* Initialize the child socket. Have to fix some values to take
* into account the child is a Fast Open socket and is created
@@ -161,15 +163,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
/* Activate the retrans timer so that SYNACK can be retransmitted.
- * The request socket is not added to the SYN table of the parent
+ * The request socket is not added to the ehash
* because it's been added to the accept queue directly.
*/
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
- atomic_set(&req->rsk_refcnt, 1);
- /* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, child);
+ atomic_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
inet_csk(child)->icsk_af_ops->rebuild_header(child);
@@ -178,12 +178,10 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_init_metrics(child);
tcp_init_buffer_space(child);
- /* Queue the data carried in the SYN packet. We need to first
- * bump skb's refcnt because the caller will attempt to free it.
- * Note that IPv6 might also have used skb_get() trick
- * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
- * So we need to eventually get a clone of the packet,
- * before inserting it in sk_receive_queue.
+ /* Queue the data carried in the SYN packet.
+ * We used to play tricky games with skb_get().
+ * With lockless listener, it is a dead end.
+ * Do not think about it.
*
* XXX (TFO) - we honor a zero-payload TFO request for now,
* (any reason not to?) but no need to queue the skb since
@@ -191,12 +189,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
*/
end_seq = TCP_SKB_CB(skb)->end_seq;
if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
- struct sk_buff *skb2;
-
- if (unlikely(skb_shared(skb)))
- skb2 = skb_clone(skb, GFP_ATOMIC);
- else
- skb2 = skb_get(skb);
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (likely(skb2)) {
skb_dst_drop(skb2);
@@ -214,12 +207,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
}
}
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
- sk->sk_data_ready(sk);
- bh_unlock_sock(child);
- /* Note: sock_put(child) will be done by tcp_conn_request()
- * after SYNACK packet is sent.
+ /* tcp_conn_request() is sending the SYNACK,
+ * and queues the child into listener accept queue.
*/
- WARN_ON(!req->sk);
return child;
}
@@ -237,8 +227,8 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* between qlen overflow causing Fast Open to be disabled
* temporarily vs a server not supporting Fast Open at all.
*/
- fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (!fastopenq || fastopenq->max_qlen == 0)
+ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (fastopenq->max_qlen == 0)
return false;
if (fastopenq->qlen >= fastopenq->max_qlen) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4964d53907e9..fdd88c3803a6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_thin_dupack __read_mostly;
@@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
if (metric > 0)
tcp_disable_early_retrans(tp);
+ tp->rack.reord = 1;
}
/* This must be called before lost_out is incremented */
@@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
}
}
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
- struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
@@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
return !before(start_seq, end_seq - tp->max_window);
}
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt = 0;
- u32 new_low_seq = tp->snd_nxt;
- u32 received_upto = tcp_highest_sack_seq(tp);
-
- if (!tcp_is_fack(tp) || !tp->retrans_out ||
- !after(received_upto, tp->lost_retrans_low) ||
- icsk->icsk_ca_state != TCP_CA_Recovery)
- return;
-
- tcp_for_write_queue(skb, sk) {
- u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
- if (skb == tcp_send_head(sk))
- break;
- if (cnt == tp->retrans_out)
- break;
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- continue;
-
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
- continue;
-
- /* TODO: We would like to get rid of tcp_is_fack(tp) only
- * constraint here (see above) but figuring out that at
- * least tp->reordering SACK blocks reside between ack_seq
- * and received_upto is not easy task to do cheaply with
- * the available datastructures.
- *
- * Whether FACK should check here for tp->reordering segs
- * in-between one could argue for either way (it would be
- * rather simple to implement as we could count fack_count
- * during the walk and do tp->fackets_out - fack_count).
- */
- if (after(received_upto, ack_seq)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- *flag |= FLAG_LOST_RETRANS;
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
- } else {
- if (before(ack_seq, new_low_seq))
- new_low_seq = ack_seq;
- cnt += tcp_skb_pcount(skb);
- }
- }
-
- if (tp->retrans_out)
- tp->lost_retrans_low = new_low_seq;
-}
-
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una)
@@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
+ tcp_rack_advance(tp, xmit_time, sacked);
+
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
@@ -1837,7 +1776,6 @@ advance_sp:
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
- tcp_mark_lost_retrans(sk, &state->flag);
tcp_verify_left_out(tp);
out:
@@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+ return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, when);
+}
+
+/* skb is spurious retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
- (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+ tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}
/* Undo procedures. */
@@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+ tcp_rack_mark_lost(sk))
+ flag |= FLAG_LOST_RETRANS;
+
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_xmit_retransmit_queue(sk);
}
+/* Kathleen Nichols' algorithm for tracking the minimum value of
+ * a data stream over some fixed time interval. (E.g., the minimum
+ * RTT over the past five minutes.) It uses constant space and constant
+ * time per update yet almost always delivers the same minimum as an
+ * implementation that has to keep all the data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of the
+ * n'th best >= n-1'th best. It also makes sure that the three values
+ * are widely separated in the time window since that bounds the worse
+ * case error when that data is monotonically increasing over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because it
+ * has no value - the new min is <= everything else in the window by
+ * definition and it's the most recent. So we restart fresh on every new min
+ * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
+ * best.
+ */
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+{
+ const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+ struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+ struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+ u32 elapsed;
+
+ /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+ if (unlikely(rttm.rtt <= m[0].rtt))
+ m[0] = m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[1].rtt)
+ m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[2].rtt)
+ m[2] = rttm;
+
+ elapsed = now - m[0].ts;
+ if (unlikely(elapsed > wlen)) {
+ /* Passed entire window without a new min so make 2nd choice
+ * the new min & 3rd choice the new 2nd. So forth and so on.
+ */
+ m[0] = m[1];
+ m[1] = m[2];
+ m[2] = rttm;
+ if (now - m[0].ts > wlen) {
+ m[0] = m[1];
+ m[1] = rttm;
+ if (now - m[0].ts > wlen)
+ m[0] = rttm;
+ }
+ } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+ /* Passed a quarter of the window without a new min so
+ * take 2nd choice from the 2nd quarter of the window.
+ */
+ m[2] = m[1] = rttm;
+ } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+ /* Passed half the window without a new min so take the 3rd
+ * choice from the last half of the window.
+ */
+ m[2] = rttm;
+ }
+}
+
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
- long seq_rtt_us, long sack_rtt_us)
+ long seq_rtt_us, long sack_rtt_us,
+ long ca_rtt_us)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298).
*/
- if (flag & FLAG_RETRANS_DATA_ACKED)
- seq_rtt_us = -1L;
-
if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us;
@@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-
+ seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
+ tp->rx_opt.rcv_tsecr);
if (seq_rtt_us < 0)
return false;
+ /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+ * always taken together with ACK, SACK, or TS-opts. Any negative
+ * values will be skipped with the seq_rtt_us < 0 check above.
+ */
+ tcp_update_rtt_min(sk, ca_rtt_us);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
@@ -2964,7 +2985,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
}
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L);
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
}
@@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
+ else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now);
- if (likely(first_ackt.v64)) {
+ if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
}
@@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
}
- rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+ ca_rtt_us);
if (flag & FLAG_ACKED) {
tcp_rearm_rto(sk);
@@ -5472,7 +5496,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
}
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5698,11 +5722,11 @@ reset_and_undo:
* address independent.
*/
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
bool acceptable;
@@ -5749,7 +5773,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
case TCP_SYN_SENT:
- queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -6022,7 +6046,7 @@ static void tcp_openreq_init(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
- req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
@@ -6042,9 +6066,11 @@ static void tcp_openreq_init(struct request_sock *req,
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener)
+ struct sock *sk_listener,
+ bool attach_listener)
{
- struct request_sock *req = reqsk_alloc(ops, sk_listener);
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
@@ -6064,13 +6090,13 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(struct sock *sk,
+static bool tcp_syn_flood_action(const struct sock *sk,
const struct sk_buff *skb,
const char *proto)
{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
- struct listen_sock *lopt;
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
@@ -6081,12 +6107,12 @@ static bool tcp_syn_flood_action(struct sock *sk,
#endif
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
- if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
- lopt->synflood_warned = 1;
+ if (!queue->synflood_warned &&
+ sysctl_tcp_syncookies != 2 &&
+ xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
- }
+
return want_cookie;
}
@@ -6120,8 +6146,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct request_sock *req;
bool want_cookie = false;
struct flowi fl;
- int err;
-
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
@@ -6145,7 +6169,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops, sk);
+ req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
goto drop;
@@ -6230,21 +6254,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
- if (!want_cookie)
+ if (!want_cookie) {
+ tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
- err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req,
- skb_get_queue_mapping(skb), &foc);
+ }
if (fastopen_sk) {
+ af_ops->send_synack(fastopen_sk, dst, &fl, req,
+ &foc, false);
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
} else {
- if (err || want_cookie)
- goto drop_and_free;
-
tcp_rsk(req)->tfo_listener = false;
- af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ if (!want_cookie)
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ af_ops->send_synack(sk, dst, &fl, req,
+ &foc, !want_cookie);
+ if (want_cookie)
+ goto drop_and_free;
}
- tcp_reqsk_record_syn(sk, req, skb);
-
+ reqsk_put(req);
return 0;
drop_and_release:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a23ba7daecbf..1c2648bbac4b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -324,7 +324,6 @@ void tcp_req_err(struct sock *sk, u32 seq)
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- reqsk_put(req);
} else {
/*
* Still in SYN_RECV, just remove it silently.
@@ -332,9 +331,10 @@ void tcp_req_err(struct sock *sk, u32 seq)
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
}
+ reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
@@ -576,7 +576,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
@@ -795,7 +795,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
inet_twsk_put(tw);
}
-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
@@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
*/
tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
0,
@@ -821,8 +821,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
- u16 queue_mapping,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -833,12 +833,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc);
+ skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- skb_set_queue_mapping(skb, queue_mapping);
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
ireq->opt);
@@ -1112,10 +1111,13 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+#endif
+
/* Called with rcu_read_lock() */
-static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb)
{
+#ifdef CONFIG_TCP_MD5SIG
/*
* This gets called for each TCP segment that arrives
* so we want to be efficient.
@@ -1165,8 +1167,9 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk,
return true;
}
return false;
-}
#endif
+ return false;
+}
static void tcp_v4_init_req(struct request_sock *req,
const struct sock *sk_listener,
@@ -1180,7 +1183,8 @@ static void tcp_v4_init_req(struct request_sock *req,
ireq->opt = tcp_v4_save_options(skb);
}
-static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct flowi *fl,
const struct request_sock *req,
bool *strict)
{
@@ -1219,7 +1223,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_sequence,
.send_synack = tcp_v4_send_synack,
- .queue_hash_add = inet_csk_reqsk_queue_hash_add,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1242,9 +1245,11 @@ EXPORT_SYMBOL(tcp_v4_conn_request);
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
-struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst)
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
@@ -1320,7 +1325,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- __inet_hash_nolisten(newsk, NULL);
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
return newsk;
@@ -1338,34 +1343,11 @@ put_and_exit:
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
+#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
- const struct iphdr *iph = ip_hdr(skb);
- struct request_sock *req;
- struct sock *nsk;
-
- req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
- if (req) {
- nsk = tcp_check_req(sk, skb, req, false);
- if (!nsk || nsk == sk)
- reqsk_put(req);
- return nsk;
- }
-
- nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
- th->source, iph->daddr, th->dest, inet_iif(skb));
-
- if (nsk) {
- if (nsk->sk_state != TCP_TIME_WAIT) {
- bh_lock_sock(nsk);
- return nsk;
- }
- inet_twsk_put(inet_twsk(nsk));
- return NULL;
- }
-#ifdef CONFIG_SYN_COOKIES
if (!th->syn)
sk = cookie_v4_check(sk, skb);
#endif
@@ -1373,7 +1355,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
}
/* The socket must have it's spinlock held when we get
- * here.
+ * here, unless it is a TCP_LISTEN socket.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
@@ -1404,13 +1386,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
- struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+ struct sock *nsk = tcp_v4_cookie_check(sk, skb);
+
if (!nsk)
goto discard;
-
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
- sk_mark_napi_id(sk, skb);
+ sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
@@ -1420,7 +1402,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
+ if (tcp_rcv_state_process(sk, skb)) {
rsk = sk;
goto reset;
}
@@ -1590,6 +1572,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
+lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1598,6 +1581,33 @@ process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+ struct sock *nsk = NULL;
+
+ sk = req->rsk_listener;
+ if (tcp_v4_inbound_md5_hash(sk, skb))
+ goto discard_and_relse;
+ if (likely(sk->sk_state == TCP_LISTEN)) {
+ nsk = tcp_check_req(sk, skb, req, false);
+ } else {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ if (!nsk) {
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (nsk == sk) {
+ sock_hold(sk);
+ reqsk_put(req);
+ } else if (tcp_child_process(sk, nsk, skb)) {
+ tcp_v4_send_reset(nsk, skb);
+ goto discard_it;
+ } else {
+ return 0;
+ }
+ }
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
@@ -1606,25 +1616,23 @@ process:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * We really want to reject the packet as early as possible
- * if:
- * o We're expecting an MD5'd packet and this is no MD5 tcp option
- * o There is an MD5 option and we're not expecting one
- */
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard_and_relse;
-#endif
nf_reset(skb);
if (sk_filter(sk, skb))
goto discard_and_relse;
- sk_incoming_cpu_update(sk);
skb->dev = NULL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = tcp_v4_do_rcv(sk, skb);
+ goto put_and_return;
+ }
+
+ sk_incoming_cpu_update(sk);
+
bh_lock_sock_nested(sk);
tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0;
@@ -1639,6 +1647,7 @@ process:
}
bh_unlock_sock(sk);
+put_and_return:
sock_put(sk);
return ret;
@@ -1833,35 +1842,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- if (st->state == TCP_SEQ_STATE_OPENREQ) {
- struct request_sock *req = cur;
-
- icsk = inet_csk(st->syn_wait_sk);
- req = req->dl_next;
- while (1) {
- while (req) {
- if (req->rsk_ops->family == st->family) {
- cur = req;
- goto out;
- }
- req = req->dl_next;
- }
- if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
- break;
-get_req:
- req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
- }
- sk = sk_nulls_next(st->syn_wait_sk);
- st->state = TCP_SEQ_STATE_LISTENING;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- } else {
- icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue))
- goto start_req;
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- sk = sk_nulls_next(sk);
- }
+ sk = sk_nulls_next(sk);
get_sk:
sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
@@ -1871,16 +1852,6 @@ get_sk:
goto out;
}
icsk = inet_csk(sk);
- spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
-start_req:
- st->uid = sock_i_uid(sk);
- st->syn_wait_sk = sk;
- st->state = TCP_SEQ_STATE_OPENREQ;
- st->sbucket = 0;
- goto get_req;
- }
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
@@ -2012,7 +1983,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
void *rc = NULL;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
if (st->bucket >= INET_LHTABLE_SIZE)
break;
@@ -2071,7 +2041,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
@@ -2096,11 +2065,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
struct tcp_iter_state *st = seq->private;
switch (st->state) {
- case TCP_SEQ_STATE_OPENREQ:
- if (v) {
- struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
- spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
- }
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
@@ -2154,7 +2118,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
- struct seq_file *f, int i, kuid_t uid)
+ struct seq_file *f, int i)
{
const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->rsk_timer.expires - jiffies;
@@ -2171,7 +2135,8 @@ static void get_openreq4(const struct request_sock *req,
1, /* timers active (only the expire timer) */
jiffies_delta_to_clock_t(delta),
req->num_timeout,
- from_kuid_munged(seq_user_ns(f), uid),
+ from_kuid_munged(seq_user_ns(f),
+ sock_i_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
@@ -2185,7 +2150,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct inet_sock *inet = inet_sk(sk);
- struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+ const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
__be32 dest = inet->inet_daddr;
__be32 src = inet->inet_rcv_saddr;
__u16 destp = ntohs(inet->inet_dport);
@@ -2272,18 +2237,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
}
st = seq->private;
- switch (st->state) {
- case TCP_SEQ_STATE_LISTENING:
- case TCP_SEQ_STATE_ESTABLISHED:
- if (sk->sk_state == TCP_TIME_WAIT)
- get_timewait4_sock(v, seq, st->num);
- else
- get_tcp4_sock(v, seq, st->num);
- break;
- case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(v, seq, st->num, st->uid);
- break;
- }
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ get_openreq4(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
out:
seq_pad(seq, '\n');
return 0;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e4fe62b6b106..3575dd1e5b67 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -381,18 +381,18 @@ void tcp_openreq_init_rwin(struct request_sock *req,
window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
- req->window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > full_space || req->window_clamp == 0))
- req->window_clamp = full_space;
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
+ &req->rsk_rcv_wnd,
+ &req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
@@ -441,7 +441,9 @@ EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
* Actually, we could lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(const struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
@@ -468,6 +470,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ newtp->rtt_min[0].rtt = ~0U;
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -510,9 +513,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
if (sysctl_tcp_fack)
tcp_enable_fack(newtp);
}
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_ssthresh = req->rcv_wnd;
- newtp->rcv_wnd = req->rcv_wnd;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
@@ -545,6 +548,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
+ newtp->rack.mstamp.v64 = 0;
+ newtp->rack.advanced = 0;
newtp->saved_syn = req->saved_syn;
req->saved_syn = NULL;
@@ -575,8 +580,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
bool paws_reject = false;
-
- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+ bool own_req;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
@@ -707,7 +711,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(sk, skb, req);
@@ -764,17 +768,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*/
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+ req, &own_req);
if (!child)
goto listen_overflow;
+ sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req);
- inet_csk_reqsk_queue_drop(sk, req);
- inet_csk_reqsk_queue_add(sk, req, child);
- /* Warning: caller must not call reqsk_put(req);
- * child stole last reference on it.
- */
- return child;
+ return inet_csk_complete_hashdance(sk, child, req, own_req);
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
@@ -821,8 +822,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int state = child->sk_state;
if (!sock_owned_by_user(child)) {
- ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
- skb->len);
+ ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
parent->sk_data_ready(parent);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 09bb082ca1a7..f4f9793eb025 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
- if (!tp->retrans_out)
- tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
@@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- /* snd_nxt is stored to detect loss of retransmitted segment,
- * see tcp_input.c tcp_sacktag_write_queue().
- */
- TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
} else if (err != -EBUSY) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
@@ -2947,7 +2941,8 @@ int tcp_send_synack(struct sock *sk)
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc,
+ bool attach_req)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2959,11 +2954,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
u16 user_mss;
int mss;
- /* sk is a const pointer, because we want to express multiple cpus
- * might call us concurrently.
- * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way.
- */
- skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -2971,6 +2962,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
+ if (attach_req) {
+ skb->destructor = sock_edemux;
+ sock_hold(req_to_sk(req));
+ skb->sk = req_to_sk(req);
+ } else {
+ /* sk is a const pointer, because we want to express multiple
+ * cpu might call us concurrently.
+ * sk->sk_wmem_alloc in an atomic, we can promote to rw.
+ */
+ skb_set_owner_w(skb, (struct sock *)sk);
+ }
skb_dst_set(skb, dst);
mss = dst_metric_advmss(dst);
@@ -3015,7 +3017,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
- th->window = htons(min(req->rcv_wnd, 65535U));
+ th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
@@ -3408,7 +3410,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
skb_mstamp_get(&skb->skb_mstamp);
- NET_INC_STATS_BH(sock_net(sk), mib);
+ NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
@@ -3510,7 +3512,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000000..5353085fd0b2
--- /dev/null
+++ b/net/ipv4/tcp_recovery.c
@@ -0,0 +1,109 @@
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+ return 0;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+
+ /* To be more reordering resilient, allow min_rtt/4 settling delay
+ * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+ * RTT because reordering is often a path property and less related
+ * to queuing or delayed ACKs.
+ *
+ * TODO: measure and adapt to the observed reordering delay, and
+ * use a timer to retransmit like the delayed early retransmit.
+ */
+ reo_wnd = 1000;
+ if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+ tcp_for_write_queue(skb, sk) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* Skip ones already (s)acked */
+ if (!after(scb->end_seq, tp->snd_una) ||
+ scb->sacked & TCPCB_SACKED_ACKED)
+ continue;
+
+ if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+ if (skb_mstamp_us_delta(&tp->rack.mstamp,
+ &skb->skb_mstamp) <= reo_wnd)
+ continue;
+
+ /* skb is lost if packet sent later is sacked */
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (scb->sacked & TCPCB_SACKED_RETRANS) {
+ scb->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+ } else if (!(scb->sacked & TCPCB_RETRANS)) {
+ /* Original data are sent sequentially so stop early
+ * b/c the rest are all sent after rack_sent
+ */
+ break;
+ }
+ }
+ return prior_retrans - tp->retrans_out;
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets */
+void tcp_rack_advance(struct tcp_sock *tp,
+ const struct skb_mstamp *xmit_time, u8 sacked)
+{
+ if (tp->rack.mstamp.v64 &&
+ !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ return;
+
+ if (sacked & TCPCB_RETRANS) {
+ struct skb_mstamp now;
+
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * the aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ skb_mstamp_get(&now);
+ if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ return;
+ }
+
+ tp->rack.mstamp = *xmit_time;
+ tp->rack.advanced = 1;
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7149ebc820c7..c9c716a483e4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
}
/* Calculate maximal number or retries on an orphaned socket. */
-static int tcp_orphan_retries(struct sock *sk, int alive)
+static int tcp_orphan_retries(struct sock *sk, bool alive)
{
int retries = sysctl_tcp_orphan_retries; /* May be zero. */
@@ -184,7 +184,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = icsk->icsk_rto < TCP_RTO_MAX;
+ const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -298,7 +298,7 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+ const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f7d1d5e19e95..24ec14f9825c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
-
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
return score;
}
@@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
+
return score;
}
@@ -1017,30 +1021,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- /* unconnected socket. If output device is enslaved to a VRF
- * device lookup source address from VRF table. This mimics
- * behavior of ip_route_connect{_init}.
- */
- if (netif_index_is_vrf(net, ipc.oif)) {
- flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
- (flow_flags | FLOWI_FLAG_VRFSRC |
- FLOWI_FLAG_SKIP_NH_OIF),
- faddr, saddr, dport,
- inet->inet_sport);
-
- rt = ip_route_output_flow(net, fl4, sk);
- if (!IS_ERR(rt)) {
- saddr = fl4->saddr;
- ip_rt_put(rt);
- }
- }
-
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
faddr, saddr, dport, inet->inet_sport);
+ if (!saddr && ipc.oif)
+ l3mdev_get_saddr(net, ipc.oif, fl4);
+
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index cd6be736e19f..7ee6518afa86 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -30,6 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
mtu = dst_mtu(skb_dst(skb));
if (skb->len > mtu) {
+ skb->protocol = htons(ETH_P_IP);
+
if (skb->sk)
xfrm_local_error(skb, mtu);
else
@@ -87,17 +89,15 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_NETFILTER
if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
return x->outer_mode->afinfo->output_finish(sk, skb);
}
-int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
-
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, NULL, skb_dst(skb)->dev,
__xfrm4_output,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 269b137c87ec..1e0c3c835a63 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -15,7 +15,7 @@
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/vrf.h>
+#include <net/l3mdev.h>
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
@@ -111,10 +111,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
struct flowi4 *fl4 = &fl->u.ip4;
int oif = 0;
- if (skb_dst(skb)) {
- oif = vrf_master_ifindex(skb_dst(skb)->dev) ?
- : skb_dst(skb)->dev->ifindex;
- }
+ if (skb_dst(skb))
+ oif = l3mdev_fib_oif(skb_dst(skb)->dev);
memset(fl4, 0, sizeof(struct flowi4));
fl4->flowi4_mark = skb->mark;