-rw-r--r--  include/linux/rtnetlink.h           |   3
-rw-r--r--  include/linux/tcp.h                 |   1
-rw-r--r--  include/net/dst.h                   |   6
-rw-r--r--  include/net/flow.h                  |   5
-rw-r--r--  include/net/inet_connection_sock.h  |   1
-rw-r--r--  include/net/inet_sock.h             |   2
-rw-r--r--  include/net/inetpeer.h              |   8
-rw-r--r--  include/net/netns/ipv4.h            |   3
-rw-r--r--  include/net/route.h                 |  61
-rw-r--r--  include/net/tcp.h                   |   9
-rw-r--r--  net/core/rtnetlink.c                |   4
-rw-r--r--  net/decnet/dn_route.c               |  13
-rw-r--r--  net/ipv4/Makefile                   |   2
-rw-r--r--  net/ipv4/fib_semantics.c            |   2
-rw-r--r--  net/ipv4/icmp.c                     |   3
-rw-r--r--  net/ipv4/inet_connection_sock.c     |   2
-rw-r--r--  net/ipv4/inetpeer.c                 |   4
-rw-r--r--  net/ipv4/route.c                    | 349
-rw-r--r--  net/ipv4/tcp.c                      |   2
-rw-r--r--  net/ipv4/tcp_input.c                | 188
-rw-r--r--  net/ipv4/tcp_ipv4.c                 |  46
-rw-r--r--  net/ipv4/tcp_metrics.c              | 697
-rw-r--r--  net/ipv4/tcp_minisocks.c            |  62
-rw-r--r--  net/ipv4/xfrm4_policy.c             |   8
-rw-r--r--  net/ipv6/icmp.c                     |   4
-rw-r--r--  net/ipv6/ip6_output.c               |  10
-rw-r--r--  net/ipv6/ndisc.c                    |   8
-rw-r--r--  net/ipv6/route.c                    |  16
-rw-r--r--  net/ipv6/tcp_ipv6.c                 |  49
29 files changed, 837 insertions, 731 deletions
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index ea60b0854109..db71c4ad8624 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -619,8 +619,7 @@ extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
- u32 id, u32 ts, u32 tsage, long expires,
- u32 error);
+ u32 id, long expires, u32 error);
extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7d3bcedc062a..2de9cf46f9fc 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -506,7 +506,6 @@ struct tcp_timewait_sock {
u32 tw_rcv_wnd;
u32 tw_ts_recent;
long tw_ts_recent_stamp;
- struct inet_peer *tw_peer;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
#endif
diff --git a/include/net/dst.h b/include/net/dst.h
index b2634e446613..51610468c63d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -209,12 +209,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr
return msecs_to_jiffies(dst_metric(dst, metric));
}
-static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
- unsigned long rtt)
-{
- dst_metric_set(dst, metric, jiffies_to_msecs(rtt));
-}
-
static inline u32
dst_allfrag(const struct dst_entry *dst)
{
diff --git a/include/net/flow.h b/include/net/flow.h
index bd524f598561..ce9cb7656b47 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -20,9 +20,8 @@ struct flowi_common {
__u8 flowic_proto;
__u8 flowic_flags;
#define FLOWI_FLAG_ANYSRC 0x01
-#define FLOWI_FLAG_PRECOW_METRICS 0x02
-#define FLOWI_FLAG_CAN_SLEEP 0x04
-#define FLOWI_FLAG_RT_NOCACHE 0x08
+#define FLOWI_FLAG_CAN_SLEEP 0x02
+#define FLOWI_FLAG_RT_NOCACHE 0x04
__u32 flowic_secid;
};
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index af3c743a40e4..291e7cee14e7 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -43,7 +43,6 @@ struct inet_connection_sock_af_ops {
struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst);
- struct inet_peer *(*get_peer)(struct sock *sk);
u16 net_header_len;
u16 net_frag_header_len;
u16 sockaddr_len;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ae17e1352d7e..924d7b98ab60 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -245,8 +245,6 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
flags |= FLOWI_FLAG_ANYSRC;
- if (sk->sk_protocol == IPPROTO_TCP)
- flags |= FLOWI_FLAG_PRECOW_METRICS;
return flags;
}
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index c27c8f10ebdc..53f464d7cddc 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -36,25 +36,19 @@ struct inet_peer {
u32 metrics[RTAX_MAX];
u32 rate_tokens; /* rate limiting for ICMP */
unsigned long rate_last;
- unsigned long pmtu_expires;
- u32 pmtu_orig;
- u32 pmtu_learned;
- struct inetpeer_addr_base redirect_learned;
union {
struct list_head gc_list;
struct rcu_head gc_rcu;
};
/*
* Once inet_peer is queued for deletion (refcnt == -1), following fields
- * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp
+ * are not available: rid, ip_id_count
* We can share memory with rcu_head to help keep inet_peer small.
*/
union {
struct {
atomic_t rid; /* Frag reception counter */
atomic_t ip_id_count; /* IP ID for the next packet */
- __u32 tcp_ts;
- __u32 tcp_ts_stamp;
};
struct rcu_head rcu;
struct inet_peer *gc_next;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 599e48fa97cb..2e089a99d603 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -7,6 +7,7 @@
#include <net/inet_frag.h>
+struct tcpm_hash_bucket;
struct ctl_table_header;
struct ipv4_devconf;
struct fib_rules_ops;
@@ -39,6 +40,8 @@ struct netns_ipv4 {
struct sock **icmp_sk;
struct sock *tcp_sock;
struct inet_peer_base *peers;
+ struct tcpm_hash_bucket *tcp_metrics_hash;
+ unsigned int tcp_metrics_hash_mask;
struct netns_frags frags;
#ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
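Note on the new fields: tcp_metrics_hash_mask is applied as "hash & mask"
in tcp_metrics.c below, which assumes the slot count is a power of two,
and the boot-time override (tcpmhash_entries=, parsed later in this
patch) is not validated. A hedged sketch of a defensive initialization,
not part of this patch, using roundup_pow_of_two() from <linux/log2.h>:

	slots = roundup_pow_of_two(slots);	/* "hash & mask" needs 2^n slots */
	net->ipv4.tcp_metrics_hash = kzalloc(slots * sizeof(struct tcpm_hash_bucket),
					     GFP_KERNEL);
	if (!net->ipv4.tcp_metrics_hash)
		return -ENOMEM;
	net->ipv4.tcp_metrics_hash_mask = slots - 1;
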
diff --git a/include/net/route.h b/include/net/route.h
index 211e2665139b..52362368af09 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -40,7 +40,6 @@
#define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
struct fib_nh;
-struct inet_peer;
struct fib_info;
struct rtable {
struct dst_entry dst;
@@ -65,45 +64,10 @@ struct rtable {
__be32 rt_gateway;
/* Miscellaneous cached information */
- u32 rt_peer_genid;
- unsigned long _peer; /* long-living peer info */
+ u32 rt_pmtu;
struct fib_info *fi; /* for client ref to shared metrics */
};
-static inline struct inet_peer *rt_peer_ptr(struct rtable *rt)
-{
- return inetpeer_ptr(rt->_peer);
-}
-
-static inline bool rt_has_peer(struct rtable *rt)
-{
- return inetpeer_ptr_is_peer(rt->_peer);
-}
-
-static inline void __rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
- __inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline bool rt_set_peer(struct rtable *rt, struct inet_peer *peer)
-{
- return inetpeer_ptr_set_peer(&rt->_peer, peer);
-}
-
-static inline void rt_init_peer(struct rtable *rt, struct inet_peer_base *base)
-{
- inetpeer_init_ptr(&rt->_peer, base);
-}
-
-static inline void rt_transfer_peer(struct rtable *rt, struct rtable *ort)
-{
- rt->_peer = ort->_peer;
- if (rt_has_peer(ort)) {
- struct inet_peer *peer = rt_peer_ptr(ort);
- atomic_inc(&peer->refcnt);
- }
-}
-
static inline bool rt_is_input_route(const struct rtable *rt)
{
return rt->rt_route_iif != 0;
@@ -278,8 +242,6 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
if (inet_sk(sk)->transparent)
flow_flags |= FLOWI_FLAG_ANYSRC;
- if (protocol == IPPROTO_TCP)
- flow_flags |= FLOWI_FLAG_PRECOW_METRICS;
if (can_sleep)
flow_flags |= FLOWI_FLAG_CAN_SLEEP;
@@ -328,27 +290,6 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable
return rt;
}
-extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create);
-
-static inline struct inet_peer *__rt_get_peer(struct rtable *rt, __be32 daddr, int create)
-{
- if (rt_has_peer(rt))
- return rt_peer_ptr(rt);
-
- rt_bind_peer(rt, daddr, create);
- return (rt_has_peer(rt) ? rt_peer_ptr(rt) : NULL);
-}
-
-static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr)
-{
- return __rt_get_peer(rt, daddr, 0);
-}
-
-static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr)
-{
- return __rt_get_peer(rt, daddr, 1);
-}
-
static inline int inet_iif(const struct sk_buff *skb)
{
return skb_rtable(skb)->rt_iif;
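With the rt_bind_peer()/rt_get_peer() helpers above removed, code that
still needs an inet_peer takes a short-lived reference straight from the
per-namespace base and drops it when done. A minimal sketch of the
pattern the icmp.c and route.c hunks below switch to (daddr and net stand
in for whatever the caller has at hand):

	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, daddr, 1);	/* 1 == create */
	if (peer) {
		/* consult or update peer->rate_tokens, peer->rate_last, ... */
		inet_putpeer(peer);				/* drop the reference */
	}
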
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 53fb7d814170..3618fefae049 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -388,6 +388,13 @@ extern void tcp_enter_frto(struct sock *sk);
extern void tcp_enter_loss(struct sock *sk, int how);
extern void tcp_clear_retrans(struct tcp_sock *tp);
extern void tcp_update_metrics(struct sock *sk);
+extern void tcp_init_metrics(struct sock *sk);
+extern void tcp_metrics_init(void);
+extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
+extern bool tcp_remember_stamp(struct sock *sk);
+extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
+extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
+extern void tcp_disable_fack(struct tcp_sock *tp);
extern void tcp_close(struct sock *sk, long timeout);
extern void tcp_init_sock(struct sock *sk);
extern unsigned int tcp_poll(struct file * file, struct socket *sock,
@@ -556,6 +563,8 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
return (tp->srtt >> 3) + tp->rttvar;
}
+extern void tcp_set_rto(struct sock *sk);
+
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2b325c340b44..64127eee786d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -615,7 +615,7 @@ nla_put_failure:
EXPORT_SYMBOL(rtnetlink_put_metrics);
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
- u32 ts, u32 tsage, long expires, u32 error)
+ long expires, u32 error)
{
struct rta_cacheinfo ci = {
.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
@@ -623,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
.rta_clntref = atomic_read(&(dst->__refcnt)),
.rta_error = error,
.rta_id = id,
- .rta_ts = ts,
- .rta_tsage = tsage,
};
if (expires)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6e74b3f110bc..b5594cc73ee1 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1590,7 +1590,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
goto errout;
expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
- if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires,
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires,
rt->dst.error) < 0)
goto errout;
@@ -1812,12 +1812,11 @@ static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
- rt->dst.dev ? rt->dst.dev->name : "*",
- dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
- dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
- atomic_read(&rt->dst.__refcnt),
- rt->dst.__use,
- (int) dst_metric(&rt->dst, RTAX_RTT));
+ rt->dst.dev ? rt->dst.dev->name : "*",
+ dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
+ dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
+ atomic_read(&rt->dst.__refcnt),
+ rt->dst.__use, 0);
return 0;
}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3bbcd6a..5a23e8b37106 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o \
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ae301c897a19..d71bfbdc0bf4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -794,6 +794,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
val = nla_get_u32(nla);
if (type == RTAX_ADVMSS && val > 65535 - 40)
val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
fi->fib_metrics[type - 1] = val;
}
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4bce5a2830aa..4a049449305f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -254,9 +254,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
/* Limit if icmp type is enabled in ratemask. */
if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
- struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr);
+ struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
rc = inet_peer_xrlim_allow(peer,
net->ipv4.sysctl_icmp_ratelimit);
+ inet_putpeer(peer);
}
out:
return rc;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 034ddbe42adf..76825be3b643 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -375,7 +375,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
const struct inet_request_sock *ireq = inet_rsk(req);
struct ip_options_rcu *opt = inet_rsk(req)->opt;
struct net *net = sock_net(sk);
- int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS;
+ int flags = inet_sk_flowi_flags(sk);
if (nocache)
flags |= FLOWI_FLAG_RT_NOCACHE;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index da90a8cab614..e1e0a4e8fd34 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -508,13 +508,9 @@ relookup:
(daddr->family == AF_INET) ?
secure_ip_id(daddr->addr.a4) :
secure_ipv6_id(daddr->addr.a6));
- p->tcp_ts_stamp = 0;
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
p->rate_last = 0;
- p->pmtu_expires = 0;
- p->pmtu_orig = 0;
- memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
INIT_LIST_HEAD(&p->gc_list);
/* Link the node. */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 72e88c208025..95bfa1ba5b28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -158,34 +158,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
- struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer;
- u32 *p = NULL;
-
- peer = rt_get_peer_create(rt, rt->rt_dst);
- if (peer) {
- u32 *old_p = __DST_METRICS_PTR(old);
- unsigned long prev, new;
-
- p = peer->metrics;
- if (inet_metrics_new(peer))
- memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
-
- new = (unsigned long) p;
- prev = cmpxchg(&dst->_metrics, old, new);
-
- if (prev != old) {
- p = __DST_METRICS_PTR(prev);
- if (prev & DST_METRICS_READ_ONLY)
- p = NULL;
- } else {
- if (rt->fi) {
- fib_info_put(rt->fi);
- rt->fi = NULL;
- }
- }
- }
- return p;
+ WARN_ON(1);
+ return NULL;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -423,18 +397,16 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
int len;
seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
- "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->dst.dev ? r->dst.dev->name : "*",
- (__force u32)r->rt_dst,
- (__force u32)r->rt_gateway,
- r->rt_flags, atomic_read(&r->dst.__refcnt),
- r->dst.__use, 0, (__force u32)r->rt_src,
- dst_metric_advmss(&r->dst) + 40,
- dst_metric(&r->dst, RTAX_WINDOW),
- (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
- dst_metric(&r->dst, RTAX_RTTVAR)),
- r->rt_key_tos,
- -1, 0, 0, &len);
+ "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+ r->dst.dev ? r->dst.dev->name : "*",
+ (__force u32)r->rt_dst,
+ (__force u32)r->rt_gateway,
+ r->rt_flags, atomic_read(&r->dst.__refcnt),
+ r->dst.__use, 0, (__force u32)r->rt_src,
+ dst_metric_advmss(&r->dst) + 40,
+ dst_metric(&r->dst, RTAX_WINDOW), 0,
+ r->rt_key_tos,
+ -1, 0, 0, &len);
seq_printf(seq, "%*s\n", 127 - len, "");
}
@@ -671,7 +643,7 @@ static inline int rt_fast_clean(struct rtable *rth)
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
+ rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -917,7 +889,6 @@ static void rt_cache_invalidate(struct net *net)
get_random_bytes(&shuffle, sizeof(shuffle));
atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
- inetpeer_invalidate_family(AF_INET);
}
/*
@@ -1244,31 +1215,6 @@ skip_hashing:
return rt;
}
-static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt_peer_genid(void)
-{
- return atomic_read(&__rt_peer_genid);
-}
-
-void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
-{
- struct inet_peer_base *base;
- struct inet_peer *peer;
-
- base = inetpeer_base_ptr(rt->_peer);
- if (!base)
- return;
-
- peer = inet_getpeer_v4(base, daddr, create);
- if (peer) {
- if (!rt_set_peer(rt, peer))
- inet_putpeer(peer);
- else
- rt->rt_peer_genid = rt_peer_genid();
- }
-}
-
/*
* Peer allocation may fail only in serious out-of-memory conditions. However
* we still can generate some output.
@@ -1291,20 +1237,15 @@ static void ip_select_fb_ident(struct iphdr *iph)
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
- struct rtable *rt = (struct rtable *) dst;
-
- if (rt && !(rt->dst.flags & DST_NOPEER)) {
- struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
+ struct net *net = dev_net(dst->dev);
+ struct inet_peer *peer;
- /* If peer is attached to destination, it is never detached,
- so that we need not to grab a lock to dereference it.
- */
- if (peer) {
- iph->id = htons(inet_getid(peer, more));
- return;
- }
- } else if (!rt)
- pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
+ peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
+ if (peer) {
+ iph->id = htons(inet_getid(peer, more));
+ inet_putpeer(peer);
+ return;
+ }
ip_select_fb_ident(iph);
}
@@ -1330,30 +1271,6 @@ static void rt_del(unsigned int hash, struct rtable *rt)
spin_unlock_bh(rt_hash_lock_addr(hash));
}
-static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
-{
- struct rtable *rt = (struct rtable *) dst;
- __be32 orig_gw = rt->rt_gateway;
- struct neighbour *n;
-
- dst_confirm(&rt->dst);
-
- rt->rt_gateway = peer->redirect_learned.a4;
-
- n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway);
- if (!n) {
- rt->rt_gateway = orig_gw;
- return;
- }
- if (!(n->nud_state & NUD_VALID)) {
- neigh_event_send(n, NULL);
- } else {
- rt->rt_flags |= RTCF_REDIRECTED;
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
- }
- neigh_release(n);
-}
-
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
@@ -1362,7 +1279,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
struct in_device *in_dev = __in_dev_get_rcu(dev);
__be32 skeys[2] = { saddr, 0 };
int ikeys[2] = { dev->ifindex, 0 };
- struct inet_peer *peer;
struct net *net;
if (!in_dev)
@@ -1395,6 +1311,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rthp = &rt_hash_table[hash].chain;
while ((rt = rcu_dereference(*rthp)) != NULL) {
+ struct neighbour *n;
+
rthp = &rt->dst.rt_next;
if (rt->rt_key_dst != daddr ||
@@ -1408,13 +1326,16 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rt->rt_gateway != old_gw)
continue;
- peer = rt_get_peer_create(rt, rt->rt_dst);
- if (peer) {
- if (peer->redirect_learned.a4 != new_gw) {
- peer->redirect_learned.a4 = new_gw;
- atomic_inc(&__rt_peer_genid);
+ n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+ if (n) {
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ } else {
+ rt->rt_gateway = new_gw;
+ rt->rt_flags |= RTCF_REDIRECTED;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
- check_peer_redir(&rt->dst, peer);
+ neigh_release(n);
}
}
}
@@ -1432,23 +1353,6 @@ reject_redirect:
;
}
-static bool peer_pmtu_expired(struct inet_peer *peer)
-{
- unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
- return orig &&
- time_after_eq(jiffies, orig) &&
- cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
-static bool peer_pmtu_cleaned(struct inet_peer *peer)
-{
- unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
-
- return orig &&
- cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
-}
-
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
struct rtable *rt = (struct rtable *)dst;
@@ -1458,16 +1362,13 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
- } else if (rt->rt_flags & RTCF_REDIRECTED) {
+ } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->dst.expires) {
unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
rt->rt_oif,
rt_genid(dev_net(dst->dev)));
rt_del(hash, rt);
ret = NULL;
- } else if (rt_has_peer(rt)) {
- struct inet_peer *peer = rt_peer_ptr(rt);
- if (peer_pmtu_expired(peer))
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
}
return ret;
@@ -1494,6 +1395,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
struct rtable *rt = skb_rtable(skb);
struct in_device *in_dev;
struct inet_peer *peer;
+ struct net *net;
int log_martians;
rcu_read_lock();
@@ -1505,7 +1407,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
rcu_read_unlock();
- peer = rt_get_peer_create(rt, rt->rt_dst);
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
if (!peer) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
return;
@@ -1522,7 +1425,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
*/
if (peer->rate_tokens >= ip_rt_redirect_number) {
peer->rate_last = jiffies;
- return;
+ goto out_put_peer;
}
/* Check for load limit; set rate_last to the latest sent
@@ -1543,6 +1446,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
&rt->rt_dst, &rt->rt_gateway);
#endif
}
+out_put_peer:
+ inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
@@ -1585,7 +1490,7 @@ static int ip_error(struct sk_buff *skb)
break;
}
- peer = rt_get_peer_create(rt, rt->rt_dst);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
send = true;
if (peer) {
@@ -1598,6 +1503,7 @@ static int ip_error(struct sk_buff *skb)
peer->rate_tokens -= ip_rt_error_cost;
else
send = false;
+ inet_putpeer(peer);
}
if (send)
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -1606,50 +1512,17 @@ out: kfree_skb(skb);
return 0;
}
-static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
-{
- unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
-
- if (!expires)
- return;
- if (time_before(jiffies, expires)) {
- u32 orig_dst_mtu = dst_mtu(dst);
- if (peer->pmtu_learned < orig_dst_mtu) {
- if (!peer->pmtu_orig)
- peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
- }
- } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
- dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
-}
-
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
struct rtable *rt = (struct rtable *) dst;
- struct inet_peer *peer;
dst_confirm(dst);
- peer = rt_get_peer_create(rt, rt->rt_dst);
- if (peer) {
- unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
-
- if (mtu < ip_rt_min_pmtu)
- mtu = ip_rt_min_pmtu;
- if (!pmtu_expires || mtu < peer->pmtu_learned) {
-
- pmtu_expires = jiffies + ip_rt_mtu_expires;
- if (!pmtu_expires)
- pmtu_expires = 1UL;
-
- peer->pmtu_learned = mtu;
- peer->pmtu_expires = pmtu_expires;
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
- atomic_inc(&__rt_peer_genid);
- rt->rt_peer_genid = rt_peer_genid();
- }
- check_peer_pmtu(dst, peer);
- }
+ rt->rt_pmtu = mtu;
+ dst_set_expires(&rt->dst, ip_rt_mtu_expires);
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -1660,7 +1533,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
struct rtable *rt;
flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
- protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS,
+ protocol, flow_flags,
iph->daddr, iph->saddr, 0, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
@@ -1681,30 +1554,12 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
-static void ipv4_validate_peer(struct rtable *rt)
-{
- if (rt->rt_peer_genid != rt_peer_genid()) {
- struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
-
- if (peer) {
- check_peer_pmtu(&rt->dst, peer);
-
- if (peer->redirect_learned.a4 &&
- peer->redirect_learned.a4 != rt->rt_gateway)
- check_peer_redir(&rt->dst, peer);
- }
-
- rt->rt_peer_genid = rt_peer_genid();
- }
-}
-
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
struct rtable *rt = (struct rtable *) dst;
if (rt_is_expired(rt))
return NULL;
- ipv4_validate_peer(rt);
return dst;
}
@@ -1716,10 +1571,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
fib_info_put(rt->fi);
rt->fi = NULL;
}
- if (rt_has_peer(rt)) {
- struct inet_peer *peer = rt_peer_ptr(rt);
- inet_putpeer(peer);
- }
}
@@ -1730,11 +1581,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
rt = skb_rtable(skb);
- if (rt && rt_has_peer(rt)) {
- struct inet_peer *peer = rt_peer_ptr(rt);
- if (peer_pmtu_cleaned(peer))
- dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
- }
+ if (rt)
+ dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -1814,7 +1662,13 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
const struct rtable *rt = (const struct rtable *) dst;
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+ unsigned int mtu = rt->rt_pmtu;
+
+ if (mtu && time_after_eq(jiffies, rt->dst.expires))
+ mtu = 0;
+
+ if (!mtu)
+ mtu = dst_metric_raw(dst, RTAX_MTU);
if (mtu && rt_is_output_route(rt))
return mtu;
@@ -1836,63 +1690,27 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
struct fib_info *fi)
{
- struct inet_peer_base *base;
- struct inet_peer *peer;
- int create = 0;
-
- /* If a peer entry exists for this destination, we must hook
- * it up in order to get at cached metrics.
- */
- if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
- create = 1;
-
- base = inetpeer_base_ptr(rt->_peer);
- BUG_ON(!base);
-
- peer = inet_getpeer_v4(base, rt->rt_dst, create);
- if (peer) {
- __rt_set_peer(rt, peer);
- rt->rt_peer_genid = rt_peer_genid();
- if (inet_metrics_new(peer))
- memcpy(peer->metrics, fi->fib_metrics,
- sizeof(u32) * RTAX_MAX);
- dst_init_metrics(&rt->dst, peer->metrics, false);
-
- check_peer_pmtu(&rt->dst, peer);
-
- if (peer->redirect_learned.a4 &&
- peer->redirect_learned.a4 != rt->rt_gateway) {
- rt->rt_gateway = peer->redirect_learned.a4;
- rt->rt_flags |= RTCF_REDIRECTED;
- }
- } else {
- if (fi->fib_metrics != (u32 *) dst_default_metrics) {
- rt->fi = fi;
- atomic_inc(&fi->fib_clntref);
- }
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+ rt->fi = fi;
+ atomic_inc(&fi->fib_clntref);
}
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
const struct fib_result *res,
struct fib_info *fi, u16 type, u32 itag)
{
- struct dst_entry *dst = &rt->dst;
-
if (fi) {
if (FIB_RES_GW(*res) &&
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
- dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
+ rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
}
- if (dst_mtu(dst) > IP_MAX_MTU)
- dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
-
#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
set_class_tag(rt, fib_rules_tclass(res));
@@ -1964,9 +1782,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_iif = dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = daddr;
- rth->rt_peer_genid = 0;
- rt_init_peer(rth, dev_net(dev)->ipv4.peers);
rth->fi = NULL;
if (our) {
rth->dst.input= ip_local_deliver;
@@ -2090,9 +1907,8 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_iif = in_dev->dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = daddr;
- rth->rt_peer_genid = 0;
- rt_init_peer(rth, &res->table->tb_peers);
rth->fi = NULL;
rth->dst.input = ip_forward;
@@ -2269,9 +2085,8 @@ local_input:
rth->rt_iif = dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = daddr;
- rth->rt_peer_genid = 0;
- rt_init_peer(rth, net->ipv4.peers);
rth->fi = NULL;
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
@@ -2346,7 +2161,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_mark == skb->mark &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
- ipv4_validate_peer(rth);
if (noref) {
dst_use_noref(&rth->dst, jiffies);
skb_dst_set_noref(skb, &rth->dst);
@@ -2468,11 +2282,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth->rt_iif = orig_oif ? : dev_out->ifindex;
rth->rt_oif = orig_oif;
rth->rt_mark = fl4->flowi4_mark;
+ rth->rt_pmtu = 0;
rth->rt_gateway = fl4->daddr;
- rth->rt_peer_genid = 0;
- rt_init_peer(rth, (res->table ?
- &res->table->tb_peers :
- dev_net(dev_out)->ipv4.peers));
rth->fi = NULL;
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2726,7 +2537,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
- ipv4_validate_peer(rth);
dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
@@ -2790,7 +2600,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
- dst_copy_metrics(new, &ort->dst);
new->dev = ort->dst.dev;
if (new->dev)
@@ -2803,6 +2612,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_iif = ort->rt_iif;
rt->rt_oif = ort->rt_oif;
rt->rt_mark = ort->rt_mark;
+ rt->rt_pmtu = ort->rt_pmtu;
rt->rt_genid = rt_genid(net);
rt->rt_flags = ort->rt_flags;
@@ -2810,7 +2620,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_dst = ort->rt_dst;
rt->rt_src = ort->rt_src;
rt->rt_gateway = ort->rt_gateway;
- rt_transfer_peer(rt, ort);
rt->fi = ort->fi;
if (rt->fi)
atomic_inc(&rt->fi->fib_clntref);
@@ -2848,7 +2657,7 @@ static int rt_fill_info(struct net *net,
struct rtmsg *r;
struct nlmsghdr *nlh;
unsigned long expires = 0;
- u32 id = 0, ts = 0, tsage = 0, error;
+ u32 error;
nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
if (nlh == NULL)
@@ -2901,21 +2710,12 @@ static int rt_fill_info(struct net *net,
goto nla_put_failure;
error = rt->dst.error;
- if (rt_has_peer(rt)) {
- const struct inet_peer *peer = rt_peer_ptr(rt);
- inet_peer_refcheck(peer);
- id = atomic_read(&peer->ip_id_count) & 0xffff;
- if (peer->tcp_ts_stamp) {
- ts = peer->tcp_ts;
- tsage = get_seconds() - peer->tcp_ts_stamp;
- }
- expires = ACCESS_ONCE(peer->pmtu_expires);
- if (expires) {
- if (time_before(jiffies, expires))
- expires -= jiffies;
- else
- expires = 0;
- }
+ expires = rt->dst.expires;
+ if (expires) {
+ if (time_before(jiffies, expires))
+ expires -= jiffies;
+ else
+ expires = 0;
}
if (rt_is_input_route(rt)) {
@@ -2944,8 +2744,7 @@ static int rt_fill_info(struct net *net,
goto nla_put_failure;
}
- if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
- expires, error) < 0)
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
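The learned path MTU now lives in rt->rt_pmtu and is bounded in time by
dst.expires instead of a per-peer pmtu_expires; the new ipv4_mtu() above
reduces to this decision (sketch only):

	unsigned int mtu = rt->rt_pmtu;

	if (mtu && time_after_eq(jiffies, rt->dst.expires))
		mtu = 0;					/* learned PMTU timed out */
	if (!mtu)
		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);	/* fall back to the FIB metric */
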
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f60e4e..29aa0c800cd0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3563,6 +3563,8 @@ void __init tcp_init(void)
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+ tcp_metrics_init();
+
tcp_register_congestion_control(&tcp_reno);
memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ca0d0e7c9778..055ac49b8b40 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -93,7 +93,6 @@ int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
-int sysctl_tcp_nometrics_save __read_mostly;
int sysctl_tcp_thin_dupack __read_mostly;
@@ -701,7 +700,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -728,109 +727,6 @@ static inline void tcp_set_rto(struct sock *sk)
tcp_bound_rto(sk);
}
-/* Save metrics learned by this TCP session.
- This function is called only, when TCP finishes successfully
- i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
- */
-void tcp_update_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (sysctl_tcp_nometrics_save)
- return;
-
- if (dst && (dst->flags & DST_HOST)) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int m;
- unsigned long rtt;
-
- dst_confirm(dst);
-
- if (icsk->icsk_backoff || !tp->srtt) {
- /* This session failed to estimate rtt. Why?
- * Probably, no packets returned in time.
- * Reset our results.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT)))
- dst_metric_set(dst, RTAX_RTT, 0);
- return;
- }
-
- rtt = dst_metric_rtt(dst, RTAX_RTT);
- m = rtt - tp->srtt;
-
- /* If newly calculated rtt larger than stored one,
- * store new one. Otherwise, use EWMA. Remember,
- * rtt overestimation is always better than underestimation.
- */
- if (!(dst_metric_locked(dst, RTAX_RTT))) {
- if (m <= 0)
- set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
- else
- set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
- }
-
- if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
- unsigned long var;
- if (m < 0)
- m = -m;
-
- /* Scale deviation to rttvar fixed point */
- m >>= 1;
- if (m < tp->mdev)
- m = tp->mdev;
-
- var = dst_metric_rtt(dst, RTAX_RTTVAR);
- if (m >= var)
- var = m;
- else
- var -= (var - m) >> 2;
-
- set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
- }
-
- if (tcp_in_initial_slowstart(tp)) {
- /* Slow start still did not finish. */
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
- if (!dst_metric_locked(dst, RTAX_CWND) &&
- tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
- } else if (tp->snd_cwnd > tp->snd_ssthresh &&
- icsk->icsk_ca_state == TCP_CA_Open) {
- /* Cong. avoidance phase, cwnd is reliable. */
- if (!dst_metric_locked(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH,
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND,
- (dst_metric(dst, RTAX_CWND) +
- tp->snd_cwnd) >> 1);
- } else {
- /* Else slow start did not finish, cwnd is non-sense,
- ssthresh may be also invalid.
- */
- if (!dst_metric_locked(dst, RTAX_CWND))
- dst_metric_set(dst, RTAX_CWND,
- (dst_metric(dst, RTAX_CWND) +
- tp->snd_ssthresh) >> 1);
- if (dst_metric(dst, RTAX_SSTHRESH) &&
- !dst_metric_locked(dst, RTAX_SSTHRESH) &&
- tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
- dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
- }
-
- if (!dst_metric_locked(dst, RTAX_REORDERING)) {
- if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
- tp->reordering != sysctl_tcp_reordering)
- dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
- }
- }
-}
-
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
@@ -867,7 +763,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
* Packet counting of FACK is based on in-order assumptions, therefore TCP
* disables it when reordering is detected
*/
-static void tcp_disable_fack(struct tcp_sock *tp)
+void tcp_disable_fack(struct tcp_sock *tp)
{
/* RFC3517 uses different metric in lost marker => reset on change */
if (tcp_is_fack(tp))
@@ -881,86 +777,6 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
}
-/* Initialize metrics on socket. */
-
-static void tcp_init_metrics(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst == NULL)
- goto reset;
-
- dst_confirm(dst);
-
- if (dst_metric_locked(dst, RTAX_CWND))
- tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
- if (dst_metric(dst, RTAX_SSTHRESH)) {
- tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
- if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
- tp->snd_ssthresh = tp->snd_cwnd_clamp;
- } else {
- /* ssthresh may have been reduced unnecessarily during.
- * 3WHS. Restore it back to its initial default.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- }
- if (dst_metric(dst, RTAX_REORDERING) &&
- tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
- tcp_disable_fack(tp);
- tcp_disable_early_retrans(tp);
- tp->reordering = dst_metric(dst, RTAX_REORDERING);
- }
-
- if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
- goto reset;
-
- /* Initial rtt is determined from SYN,SYN-ACK.
- * The segment is small and rtt may appear much
- * less than real one. Use per-dst memory
- * to make it more realistic.
- *
- * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal circumstances sending small
- * packets force peer to delay ACKs and calculation is correct too.
- * The algorithm is adaptive and, provided we follow specs, it
- * NEVER underestimate RTT. BUT! If peer tries to make some clever
- * tricks sort of "quick acks" for time long enough to decrease RTT
- * to low value, and then abruptly stops to do it and starts to delay
- * ACKs, wait for troubles.
- */
- if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
- tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
- tp->rtt_seq = tp->snd_nxt;
- }
- if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
- tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
- tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
- }
- tcp_set_rto(sk);
-reset:
- if (tp->srtt == 0) {
- /* RFC6298: 5.7 We've failed to get a valid RTT sample from
- * 3WHS. This is most likely due to retransmission,
- * including spurious one. Reset the RTO back to 3secs
- * from the more aggressive 1sec to avoid more spurious
- * retransmission.
- */
- tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
- }
- /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
- * retransmitted. In light of RFC6298 more aggressive 1sec
- * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
- * retransmission has occurred.
- */
- if (tp->total_retrans > 1)
- tp->snd_cwnd = 1;
- else
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static void tcp_update_reordering(struct sock *sk, const int metric,
const int ts)
{
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 64568fa21d05..ddefd39ac0cf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -209,22 +209,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
if (tcp_death_row.sysctl_tw_recycle &&
- !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
- struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
- /*
- * VJ's idea. We save last timestamp seen from
- * the destination in peer table, when entering state
- * TIME-WAIT * and initialize rx_opt.ts_recent from it,
- * when trying new connection.
- */
- if (peer) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
- tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
- tp->rx_opt.ts_recent = peer->tcp_ts;
- }
- }
- }
+ !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+ tcp_fetch_timewait_stamp(sk, &rt->dst);
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
@@ -1375,7 +1361,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
- struct inet_peer *peer = NULL;
struct flowi4 fl4;
/* VJ's idea. We save last timestamp seen
@@ -1390,12 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL &&
- fl4.daddr == saddr &&
- (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
- (s32)(peer->tcp_ts - req->ts_recent) >
- TCP_PAWS_WINDOW) {
+ fl4.daddr == saddr) {
+ if (!tcp_peer_is_proven(req, dst, true)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
@@ -1404,8 +1385,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
- (!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
+ !tcp_peer_is_proven(req, dst, false)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
@@ -1867,21 +1847,6 @@ do_time_wait:
goto discard_it;
}
-struct inet_peer *tcp_v4_get_peer(struct sock *sk)
-{
- struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
- struct inet_sock *inet = inet_sk(sk);
-
- /* If we don't have a valid cached route, or we're doing IP
- * options which make the IPv4 header destination address
- * different from our peer's, do not bother with this.
- */
- if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr)
- return NULL;
- return rt_get_peer_create(rt, inet->inet_daddr);
-}
-EXPORT_SYMBOL(tcp_v4_get_peer);
-
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
.twsk_unique = tcp_twsk_unique,
@@ -1894,7 +1859,6 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
- .get_peer = tcp_v4_get_peer,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
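The two tcp_v4_conn_request() call sites above exercise both modes of the
new tcp_peer_is_proven() helper; schematically:

	/* paws_check == true: drop the SYN when cached timestamp state
	 * proves req->ts_recent is stale (PAWS), mirroring the old
	 * peer->tcp_ts test.
	 */
	if (!tcp_peer_is_proven(req, dst, true))
		goto drop_and_release;

	/* paws_check == false: when the SYN backlog runs low without
	 * syncookies, keep only destinations with a recorded RTT and
	 * timestamp, i.e. peers proven to be alive.
	 */
	if (!tcp_peer_is_proven(req, dst, false))
		goto drop_and_release;
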
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000000..1fd83d3118fe
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,697 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+enum tcp_metric_index {
+ TCP_METRIC_RTT,
+ TCP_METRIC_RTTVAR,
+ TCP_METRIC_SSTHRESH,
+ TCP_METRIC_CWND,
+ TCP_METRIC_REORDERING,
+
+ /* Always last. */
+ TCP_METRIC_MAX,
+};
+
+struct tcp_metrics_block {
+ struct tcp_metrics_block __rcu *tcpm_next;
+ struct inetpeer_addr tcpm_addr;
+ unsigned long tcpm_stamp;
+ u32 tcpm_ts;
+ u32 tcpm_ts_stamp;
+ u32 tcpm_lock;
+ u32 tcpm_vals[TCP_METRIC_MAX];
+};
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_vals[idx];
+}
+
+static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return msecs_to_jiffies(tm->tcpm_vals[idx]);
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx,
+ u32 val)
+{
+ tm->tcpm_vals[idx] = val;
+}
+
+static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx,
+ u32 val)
+{
+ tm->tcpm_vals[idx] = jiffies_to_msecs(val);
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
+{
+ const struct in6_addr *a6, *b6;
+
+ if (a->family != b->family)
+ return false;
+ if (a->family == AF_INET)
+ return a->addr.a4 == b->addr.a4;
+
+ a6 = (const struct in6_addr *) &a->addr.a6[0];
+ b6 = (const struct in6_addr *) &b->addr.a6[0];
+
+ return ipv6_addr_equal(a6, b6);
+}
+
+struct tcpm_hash_bucket {
+ struct tcp_metrics_block __rcu *chain;
+};
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+ u32 val;
+
+ val = 0;
+ if (dst_metric_locked(dst, RTAX_RTT))
+ val |= 1 << TCP_METRIC_RTT;
+ if (dst_metric_locked(dst, RTAX_RTTVAR))
+ val |= 1 << TCP_METRIC_RTTVAR;
+ if (dst_metric_locked(dst, RTAX_SSTHRESH))
+ val |= 1 << TCP_METRIC_SSTHRESH;
+ if (dst_metric_locked(dst, RTAX_CWND))
+ val |= 1 << TCP_METRIC_CWND;
+ if (dst_metric_locked(dst, RTAX_REORDERING))
+ val |= 1 << TCP_METRIC_REORDERING;
+ tm->tcpm_lock = val;
+
+ tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
+ tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
+ tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+ tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+ tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+ tm->tcpm_ts = 0;
+ tm->tcpm_ts_stamp = 0;
+}
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+ struct inetpeer_addr *addr,
+ unsigned int hash,
+ bool reclaim)
+{
+ struct tcp_metrics_block *tm;
+ struct net *net;
+
+ spin_lock_bh(&tcp_metrics_lock);
+ net = dev_net(dst->dev);
+ if (unlikely(reclaim)) {
+ struct tcp_metrics_block *oldest;
+
+ oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
+ for (tm = rcu_dereference(oldest->tcpm_next); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ oldest = tm;
+ }
+ tm = oldest;
+ } else {
+ tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ if (!tm)
+ goto out_unlock;
+ }
+ tm->tcpm_addr = *addr;
+ tm->tcpm_stamp = jiffies;
+
+ tcpm_suck_dst(tm, dst);
+
+ if (likely(!reclaim)) {
+ tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+ }
+
+out_unlock:
+ spin_unlock_bh(&tcp_metrics_lock);
+ return tm;
+}
+
+#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+ if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ tcpm_suck_dst(tm, dst);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH 5
+#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+ if (tm)
+ return tm;
+ if (depth > TCP_METRICS_RECLAIM_DEPTH)
+ return TCP_METRICS_RECLAIM_PTR;
+ return NULL;
+}
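+
+/* A NULL result from tcp_get_encode() means "not found, bucket still
+ * short": callers may simply allocate a fresh block.
+ * TCP_METRICS_RECLAIM_PTR means "not found and the bucket is already
+ * TCP_METRICS_RECLAIM_DEPTH deep": tcpm_new() then recycles the oldest
+ * entry in the bucket instead of growing the chain.
+ */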
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
+ struct net *net, unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ int depth = 0;
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_addr, addr))
+ break;
+ depth++;
+ }
+ return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr addr;
+ unsigned int hash;
+ struct net *net;
+
+ addr.family = req->rsk_ops->family;
+ switch (addr.family) {
+ case AF_INET:
+ addr.addr.a4 = inet_rsk(req)->rmt_addr;
+ hash = (__force unsigned int) addr.addr.a4;
+ break;
+ case AF_INET6:
+ *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
+ hash = ((__force unsigned int) addr.addr.a6[0] ^
+ (__force unsigned int) addr.addr.a6[1] ^
+ (__force unsigned int) addr.addr.a6[2] ^
+ (__force unsigned int) addr.addr.a6[3]);
+ break;
+ default:
+ return NULL;
+ }
+
+ hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+ net = dev_net(dst->dev);
+ hash &= net->ipv4.tcp_metrics_hash_mask;
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_addr, &addr))
+ break;
+ }
+ tcpm_check_stamp(tm, dst);
+ return tm;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
+{
+ struct inet6_timewait_sock *tw6;
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr addr;
+ unsigned int hash;
+ struct net *net;
+
+ addr.family = tw->tw_family;
+ switch (addr.family) {
+ case AF_INET:
+ addr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) addr.addr.a4;
+ break;
+ case AF_INET6:
+ tw6 = inet6_twsk((struct sock *)tw);
+ *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
+ hash = ((__force unsigned int) addr.addr.a6[0] ^
+ (__force unsigned int) addr.addr.a6[1] ^
+ (__force unsigned int) addr.addr.a6[2] ^
+ (__force unsigned int) addr.addr.a6[3]);
+ break;
+ default:
+ return NULL;
+ }
+
+ hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+ net = twsk_net(tw);
+ hash &= net->ipv4.tcp_metrics_hash_mask;
+
+ for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_addr, &addr))
+ break;
+ }
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+ struct dst_entry *dst,
+ bool create)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr addr;
+ unsigned int hash;
+ struct net *net;
+ bool reclaim;
+
+ addr.family = sk->sk_family;
+ switch (addr.family) {
+ case AF_INET:
+ addr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) addr.addr.a4;
+ break;
+ case AF_INET6:
+ *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
+ hash = ((__force unsigned int) addr.addr.a6[0] ^
+ (__force unsigned int) addr.addr.a6[1] ^
+ (__force unsigned int) addr.addr.a6[2] ^
+ (__force unsigned int) addr.addr.a6[3]);
+ break;
+ default:
+ return NULL;
+ }
+
+ hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8);
+
+ net = dev_net(dst->dev);
+ hash &= net->ipv4.tcp_metrics_hash_mask;
+
+ tm = __tcp_get_metrics(&addr, net, hash);
+ reclaim = false;
+ if (tm == TCP_METRICS_RECLAIM_PTR) {
+ reclaim = true;
+ tm = NULL;
+ }
+ if (!tm && create)
+ tm = tcpm_new(dst, &addr, hash, reclaim);
+ else
+ tcpm_check_stamp(tm, dst);
+
+ return tm;
+}
+
+/* Save metrics learned by this TCP session. This function is called
+ * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ unsigned long rtt;
+ u32 val;
+ int m;
+
+ if (sysctl_tcp_nometrics_save || !dst)
+ return;
+
+ if (dst->flags & DST_HOST)
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ if (icsk->icsk_backoff || !tp->srtt) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time. Reset our
+ * results.
+ */
+ tm = tcp_get_metrics(sk, dst, false);
+ if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+ tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+ goto out_unlock;
+ } else
+ tm = tcp_get_metrics(sk, dst, true);
+
+ if (!tm)
+ goto out_unlock;
+
+ rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
+ m = rtt - tp->srtt;
+
+ /* If newly calculated rtt larger than stored one, store new
+ * one. Otherwise, use EWMA. Remember, rtt overestimation is
+ * always better than underestimation.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+ if (m <= 0)
+ rtt = tp->srtt;
+ else
+ rtt -= (m >> 3);
+ tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
+ }
+
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+ unsigned long var;
+
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev)
+ m = tp->mdev;
+
+ var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+ if (m >= var)
+ var = m;
+ else
+ var -= (var - m) >> 2;
+
+ tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
+ }
+
+ if (tcp_in_initial_slowstart(tp)) {
+ /* Slow start still did not finish. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && (tp->snd_cwnd >> 1) > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_cwnd >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ if (tp->snd_cwnd > val)
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ tp->snd_cwnd);
+ }
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ icsk->icsk_ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+ }
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ * ssthresh may be also invalid.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ (val + tp->snd_ssthresh) >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && tp->snd_ssthresh > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_ssthresh);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ tp->reordering);
+ }
+ }
+ tm->tcpm_stamp = jiffies;
+out_unlock:
+ rcu_read_unlock();
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ u32 val;
+
+ if (dst == NULL)
+ goto reset;
+
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (!tm) {
+ rcu_read_unlock();
+ goto reset;
+ }
+
+ if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+ tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val) {
+ tp->snd_ssthresh = val;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ } else {
+ /* ssthresh may have been reduced unnecessarily during
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ }
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val && tp->reordering != val) {
+ tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
+ tp->reordering = val;
+ }
+
+ val = tcp_metric_get(tm, TCP_METRIC_RTT);
+ if (val == 0 || tp->srtt == 0) {
+ rcu_read_unlock();
+ goto reset;
+ }
+ /* Initial rtt is determined from SYN,SYN-ACK.
+ * The segment is small and rtt may appear much
+ * less than real one. Use per-dst memory
+ * to make it more realistic.
+ *
+ * A bit of theory. RTT is time passed after "normal" sized packet
+ * is sent until it is ACKed. In normal circumstances sending small
+ * packets force peer to delay ACKs and calculation is correct too.
+ * The algorithm is adaptive and, provided we follow specs, it
+ * NEVER underestimate RTT. BUT! If peer tries to make some clever
+ * tricks sort of "quick acks" for time long enough to decrease RTT
+ * to low value, and then abruptly stops to do it and starts to delay
+ * ACKs, wait for troubles.
+ */
+ val = msecs_to_jiffies(val);
+ if (val > tp->srtt) {
+ tp->srtt = val;
+ tp->rtt_seq = tp->snd_nxt;
+ }
+ val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
+ if (val > tp->mdev) {
+ tp->mdev = val;
+ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+ }
+ rcu_read_unlock();
+
+ tcp_set_rto(sk);
+reset:
+ if (tp->srtt == 0) {
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
+ * 3WHS. This is most likely due to retransmission,
+ * including spurious one. Reset the RTO back to 3secs
+ * from the more aggressive 1sec to avoid more spurious
+ * retransmission.
+ */
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+ }
+ /* Cut cwnd down to 1 per RFC 5681 if the SYN or SYN-ACK has been
+ * retransmitted. In light of RFC 6298's more aggressive 1 sec
+ * initRTO, we only reset cwnd when more than one SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1)
+ tp->snd_cwnd = 1;
+ else
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
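Note that tcp_init_metrics() only ever raises the RTT estimate from the cache: the SYN/SYN-ACK sample can undershoot the RTT of full-sized segments, so the cached value, stored in milliseconds, acts as a floor. A small self-contained sketch of that seeding, assuming HZ == 1000; msecs_to_jiffies_demo() is a stand-in for the kernel helper, and all values are made up.

    #include <stdio.h>

    #define HZ 1000U

    static unsigned int msecs_to_jiffies_demo(unsigned int ms)
    {
            return ms * (HZ / 1000U);       /* exact for HZ == 1000 */
    }

    int main(void)
    {
            unsigned int cached_rtt_ms = 120; /* from the metrics cache */
            unsigned int measured_srtt = 15;  /* small SYN/SYN-ACK sample */
            unsigned int seed = msecs_to_jiffies_demo(cached_rtt_ms);

            /* Only ever raise the estimate: the handshake sample can be
             * too low, never too high, for a "normal" sized segment. */
            if (seed > measured_srtt)
                    measured_srtt = seed;

            printf("seeded srtt: %u jiffies\n", measured_srtt);
            return 0;
    }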
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+{
+ struct tcp_metrics_block *tm;
+ bool ret;
+
+ if (!dst)
+ return false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_req(req, dst);
+ if (paws_check) {
+ if (tm &&
+ (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
+ (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+ ret = false;
+ else
+ ret = true;
+ } else {
+ if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
+ ret = true;
+ else
+ ret = false;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
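The PAWS branch of tcp_peer_is_proven() leans on the (s32) cast of a u32 difference, the usual serial-number comparison that stays correct across 32-bit timestamp wraparound. A standalone demonstration of the trick, with illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    /* True when a is strictly newer than b on the 32-bit circle. */
    static int ts_newer(uint32_t a, uint32_t b)
    {
            return (int32_t)(a - b) > 0;
    }

    int main(void)
    {
            /* 0x00000010 is newer than 0xfffffff0 despite being
             * numerically smaller, because the counter wrapped. */
            printf("%d\n", ts_newer(0x00000010u, 0xfffffff0u)); /* 1 */
            printf("%d\n", ts_newer(0xfffffff0u, 0x00000010u)); /* 0 */
            return 0;
    }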
+void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
+ tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
+ tp->rx_opt.ts_recent = tm->tcpm_ts;
+ }
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
+
+/* VJ's idea. Save the last timestamp seen from this destination and
+ * hold it at least for the normal timewait interval, to use for
+ * duplicate segment detection in subsequent connections, before they
+ * enter synchronized state.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ bool ret = false;
+
+ if (dst) {
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+ tm->tcpm_ts = tp->rx_opt.ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+ }
+ return ret;
+}
+
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ bool ret = false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_tw(tw);
+ if (tm) {
+ const struct tcp_timewait_sock *tcptw;
+ struct sock *sk = (struct sock *) tw;
+
+ tcptw = tcp_twsk(sk);
+ if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+ tm->tcpm_ts = tcptw->tw_ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static unsigned long tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtoul(str, 0, &tcpmhash_entries);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
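With this hook the hash size can be pinned at boot, e.g. by adding tcpmhash_entries=16384 to the kernel command line (the value here is only an example). When the parameter is absent or unparsable, the handler leaves tcpmhash_entries at zero and the per-netns init below falls back to a RAM-based default.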
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+ int slots, size;
+
+ slots = tcpmhash_entries;
+ if (!slots) {
+ if (totalram_pages >= 128 * 1024)
+ slots = 16 * 1024;
+ else
+ slots = 8 * 1024;
+ }
+
+ size = slots * sizeof(struct tcpm_hash_bucket);
+
+ net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL);
+ if (!net->ipv4.tcp_metrics_hash)
+ return -ENOMEM;
+
+ net->ipv4.tcp_metrics_hash_mask = (slots - 1);
+
+ return 0;
+}
+
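The mask-based indexing above assumes slots is a power of two. The built-in defaults (16K and 8K) are, but a user-supplied tcpmhash_entries need not be, in which case (slots - 1) is not a valid mask. Rounding up first would be the defensive choice; in-kernel that would be roundup_pow_of_two() from <linux/log2.h>, sketched here in portable C. This hardening is my suggestion, not part of the patch.

    #include <stdio.h>

    /* Smallest power of two >= n (behaves like roundup_pow_of_two()). */
    static unsigned long round_up_pow2(unsigned long n)
    {
            unsigned long p = 1;

            while (p < n)
                    p <<= 1;
            return p;
    }

    int main(void)
    {
            /* e.g. a boot-time tcpmhash_entries=3000 */
            unsigned long slots = round_up_pow2(3000);

            printf("slots=%lu mask=%#lx\n", slots, slots - 1); /* 4096, 0xfff */
            return 0;
    }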
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+ kfree(net->ipv4.tcp_metrics_hash);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+ .init = tcp_net_metrics_init,
+ .exit = tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+ register_pernet_subsys(&tcp_net_metrics_ops);
+}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 72b7c63b1a39..65608863fdee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,52 +49,6 @@ struct inet_timewait_death_row tcp_death_row = {
};
EXPORT_SYMBOL_GPL(tcp_death_row);
-/* VJ's idea. Save last timestamp seen from this destination
- * and hold it at least for normal timewait interval to use for duplicate
- * segment detection in subsequent connections, before they enter synchronized
- * state.
- */
-
-static bool tcp_remember_stamp(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct inet_peer *peer;
-
- peer = icsk->icsk_af_ops->get_peer(sk);
- if (peer) {
- if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
- ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
- peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
- peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
- peer->tcp_ts = tp->rx_opt.ts_recent;
- }
- return true;
- }
-
- return false;
-}
-
-static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
-{
- const struct tcp_timewait_sock *tcptw;
- struct sock *sk = (struct sock *) tw;
- struct inet_peer *peer;
-
- tcptw = tcp_twsk(sk);
- peer = tcptw->tw_peer;
- if (peer) {
- if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
- ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
- peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
- peer->tcp_ts = tcptw->tw_ts_recent;
- }
- return true;
- }
- return false;
-}
-
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
if (seq == s_win)
@@ -313,12 +267,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
bool recycle_ok = false;
- bool recycle_on = false;
- if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) {
+ if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tcp_remember_stamp(sk);
- recycle_on = true;
- }
if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
tw = inet_twsk_alloc(sk, state);
@@ -327,7 +278,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
struct inet_sock *inet = inet_sk(sk);
- struct inet_peer *peer = NULL;
tw->tw_transparent = inet->transparent;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
@@ -351,12 +301,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
#endif
- if (recycle_on)
- peer = icsk->icsk_af_ops->get_peer(sk);
- tcptw->tw_peer = peer;
- if (peer)
- atomic_inc(&peer->refcnt);
-
#ifdef CONFIG_TCP_MD5SIG
/*
* The timewait bucket does not have the key DB from the
@@ -408,11 +352,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
void tcp_twsk_destructor(struct sock *sk)
{
+#ifdef CONFIG_TCP_MD5SIG
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_peer)
- inet_putpeer(twsk->tw_peer);
-#ifdef CONFIG_TCP_MD5SIG
if (twsk->tw_md5_key) {
tcp_free_md5sig_pool();
kfree_rcu(twsk->tw_md5_key, rcu);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9815ea0bca7f..87d3fcc302d4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -90,8 +90,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.dst.dev = dev;
dev_hold(dev);
- rt_transfer_peer(&xdst->u.rt, rt);
-
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
@@ -100,6 +98,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_src = rt->rt_src;
xdst->u.rt.rt_dst = rt->rt_dst;
xdst->u.rt.rt_gateway = rt->rt_gateway;
+ xdst->u.rt.rt_pmtu = rt->rt_pmtu;
return 0;
}
@@ -209,11 +208,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
dst_destroy_metrics_generic(dst);
- if (rt_has_peer(&xdst->u.rt)) {
- struct inet_peer *peer = rt_peer_ptr(&xdst->u.rt);
- inet_putpeer(peer);
- }
-
xfrm_dst_destroy(xdst);
}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c7da1422cbde..a113f7d7e938 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -194,8 +194,10 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- peer = rt6_get_peer_create(rt);
+ peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
res = inet_peer_xrlim_allow(peer, tmo);
+ if (peer)
+ inet_putpeer(peer);
}
dst_release(dst);
return res;
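This acquire/check/release shape, inet_getpeer_v6() with the create flag set, inet_peer_xrlim_allow(), then inet_putpeer() on a non-NULL peer, replaces the old cached rt6_get_peer_create() pointer, and the same pattern repeats in the ip6_output.c and ndisc.c hunks below. The diff's own usage implies inet_peer_xrlim_allow() tolerates a NULL peer (allowing by default), which is why only the put needs the NULL check.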
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c6af5963a202..5b2d63ed793e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -466,13 +466,15 @@ int ip6_forward(struct sk_buff *skb)
else
target = &hdr->daddr;
- peer = rt6_get_peer_create(rt);
+ peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
/* Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
*/
if (inet_peer_xrlim_allow(peer, 1*HZ))
ndisc_send_redirect(skb, target);
+ if (peer)
+ inet_putpeer(peer);
} else {
int addrtype = ipv6_addr_type(&hdr->saddr);
@@ -592,10 +594,14 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
int old, new;
if (rt && !(rt->dst.flags & DST_NOPEER)) {
- struct inet_peer *peer = rt6_get_peer_create(rt);
+ struct inet_peer *peer;
+ struct net *net;
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
if (peer) {
fhdr->identification = htonl(inet_getid(peer, 0));
+ inet_putpeer(peer);
return;
}
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 69a6330dea91..0fddd571400d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1486,6 +1486,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
int rd_len;
int err;
u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+ bool ret;
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
@@ -1519,8 +1520,11 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
"Redirect: destination is not a neighbour\n");
goto release;
}
- peer = rt6_get_peer_create(rt);
- if (!inet_peer_xrlim_allow(peer, 1*HZ))
+ peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+ ret = inet_peer_xrlim_allow(peer, 1*HZ);
+ if (peer)
+ inet_putpeer(peer);
+ if (!ret)
goto release;
if (dev->addr_len) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6cc6c881f54f..563f12c1c99c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1093,7 +1093,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_oif = oif;
fl6.flowi6_mark = mark;
- fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
+ fl6.flowi6_flags = 0;
fl6.daddr = iph->daddr;
fl6.saddr = iph->saddr;
fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
@@ -2348,13 +2348,11 @@ static int rt6_fill_node(struct net *net,
int iif, int type, u32 pid, u32 seq,
int prefix, int nowait, unsigned int flags)
{
- const struct inet_peer *peer;
struct rtmsg *rtm;
struct nlmsghdr *nlh;
long expires;
u32 table;
struct neighbour *n;
- u32 ts, tsage;
if (prefix) { /* user wants prefix routes only */
if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
@@ -2473,17 +2471,7 @@ static int rt6_fill_node(struct net *net,
else
expires = INT_MAX;
- peer = NULL;
- if (rt6_has_peer(rt))
- peer = rt6_peer_ptr(rt);
- ts = tsage = 0;
- if (peer && peer->tcp_ts_stamp) {
- ts = peer->tcp_ts;
- tsage = get_seconds() - peer->tcp_ts_stamp;
- }
-
- if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
- expires, rt->dst.error) < 0)
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6cc67ed6c2e6..61175cb2478f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -277,22 +277,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
rt = (struct rt6_info *) dst;
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp &&
- ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) {
- struct inet_peer *peer = rt6_get_peer(rt);
- /*
- * VJ's idea. We save last timestamp seen from
- * the destination in peer table, when entering state
- * TIME-WAIT * and initialize rx_opt.ts_recent from it,
- * when trying new connection.
- */
- if (peer) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
- tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
- tp->rx_opt.ts_recent = peer->tcp_ts;
- }
- }
- }
+ ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr))
+ tcp_fetch_timewait_stamp(sk, dst);
icsk->icsk_ext_hdr_len = 0;
if (np->opt)
@@ -1134,8 +1120,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
treq->iif = inet6_iif(skb);
if (!isn) {
- struct inet_peer *peer = NULL;
-
if (ipv6_opt_accepted(sk, skb) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
@@ -1160,14 +1144,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
*/
if (tmp_opt.saw_tstamp &&
tcp_death_row.sysctl_tw_recycle &&
- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL &&
- (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL &&
- ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
- &treq->rmt_addr)) {
- inet_peer_refcheck(peer);
- if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
- (s32)(peer->tcp_ts - req->ts_recent) >
- TCP_PAWS_WINDOW) {
+ (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
+ if (!tcp_peer_is_proven(req, dst, true)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
@@ -1176,8 +1154,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
- (!peer || !peer->tcp_ts_stamp) &&
- (!dst || !dst_metric(dst, RTAX_RTT))) {
+ !tcp_peer_is_proven(req, dst, false)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
@@ -1712,20 +1689,6 @@ do_time_wait:
goto discard_it;
}
-static struct inet_peer *tcp_v6_get_peer(struct sock *sk)
-{
- struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- /* If we don't have a valid cached route, or we're doing IP
- * options which make the IPv6 header destination address
- * different from our peer's, do not bother with this.
- */
- if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr))
- return NULL;
- return rt6_get_peer_create(rt);
-}
-
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
.twsk_unique = tcp_twsk_unique,
@@ -1738,7 +1701,6 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
.rebuild_header = inet6_sk_rebuild_header,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
- .get_peer = tcp_v6_get_peer,
.net_header_len = sizeof(struct ipv6hdr),
.net_frag_header_len = sizeof(struct frag_hdr),
.setsockopt = ipv6_setsockopt,
@@ -1770,7 +1732,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
.rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
- .get_peer = tcp_v4_get_peer,
.net_header_len = sizeof(struct iphdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,