aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig1
-rw-r--r--net/ipv4/af_inet.c63
-rw-r--r--net/ipv4/arp.c49
-rw-r--r--net/ipv4/bpf_tcp_ca.c34
-rw-r--r--net/ipv4/cipso_ipv4.c12
-rw-r--r--net/ipv4/datagram.c7
-rw-r--r--net/ipv4/devinet.c17
-rw-r--r--net/ipv4/esp4.c12
-rw-r--r--net/ipv4/esp4_offload.c7
-rw-r--r--net/ipv4/fib_frontend.c49
-rw-r--r--net/ipv4/fib_lookup.h10
-rw-r--r--net/ipv4/fib_rules.c27
-rw-r--r--net/ipv4/fib_semantics.c216
-rw-r--r--net/ipv4/fib_trie.c89
-rw-r--r--net/ipv4/fou.c27
-rw-r--r--net/ipv4/gre_offload.c13
-rw-r--r--net/ipv4/icmp.c188
-rw-r--r--net/ipv4/igmp.c14
-rw-r--r--net/ipv4/inet_connection_sock.c22
-rw-r--r--net/ipv4/inet_diag.c9
-rw-r--r--net/ipv4/inet_fragment.c11
-rw-r--r--net/ipv4/inet_hashtables.c197
-rw-r--r--net/ipv4/inet_timewait_sock.c18
-rw-r--r--net/ipv4/inetpeer.c12
-rw-r--r--net/ipv4/ip_forward.c15
-rw-r--r--net/ipv4/ip_fragment.c4
-rw-r--r--net/ipv4/ip_gre.c91
-rw-r--r--net/ipv4/ip_input.c33
-rw-r--r--net/ipv4/ip_options.c31
-rw-r--r--net/ipv4/ip_output.c49
-rw-r--r--net/ipv4/ip_sockglue.c2
-rw-r--r--net/ipv4/ip_tunnel.c9
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ipmr.c32
-rw-r--r--net/ipv4/netfilter.c3
-rw-r--r--net/ipv4/netfilter/Kconfig8
-rw-r--r--net/ipv4/netfilter/Makefile3
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c11
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c37
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c24
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c10
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c1
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c1
-rw-r--r--net/ipv4/nexthop.c26
-rw-r--r--net/ipv4/ping.c76
-rw-r--r--net/ipv4/proc.c4
-rw-r--r--net/ipv4/raw.c34
-rw-r--r--net/ipv4/route.c207
-rw-r--r--net/ipv4/syncookies.c12
-rw-r--r--net/ipv4/sysctl_net_ipv4.c55
-rw-r--r--net/ipv4/tcp.c236
-rw-r--r--net/ipv4/tcp_bbr.c40
-rw-r--r--net/ipv4/tcp_bic.c14
-rw-r--r--net/ipv4/tcp_bpf.c59
-rw-r--r--net/ipv4/tcp_cdg.c30
-rw-r--r--net/ipv4/tcp_cong.c32
-rw-r--r--net/ipv4/tcp_cubic.c43
-rw-r--r--net/ipv4/tcp_dctcp.c29
-rw-r--r--net/ipv4/tcp_highspeed.c18
-rw-r--r--net/ipv4/tcp_htcp.c10
-rw-r--r--net/ipv4/tcp_hybla.c18
-rw-r--r--net/ipv4/tcp_illinois.c12
-rw-r--r--net/ipv4/tcp_input.c262
-rw-r--r--net/ipv4/tcp_ipv4.c242
-rw-r--r--net/ipv4/tcp_lp.c6
-rw-r--r--net/ipv4/tcp_metrics.c12
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_nv.c24
-rw-r--r--net/ipv4/tcp_offload.c1
-rw-r--r--net/ipv4/tcp_output.c115
-rw-r--r--net/ipv4/tcp_rate.c13
-rw-r--r--net/ipv4/tcp_recovery.c15
-rw-r--r--net/ipv4/tcp_scalable.c4
-rw-r--r--net/ipv4/tcp_vegas.c21
-rw-r--r--net/ipv4/tcp_veno.c24
-rw-r--r--net/ipv4/tcp_westwood.c3
-rw-r--r--net/ipv4/tcp_yeah.c30
-rw-r--r--net/ipv4/udp.c74
-rw-r--r--net/ipv4/udp_bpf.c17
-rw-r--r--net/ipv4/udp_impl.h4
-rw-r--r--net/ipv4/udp_offload.c32
-rw-r--r--net/ipv4/udp_tunnel_nic.c2
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_protocol.c1
86 files changed, 1844 insertions, 1512 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 87983e70f03f..e983bb0c5012 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -321,7 +321,6 @@ config NET_UDP_TUNNEL
config NET_FOU
tristate "IP: Foo (IP protocols) over UDP"
- select XFRM
select NET_UDP_TUNNEL
help
Foo over UDP allows any IP protocol to be directly encapsulated
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0189e3cd4a7d..ac67f6b4ec70 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/inet_connection_sock.h>
+#include <net/gro.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
@@ -154,7 +155,7 @@ void inet_sock_destruct(struct sock *sk)
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
- dst_release(sk->sk_rx_dst);
+ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
sk_refcnt_debug_dec(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -224,7 +225,7 @@ int inet_listen(struct socket *sock, int backlog)
tcp_fastopen_init_key_once(sock_net(sk));
}
- err = inet_csk_listen_start(sk, backlog);
+ err = inet_csk_listen_start(sk);
if (err)
goto out;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
@@ -488,11 +489,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!inet_can_nonlocal_bind(net, inet) &&
- addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
snum = ntohs(addr->sin_port);
@@ -533,6 +531,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ if (sk->sk_prot->put_port)
+ sk->sk_prot->put_port(sk);
goto out_release_sock;
}
}
@@ -836,7 +836,7 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
EXPORT_SYMBOL(inet_sendpage);
INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
- size_t, int, int, int *));
+ size_t, int, int *));
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
@@ -848,8 +848,7 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
sock_rps_record_flow(sk);
err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
- sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
@@ -1234,9 +1233,9 @@ static int inet_sk_reselect_saddr(struct sock *sk)
/* Query new route. */
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if, sk->sk_protocol,
- inet->inet_sport, inet->inet_dport, sk);
+ rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if,
+ sk->sk_protocol, inet->inet_sport,
+ inet->inet_dport, sk);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -1247,7 +1246,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (new_saddr == old_saddr)
return 0;
- if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}
@@ -1302,7 +1301,7 @@ int inet_sk_rebuild_header(struct sock *sk)
* Other protocols have to map its equivalent state to TCP_SYN_SENT.
* DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
*/
- if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
(err = inet_sk_reselect_saddr(sk)) != 0)
@@ -1376,8 +1375,11 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
}
ops = rcu_dereference(inet_offloads[proto]);
- if (likely(ops && ops->callbacks.gso_segment))
+ if (likely(ops && ops->callbacks.gso_segment)) {
segs = ops->callbacks.gso_segment(skb, features);
+ if (!segs)
+ skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
+ }
if (IS_ERR_OR_NULL(segs))
goto out;
@@ -1454,19 +1456,18 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
proto = iph->protocol;
- rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
if (*(u8 *)iph != 0x45)
- goto out_unlock;
+ goto out;
if (ip_is_fragment(iph))
- goto out_unlock;
+ goto out;
if (unlikely(ip_fast_csum((u8 *)iph, 5)))
- goto out_unlock;
+ goto out;
id = ntohl(*(__be32 *)&iph->id);
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
@@ -1543,9 +1544,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
ops->callbacks.gro_receive, head, skb);
-out_unlock:
- rcu_read_unlock();
-
out:
skb_gro_flush_final(skb, pp, flush);
@@ -1618,10 +1616,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
- rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
/* Only need to add sizeof(*iph) to get to the next hdr below
* because any hdr with option will have been flushed in
@@ -1631,9 +1628,7 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
tcp4_gro_complete, udp4_gro_complete,
skb, nhoff + sizeof(*iph));
-out_unlock:
- rcu_read_unlock();
-
+out:
return err;
}
@@ -1994,6 +1989,10 @@ static int __init inet_init(void)
ip_init();
+ /* Initialise per-cpu ipv4 mibs */
+ if (init_ipv4_mibs())
+ panic("%s: Cannot init ipv4 mibs\n", __func__);
+
/* Setup TCP slab cache for open requests. */
tcp_init();
@@ -2024,12 +2023,6 @@ static int __init inet_init(void)
if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
- /*
- * Initialise per-cpu ipv4 mibs
- */
-
- if (init_ipv4_mibs())
- pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 857a144b1ea9..ab4a5601c82a 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -293,7 +293,7 @@ static int arp_constructor(struct neighbour *neigh)
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
dst_link_failure(skb);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
}
/* Create and send an arp packet. */
@@ -1116,13 +1116,18 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
return err;
}
-static int arp_invalidate(struct net_device *dev, __be32 ip)
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
{
struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
int err = -ENXIO;
struct neigh_table *tbl = &arp_tbl;
if (neigh) {
+ if ((neigh->nud_state & NUD_VALID) && !force) {
+ neigh_release(neigh);
+ return 0;
+ }
+
if (neigh->nud_state & ~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
@@ -1169,7 +1174,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
if (!dev)
return -EINVAL;
}
- return arp_invalidate(dev, ip);
+ return arp_invalidate(dev, ip, true);
}
/*
@@ -1299,24 +1304,9 @@ static struct packet_type arp_packet_type __read_mostly = {
.func = arp_rcv,
};
-static int arp_proc_init(void);
-
-void __init arp_init(void)
-{
- neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
-
- dev_add_pack(&arp_packet_type);
- arp_proc_init();
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
-#endif
- register_netdevice_notifier(&arp_netdev_notifier);
-}
-
#ifdef CONFIG_PROC_FS
#if IS_ENABLED(CONFIG_AX25)
-/* ------------------------------------------------------------------------ */
/*
* ax25 -> ASCII conversion
*/
@@ -1422,16 +1412,13 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
}
-/* ------------------------------------------------------------------------ */
-
static const struct seq_operations arp_seq_ops = {
.start = arp_seq_start,
.next = neigh_seq_next,
.stop = neigh_seq_stop,
.show = arp_seq_show,
};
-
-/* ------------------------------------------------------------------------ */
+#endif /* CONFIG_PROC_FS */
static int __net_init arp_net_init(struct net *net)
{
@@ -1451,16 +1438,14 @@ static struct pernet_operations arp_net_ops = {
.exit = arp_net_exit,
};
-static int __init arp_proc_init(void)
+void __init arp_init(void)
{
- return register_pernet_subsys(&arp_net_ops);
-}
-
-#else /* CONFIG_PROC_FS */
+ neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
-static int __init arp_proc_init(void)
-{
- return 0;
+ dev_add_pack(&arp_packet_type);
+ register_pernet_subsys(&arp_net_ops);
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
}
-
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 4bb9401b0a3f..f79ab942f03b 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
+#include <linux/init.h>
#include <linux/types.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
@@ -95,12 +96,14 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int off,
int size, enum bpf_access_type atype,
- u32 *next_btf_id)
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
{
size_t end;
if (atype == BPF_READ)
- return btf_struct_access(log, btf, t, off, size, atype, next_btf_id);
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");
@@ -169,7 +172,7 @@ static u32 prog_ops_moff(const struct bpf_prog *prog)
t = bpf_tcp_congestion_ops.type;
m = &btf_type_member(t)[midx];
- return btf_member_bit_offset(t, m) / 8;
+ return __btf_member_bit_offset(t, m) / 8;
}
static const struct bpf_func_proto *
@@ -212,26 +215,23 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
}
}
-BTF_SET_START(bpf_tcp_ca_kfunc_ids)
+BTF_SET_START(bpf_tcp_ca_check_kfunc_ids)
BTF_ID(func, tcp_reno_ssthresh)
BTF_ID(func, tcp_reno_cong_avoid)
BTF_ID(func, tcp_reno_undo_cwnd)
BTF_ID(func, tcp_slow_start)
BTF_ID(func, tcp_cong_avoid_ai)
-BTF_SET_END(bpf_tcp_ca_kfunc_ids)
+BTF_SET_END(bpf_tcp_ca_check_kfunc_ids)
-static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner)
-{
- if (btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id))
- return true;
- return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner);
-}
+static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &bpf_tcp_ca_check_kfunc_ids,
+};
static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
.get_func_proto = bpf_tcp_ca_get_func_proto,
.is_valid_access = bpf_tcp_ca_is_valid_access,
.btf_struct_access = bpf_tcp_ca_btf_struct_access,
- .check_kfunc_call = bpf_tcp_ca_check_kfunc_call,
};
static int bpf_tcp_ca_init_member(const struct btf_type *t,
@@ -246,7 +246,7 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
utcp_ca = (const struct tcp_congestion_ops *)udata;
tcp_ca = (struct tcp_congestion_ops *)kdata;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
switch (moff) {
case offsetof(struct tcp_congestion_ops, flags):
if (utcp_ca->flags & ~TCP_CONG_MASK)
@@ -276,7 +276,7 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
static int bpf_tcp_ca_check_member(const struct btf_type *t,
const struct btf_member *member)
{
- if (is_unsupported(btf_member_bit_offset(t, member) / 8))
+ if (is_unsupported(__btf_member_bit_offset(t, member) / 8))
return -ENOTSUPP;
return 0;
}
@@ -300,3 +300,9 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
.init = bpf_tcp_ca_init,
.name = "tcp_congestion_ops",
};
+
+static int __init bpf_tcp_ca_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+}
+late_initcall(bpf_tcp_ca_kfunc_init);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 62d5f99760aa..6cd3b6c559f0 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -239,7 +239,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
struct cipso_v4_map_cache_entry *prev_entry = NULL;
u32 hash;
- if (!cipso_v4_cache_enabled)
+ if (!READ_ONCE(cipso_v4_cache_enabled))
return -ENOENT;
hash = cipso_v4_map_cache_hash(key, key_len);
@@ -296,13 +296,14 @@ static int cipso_v4_cache_check(const unsigned char *key,
int cipso_v4_cache_add(const unsigned char *cipso_ptr,
const struct netlbl_lsm_secattr *secattr)
{
+ int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize);
int ret_val = -EPERM;
u32 bkt;
struct cipso_v4_map_cache_entry *entry = NULL;
struct cipso_v4_map_cache_entry *old_entry = NULL;
u32 cipso_ptr_len;
- if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+ if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0)
return 0;
cipso_ptr_len = cipso_ptr[1];
@@ -322,7 +323,7 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr,
bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
spin_lock_bh(&cipso_v4_cache[bkt].lock);
- if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+ if (cipso_v4_cache[bkt].size < bkt_size) {
list_add(&entry->list, &cipso_v4_cache[bkt].list);
cipso_v4_cache[bkt].size += 1;
} else {
@@ -1199,7 +1200,8 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
/* This will send packets using the "optimized" format when
* possible as specified in section 3.4.2.6 of the
* CIPSO draft. */
- if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
+ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 &&
+ ret_val <= 10)
tag_len = 14;
else
tag_len = 4 + ret_val;
@@ -1603,7 +1605,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
* all the CIPSO validations here but it doesn't
* really specify _exactly_ what we need to validate
* ... so, just make it a sysctl tunable. */
- if (cipso_v4_rbm_strictvalid) {
+ if (READ_ONCE(cipso_v4_rbm_strictvalid)) {
if (cipso_v4_map_lvl_valid(doi_def,
tag[3]) < 0) {
err_offset = opt_iter + 3;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 48f337ccf949..ffd57523331f 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -44,10 +44,9 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
saddr = inet->mc_addr;
}
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
- RT_CONN_FLAGS(sk), oif,
- sk->sk_protocol,
- inet->inet_sport, usin->sin_port, sk);
+ rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif,
+ sk->sk_protocol, inet->inet_sport,
+ usin->sin_port, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 323e622ff9b7..b2366ad540e6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -104,6 +104,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_FLAGS] = { .type = NLA_U32 },
[IFA_RT_PRIORITY] = { .type = NLA_U32 },
[IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+ [IFA_PROTO] = { .type = NLA_U8 },
};
struct inet_fill_args {
@@ -243,7 +244,7 @@ void in_dev_finish_destroy(struct in_device *idev)
#ifdef NET_REFCNT_DEBUG
pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
- dev_put(dev);
+ dev_put_track(dev, &idev->dev_tracker);
if (!idev->dead)
pr_err("Freeing alive in_device %p\n", idev);
else
@@ -271,7 +272,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
dev_disable_lro(dev);
/* Reference in_dev->dev */
- dev_hold(dev);
+ dev_hold_track(dev, &in_dev->dev_tracker, GFP_KERNEL);
/* Account for reference dev->ip_ptr (below) */
refcount_set(&in_dev->refcnt, 1);
@@ -535,10 +536,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
return ret;
}
- if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
- prandom_seed((__force u32) ifa->ifa_local);
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY))
ifap = last_primary;
- }
rcu_assign_pointer(ifa->ifa_next, *ifap);
rcu_assign_pointer(*ifap, ifa);
@@ -889,6 +888,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
if (tb[IFA_RT_PRIORITY])
ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+ if (tb[IFA_PROTO])
+ ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]);
+
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
@@ -1625,6 +1627,7 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_BROADCAST */
+ nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(1) /* IFA_PROTO */
+ nla_total_size(4) /* IFA_RT_PRIORITY */
+ nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
@@ -1699,6 +1702,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ (ifa->ifa_proto &&
+ nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) ||
nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
(ifa->ifa_rt_priority &&
nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
@@ -2566,7 +2571,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
struct devinet_sysctl_table *t;
char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
- t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT);
if (!t)
goto out;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 851f542928a3..b21238df3301 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -455,6 +455,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
return err;
}
+ if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+ ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ goto cow;
+
if (!skb_cloned(skb)) {
if (tailen <= skb_tailroom(skb)) {
nfrags = 1;
@@ -671,7 +675,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
- padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached));
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -701,7 +705,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
static inline int esp_remove_trailer(struct sk_buff *skb)
{
struct xfrm_state *x = xfrm_input_state(skb);
- struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
int alen, hlen, elen;
int padlen, trimlen;
@@ -713,11 +716,6 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
elen = skb->len - hlen;
- if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
- ret = xo->proto;
- goto out;
- }
-
if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
BUG();
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 8e4e9aa12130..935026f4c807 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -16,6 +16,7 @@
#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <net/gro.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
@@ -109,8 +110,7 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
{
- __skb_push(skb, skb->mac_len);
- return skb_mac_gso_segment(skb, features);
+ return skb_eth_gso_segment(skb, features, htons(ETH_P_IP));
}
static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
@@ -159,6 +159,9 @@ static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
}
+ if (proto == IPPROTO_IPV6)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
+
__skb_pull(skb, skb_transport_offset(skb));
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4d61ddd8a0ec..f361d3d56be2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -32,6 +32,7 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -290,7 +291,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
struct flowi4 fl4 = {
.flowi4_iif = LOOPBACK_IFINDEX,
- .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
+ .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
.flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
.flowi4_scope = scope,
@@ -352,9 +353,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
- if (!fl4.flowi4_iif)
- fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
fl4.flowi4_tos = tos;
@@ -436,6 +436,9 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
if (net->ipv4.fib_has_custom_local_routes ||
fib4_has_custom_rules(net))
goto full_check;
+ /* Within the same container, it is regarded as a martian source,
+ * and the same host but different containers are not.
+ */
if (inet_lookup_ifaddr_rcu(net, src))
return -EINVAL;
@@ -735,8 +738,16 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
memset(cfg, 0, sizeof(*cfg));
rtm = nlmsg_data(nlh);
+
+ if (!inet_validate_dscp(rtm->rtm_tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ err = -EINVAL;
+ goto errout;
+ }
+ cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+
cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_tos = rtm->rtm_tos;
cfg->fc_table = rtm->rtm_table;
cfg->fc_protocol = rtm->rtm_protocol;
cfg->fc_scope = rtm->rtm_scope;
@@ -1112,9 +1123,11 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
return;
/* Add broadcast address, if it is explicitly assigned. */
- if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
prim, 0);
+ arp_invalidate(dev, ifa->ifa_broadcast, false);
+ }
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
@@ -1128,6 +1141,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_prefixlen < 31) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
32, prim, 0);
+ arp_invalidate(dev, prefix | ~mask, false);
}
}
}
@@ -1370,7 +1384,7 @@ static void nl_fib_input(struct sk_buff *skb)
return;
nlh = nlmsg_hdr(skb);
- frn = (struct fib_result_nl *) nlmsg_data(nlh);
+ frn = nlmsg_data(nlh);
nl_fib_lookup(net, frn);
portid = NETLINK_CB(skb).portid; /* netlink portid */
@@ -1411,7 +1425,7 @@ static void fib_disable_ip(struct net_device *dev, unsigned long event,
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct in_ifaddr *ifa = ptr;
struct net_device *dev = ifa->ifa_dev->dev;
struct net *net = dev_net(dev);
@@ -1547,7 +1561,7 @@ static void ip_fib_net_exit(struct net *net)
{
int i;
- rtnl_lock();
+ ASSERT_RTNL();
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@@ -1572,7 +1586,7 @@ static void ip_fib_net_exit(struct net *net)
#ifdef CONFIG_IP_MULTIPLE_TABLES
fib4_rules_exit(net);
#endif
- rtnl_unlock();
+
kfree(net->ipv4.fib_table_hash);
fib4_notifier_exit(net);
}
@@ -1599,7 +1613,9 @@ out:
out_proc:
nl_fib_lookup_exit(net);
out_nlfl:
+ rtnl_lock();
ip_fib_net_exit(net);
+ rtnl_unlock();
goto out;
}
@@ -1607,12 +1623,23 @@ static void __net_exit fib_net_exit(struct net *net)
{
fib_proc_exit(net);
nl_fib_lookup_exit(net);
- ip_fib_net_exit(net);
+}
+
+static void __net_exit fib_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ip_fib_net_exit(net);
+
+ rtnl_unlock();
}
static struct pernet_operations fib_net_ops = {
.init = fib_net_init,
.exit = fib_net_exit,
+ .exit_batch = fib_net_exit_batch,
};
void __init ip_fib_init(void)
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index e184bcb19943..f9b9e26c32c1 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -4,22 +4,22 @@
#include <linux/types.h>
#include <linux/list.h>
+#include <net/inet_dscp.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
struct fib_alias {
struct hlist_node fa_list;
struct fib_info *fa_info;
- u8 fa_tos;
+ dscp_t fa_dscp;
u8 fa_type;
u8 fa_state;
u8 fa_slen;
u32 tb_id;
s16 fa_default;
- u8 offload:1,
- trap:1,
- offload_failed:1,
- unused:5;
+ u8 offload;
+ u8 trap;
+ u8 offload_failed;
struct rcu_head rcu;
};
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index d279cb8ac158..513f475c6a53 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
@@ -35,7 +36,7 @@ struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
- u8 tos;
+ dscp_t dscp;
__be32 src;
__be32 srcmask;
__be32 dst;
@@ -49,7 +50,7 @@ static bool fib4_rule_matchall(const struct fib_rule *rule)
{
struct fib4_rule *r = container_of(rule, struct fib4_rule, common);
- if (r->dst_len || r->src_len || r->tos)
+ if (r->dst_len || r->src_len || r->dscp)
return false;
return fib_rule_matchall(rule);
}
@@ -144,7 +145,7 @@ INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule,
int flags,
struct fib_lookup_arg *arg)
{
- struct fib_result *result = (struct fib_result *) arg->result;
+ struct fib_result *result = arg->result;
struct net_device *dev = NULL;
if (result->fi) {
@@ -185,7 +186,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl4->flowi4_tos))
+ if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
@@ -216,11 +217,6 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
-static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
- FRA_GENERIC_POLICY,
- [FRA_FLOW] = { .type = NLA_U32 },
-};
-
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
@@ -230,10 +226,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ /* IPv4 currently doesn't handle high order DSCP bits correctly */
if (frh->tos & ~IPTOS_TOS_MASK) {
NL_SET_ERR_MSG(extack, "Invalid tos");
goto errout;
}
+ rule4->dscp = inet_dsfield_to_dscp(frh->tos);
/* split local/main if they are not already split */
err = fib_unmerge(net);
@@ -275,7 +278,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->srcmask = inet_make_mask(rule4->src_len);
rule4->dst_len = frh->dst_len;
rule4->dstmask = inet_make_mask(rule4->dst_len);
- rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
@@ -318,7 +320,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule4->dst_len != frh->dst_len))
return 0;
- if (frh->tos && (rule4->tos != frh->tos))
+ if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos)
return 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -342,7 +344,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
- frh->tos = rule4->tos;
+ frh->tos = inet_dscp_to_dsfield(rule4->dscp);
if ((rule4->dst_len &&
nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
@@ -386,7 +388,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
- .policy = fib4_rule_policy,
.owner = THIS_MODULE,
};
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index fde7797b5806..d9fdcbae16ee 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,8 +29,10 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/netlink.h>
+#include <linux/hash.h>
#include <net/arp.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -51,6 +53,7 @@ static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
+static unsigned int fib_info_hash_bits;
static unsigned int fib_info_cnt;
#define DEVINDEX_HASHBITS 8
@@ -208,7 +211,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
void fib_nh_common_release(struct fib_nh_common *nhc)
{
- dev_put(nhc->nhc_dev);
+ dev_put_track(nhc->nhc_dev, &nhc->nhc_dev_tracker);
lwtstate_put(nhc->nhc_lwtstate);
rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
rt_fibinfo_free(&nhc->nhc_rth_input);
@@ -249,7 +252,6 @@ void free_fib_info(struct fib_info *fi)
pr_warn("Freeing alive fib_info %p\n", fi);
return;
}
- fib_info_cnt--;
call_rcu(&fi->rcu, free_fib_info_rcu);
}
@@ -260,6 +262,10 @@ void fib_release_info(struct fib_info *fi)
spin_lock_bh(&fib_info_lock);
if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
hlist_del(&fi->fib_hash);
+
+ /* Paired with READ_ONCE() in fib_create_info(). */
+ WRITE_ONCE(fib_info_cnt, fib_info_cnt - 1);
+
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
if (fi->nh) {
@@ -316,11 +322,15 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
- unsigned int mask = DEVINDEX_HASHSIZE - 1;
+ return hash_32(val, DEVINDEX_HASHBITS);
+}
+
+static struct hlist_head *
+fib_info_devhash_bucket(const struct net_device *dev)
+{
+ u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex;
- return (val ^
- (val >> DEVINDEX_HASHBITS) ^
- (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+ return &fib_info_devhash[fib_devindex_hashfn(val)];
}
static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
@@ -430,12 +440,11 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
struct fib_nh *nh;
- unsigned int hash;
spin_lock(&fib_info_lock);
- hash = fib_devindex_hashfn(dev->ifindex);
- head = &fib_info_devhash[hash];
+ head = fib_info_devhash_bucket(dev);
+
hlist_for_each_entry(nh, head, nh_hash) {
if (nh->fib_nh_dev == dev &&
nh->fib_nh_gw4 == gw &&
@@ -515,11 +524,11 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
fri.tb_id = tb_id;
fri.dst = key;
fri.dst_len = dst_len;
- fri.tos = fa->fa_tos;
+ fri.dscp = fa->fa_dscp;
fri.type = fa->fa_type;
- fri.offload = fa->offload;
- fri.trap = fa->trap;
- fri.offload_failed = fa->offload_failed;
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed = READ_ONCE(fa->offload_failed);
err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -662,6 +671,19 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
return nhs;
}
+static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ if (nla_len(nla) < sizeof(*gw)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+
+ *gw = nla_get_in_addr(nla);
+
+ return 0;
+}
+
/* only called when fib_nh is integrated into fib_info */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg,
@@ -704,7 +726,11 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
return -EINVAL;
}
if (nla) {
- fib_cfg.fc_gw4 = nla_get_in_addr(nla);
+ ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla,
+ extack);
+ if (ret)
+ goto errout;
+
if (fib_cfg.fc_gw4)
fib_cfg.fc_gw_family = AF_INET;
} else if (nlav) {
@@ -714,10 +740,18 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
}
nla = nla_find(attrs, attrlen, RTA_FLOW);
- if (nla)
+ if (nla) {
+ if (nla_len(nla) < sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+ return -EINVAL;
+ }
fib_cfg.fc_flow = nla_get_u32(nla);
+ }
fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ /* RTA_ENCAP_TYPE length checked in
+ * lwtunnel_valid_encap_type_attr
+ */
nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla)
fib_cfg.fc_encap_type = nla_get_u16(nla);
@@ -855,8 +889,13 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
}
if (cfg->fc_oif || cfg->fc_gw_family) {
- struct fib_nh *nh = fib_info_nh(fi, 0);
+ struct fib_nh *nh;
+
+ /* cannot match on nexthop object attributes */
+ if (fi->nh)
+ return 1;
+ nh = fib_info_nh(fi, 0);
if (cfg->fc_encap) {
if (fib_encap_match(net, cfg->fc_encap_type,
cfg->fc_encap, nh, cfg, extack))
@@ -902,6 +941,7 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
+ int err;
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
nlav = nla_find(attrs, attrlen, RTA_VIA);
@@ -912,12 +952,17 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
}
if (nla) {
+ __be32 gw;
+
+ err = fib_gw_from_attr(&gw, nla, extack);
+ if (err)
+ return err;
+
if (nh->fib_nh_gw_family != AF_INET ||
- nla_get_in_addr(nla) != nh->fib_nh_gw4)
+ gw != nh->fib_nh_gw4)
return 1;
} else if (nlav) {
struct fib_config cfg2;
- int err;
err = fib_gw_from_via(&cfg2, nlav, extack);
if (err)
@@ -940,8 +985,14 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
- if (nla && nla_get_u32(nla) != nh->nh_tclassid)
- return 1;
+ if (nla) {
+ if (nla_len(nla) < sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+ return -EINVAL;
+ }
+ if (nla_get_u32(nla) != nh->nh_tclassid)
+ return 1;
+ }
#endif
}
@@ -1006,7 +1057,7 @@ static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
if (!err) {
nh->fib_nh_dev = fib6_nh.fib_nh_dev;
- dev_hold(nh->fib_nh_dev);
+ dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL);
nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
nh->fib_nh_scope = RT_SCOPE_LINK;
@@ -1090,7 +1141,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
if (!netif_carrier_ok(dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
nh->fib_nh_dev = dev;
- dev_hold(dev);
+ dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
nh->fib_nh_scope = RT_SCOPE_LINK;
return 0;
}
@@ -1144,7 +1195,7 @@ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
"No egress device for nexthop gateway");
goto out;
}
- dev_hold(dev);
+ dev_hold_track(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
if (!netif_carrier_ok(dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
@@ -1178,8 +1229,8 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
}
nh->fib_nh_dev = in_dev->dev;
- dev_hold(nh->fib_nh_dev);
- nh->fib_nh_scope = RT_SCOPE_HOST;
+ dev_hold_track(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ nh->fib_nh_scope = RT_SCOPE_LINK;
if (!netif_carrier_ok(nh->fib_nh_dev))
nh->fib_nh_flags |= RTNH_F_LINKDOWN;
err = 0;
@@ -1203,34 +1254,13 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
return err;
}
-static inline unsigned int fib_laddr_hashfn(__be32 val)
-{
- unsigned int mask = (fib_info_hash_size - 1);
-
- return ((__force u32)val ^
- ((__force u32)val >> 7) ^
- ((__force u32)val >> 14)) & mask;
-}
-
-static struct hlist_head *fib_info_hash_alloc(int bytes)
-{
- if (bytes <= PAGE_SIZE)
- return kzalloc(bytes, GFP_KERNEL);
- else
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(bytes));
-}
-
-static void fib_info_hash_free(struct hlist_head *hash, int bytes)
+static struct hlist_head *
+fib_info_laddrhash_bucket(const struct net *net, __be32 val)
{
- if (!hash)
- return;
+ u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val,
+ fib_info_hash_bits);
- if (bytes <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long) hash, get_order(bytes));
+ return &fib_info_laddrhash[slot];
}
static void fib_info_hash_move(struct hlist_head *new_info_hash,
@@ -1239,12 +1269,13 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
{
struct hlist_head *old_info_hash, *old_laddrhash;
unsigned int old_size = fib_info_hash_size;
- unsigned int i, bytes;
+ unsigned int i;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
old_laddrhash = fib_info_laddrhash;
fib_info_hash_size = new_size;
+ fib_info_hash_bits = ilog2(new_size);
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
@@ -1262,27 +1293,25 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash,
}
fib_info_hash = new_info_hash;
+ fib_info_laddrhash = new_laddrhash;
for (i = 0; i < old_size; i++) {
- struct hlist_head *lhead = &fib_info_laddrhash[i];
+ struct hlist_head *lhead = &old_laddrhash[i];
struct hlist_node *n;
struct fib_info *fi;
hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
struct hlist_head *ldest;
- unsigned int new_hash;
- new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
- ldest = &new_laddrhash[new_hash];
+ ldest = fib_info_laddrhash_bucket(fi->fib_net,
+ fi->fib_prefsrc);
hlist_add_head(&fi->fib_lhash, ldest);
}
}
- fib_info_laddrhash = new_laddrhash;
spin_unlock_bh(&fib_info_lock);
- bytes = old_size * sizeof(struct hlist_head *);
- fib_info_hash_free(old_info_hash, bytes);
- fib_info_hash_free(old_laddrhash, bytes);
+ kvfree(old_info_hash);
+ kvfree(old_laddrhash);
}
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
@@ -1393,23 +1422,25 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
#endif
err = -ENOBUFS;
- if (fib_info_cnt >= fib_info_hash_size) {
+
+ /* Paired with WRITE_ONCE() in fib_release_info() */
+ if (READ_ONCE(fib_info_cnt) >= fib_info_hash_size) {
unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
- unsigned int bytes;
+ size_t bytes;
if (!new_size)
new_size = 16;
- bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_info_hash_alloc(bytes);
- new_laddrhash = fib_info_hash_alloc(bytes);
+ bytes = (size_t)new_size * sizeof(struct hlist_head *);
+ new_info_hash = kvzalloc(bytes, GFP_KERNEL);
+ new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
if (!new_info_hash || !new_laddrhash) {
- fib_info_hash_free(new_info_hash, bytes);
- fib_info_hash_free(new_laddrhash, bytes);
- } else
+ kvfree(new_info_hash);
+ kvfree(new_laddrhash);
+ } else {
fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
-
+ }
if (!fib_info_hash_size)
goto failure;
}
@@ -1425,7 +1456,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
return ERR_PTR(err);
}
- fib_info_cnt++;
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
fi->fib_scope = cfg->fc_scope;
@@ -1508,6 +1538,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
err = -ENODEV;
if (!nh->fib_nh_dev)
goto failure;
+ netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
+ GFP_KERNEL);
} else {
int linkdown = 0;
@@ -1552,12 +1584,13 @@ link_it:
refcount_set(&fi->fib_treeref, 1);
refcount_set(&fi->fib_clntref, 1);
spin_lock_bh(&fib_info_lock);
+ fib_info_cnt++;
hlist_add_head(&fi->fib_hash,
&fib_info_hash[fib_info_hashfn(fi)]);
if (fi->fib_prefsrc) {
struct hlist_head *head;
- head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+ head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc);
hlist_add_head(&fi->fib_lhash, head);
}
if (fi->nh) {
@@ -1565,12 +1598,10 @@ link_it:
} else {
change_nexthops(fi) {
struct hlist_head *head;
- unsigned int hash;
if (!nexthop_nh->fib_nh_dev)
continue;
- hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
- head = &fib_info_devhash[hash];
+ head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev);
hlist_add_head(&nexthop_nh->nh_hash, head);
} endfor_nexthops(fi)
}
@@ -1750,7 +1781,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
rtm->rtm_family = AF_INET;
rtm->rtm_dst_len = fri->dst_len;
rtm->rtm_src_len = 0;
- rtm->rtm_tos = fri->tos;
+ rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp);
if (tb_id < 256)
rtm->rtm_table = tb_id;
else
@@ -1780,7 +1811,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
if (nexthop_is_blackhole(fi->nh))
rtm->rtm_type = RTN_BLACKHOLE;
- if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode)
+ if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode))
goto offload;
}
@@ -1831,16 +1862,16 @@ nla_put_failure:
*/
int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
- int ret = 0;
- unsigned int hash = fib_laddr_hashfn(local);
- struct hlist_head *head = &fib_info_laddrhash[hash];
int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
struct net *net = dev_net(dev);
+ struct hlist_head *head;
struct fib_info *fi;
+ int ret = 0;
if (!fib_info_laddrhash || local == 0)
return 0;
+ head = fib_info_laddrhash_bucket(net, local);
hlist_for_each_entry(fi, head, fib_lhash) {
if (!net_eq(fi->fib_net, net) ||
fi->fib_tb_id != tb_id)
@@ -1922,8 +1953,7 @@ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
- unsigned int hash = fib_devindex_hashfn(dev->ifindex);
- struct hlist_head *head = &fib_info_devhash[hash];
+ struct hlist_head *head = fib_info_devhash_bucket(dev);
struct fib_nh *nh;
hlist_for_each_entry(nh, head, nh_hash) {
@@ -1942,12 +1972,11 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
*/
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
- int ret = 0;
- int scope = RT_SCOPE_NOWHERE;
+ struct hlist_head *head = fib_info_devhash_bucket(dev);
struct fib_info *prev_fi = NULL;
- unsigned int hash = fib_devindex_hashfn(dev->ifindex);
- struct hlist_head *head = &fib_info_devhash[hash];
+ int scope = RT_SCOPE_NOWHERE;
struct fib_nh *nh;
+ int ret = 0;
if (force)
scope = -1;
@@ -2016,7 +2045,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
int order = -1, last_idx = -1;
struct fib_alias *fa, *fa1 = NULL;
u32 last_prio = res->fi->fib_priority;
- u8 last_tos = 0;
+ dscp_t last_dscp = 0;
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
@@ -2024,19 +2053,20 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
if (fa->fa_slen != slen)
continue;
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp &&
+ fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos))
continue;
if (fa->tb_id != tb->tb_id)
continue;
if (next_fi->fib_priority > last_prio &&
- fa->fa_tos == last_tos) {
- if (last_tos)
+ fa->fa_dscp == last_dscp) {
+ if (last_dscp)
continue;
break;
}
if (next_fi->fib_flags & RTNH_F_DEAD)
continue;
- last_tos = fa->fa_tos;
+ last_dscp = fa->fa_dscp;
last_prio = next_fi->fib_priority;
if (next_fi->fib_scope != res->scope ||
@@ -2092,7 +2122,6 @@ out:
int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
struct fib_info *prev_fi;
- unsigned int hash;
struct hlist_head *head;
struct fib_nh *nh;
int ret;
@@ -2108,8 +2137,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
}
prev_fi = NULL;
- hash = fib_devindex_hashfn(dev->ifindex);
- head = &fib_info_devhash[hash];
+ head = fib_info_devhash_bucket(dev);
ret = 0;
hlist_for_each_entry(nh, head, nh_hash) {
@@ -2211,7 +2239,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
void fib_select_path(struct net *net, struct fib_result *res,
struct flowi4 *fl4, const struct sk_buff *skb)
{
- if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF))
+ if (fl4->flowi4_oif)
goto check_saddr;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 8060524f4256..46e8a5125853 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -61,6 +61,7 @@
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -81,7 +82,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .dscp = fa->fa_dscp,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
@@ -98,7 +99,7 @@ static int call_fib_entry_notifiers(struct net *net,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .dscp = fa->fa_dscp,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
@@ -497,7 +498,7 @@ static void tnode_free(struct key_vector *tn)
tn = container_of(head, struct tnode, rcu)->kv;
}
- if (tnode_free_size >= sysctl_fib_sync_mem) {
+ if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
tnode_free_size = 0;
synchronize_rcu();
}
@@ -973,13 +974,13 @@ static struct key_vector *fib_find_node(struct trie *t,
return n;
}
-/* Return the first fib alias matching TOS with
+/* Return the first fib alias matching DSCP with
* priority less than or equal to PRIO.
* If 'find_first' is set, return the first matching
- * fib alias, regardless of TOS and priority.
+ * fib alias, regardless of DSCP and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
- u8 tos, u32 prio, u32 tb_id,
+ dscp_t dscp, u32 prio, u32 tb_id,
bool find_first)
{
struct fib_alias *fa;
@@ -988,6 +989,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
return NULL;
hlist_for_each_entry(fa, fah, fa_list) {
+ /* Avoid Sparse warning when using dscp_t in inequalities */
+ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
+ u8 __dscp = inet_dscp_to_dsfield(dscp);
+
if (fa->fa_slen < slen)
continue;
if (fa->fa_slen != slen)
@@ -998,9 +1003,9 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
break;
if (find_first)
return fa;
- if (fa->fa_tos > tos)
+ if (__fa_dscp > __dscp)
continue;
- if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+ if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
return fa;
}
@@ -1027,7 +1032,7 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
- fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
+ fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi &&
fa->fa_type == fri->type)
return fa;
}
@@ -1047,19 +1052,23 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
if (!fa_match)
goto out;
- if (fa_match->offload == fri->offload && fa_match->trap == fri->trap &&
- fa_match->offload_failed == fri->offload_failed)
+ /* These are paired with the WRITE_ONCE() happening in this function.
+ * The reason is that we are only protected by RCU at this point.
+ */
+ if (READ_ONCE(fa_match->offload) == fri->offload &&
+ READ_ONCE(fa_match->trap) == fri->trap &&
+ READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
goto out;
- fa_match->offload = fri->offload;
- fa_match->trap = fri->trap;
+ WRITE_ONCE(fa_match->offload, fri->offload);
+ WRITE_ONCE(fa_match->trap, fri->trap);
/* 2 means send notifications only if offload_failed was changed. */
if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 &&
- fa_match->offload_failed == fri->offload_failed)
+ READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
goto out;
- fa_match->offload_failed = fri->offload_failed;
+ WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
if (!net->ipv4.sysctl_fib_notify_on_flag_change)
goto out;
@@ -1210,7 +1219,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
int err;
@@ -1227,12 +1236,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto err;
}
+ dscp = cfg->fc_dscp;
l = fib_find_node(t, &tp, key);
- fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
+ * with the same keys [prefix,dscp,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
@@ -1240,7 +1250,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
* of the new alias.
*/
- if (fa && fa->fa_tos == tos &&
+ if (fa && fa->fa_dscp == dscp &&
fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_first, *fa_match;
@@ -1260,7 +1270,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
hlist_for_each_entry_from(fa, fa_list) {
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
@@ -1288,7 +1298,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
fi_drop = fa->fa_info;
- new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
@@ -1351,7 +1361,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
+ new_fa->fa_dscp = dscp;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
new_fa->fa_slen = slen;
@@ -1419,11 +1429,8 @@ bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
return false;
- if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
- if (flp->flowi4_oif &&
- flp->flowi4_oif != nhc->nhc_oif)
- return false;
- }
+ if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif)
+ return false;
return true;
}
@@ -1567,7 +1574,8 @@ found:
if (index >= (1ul << fa->fa_slen))
continue;
}
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp &&
+ inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
continue;
if (fi->fib_dead)
continue;
@@ -1703,7 +1711,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
struct key_vector *l, *tp;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
key = ntohl(cfg->fc_dst);
@@ -1715,11 +1723,13 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!l)
return -ESRCH;
- fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
+ dscp = cfg->fc_dscp;
+ fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
- pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+ pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
+ inet_dscp_to_dsfield(dscp), t);
fa_to_delete = NULL;
hlist_for_each_entry_from(fa, fa_list) {
@@ -1727,7 +1737,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -2295,11 +2305,11 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
fri.tb_id = tb->tb_id;
fri.dst = xkey;
fri.dst_len = KEYLENGTH - fa->fa_slen;
- fri.tos = fa->fa_tos;
+ fri.dscp = fa->fa_dscp;
fri.type = fa->fa_type;
- fri.offload = fa->offload;
- fri.trap = fa->trap;
- fri.offload_failed = fa->offload_failed;
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed = READ_ONCE(fa->offload_failed);
err = fib_dump_info(skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2615,7 +2625,7 @@ static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
- struct net *net = (struct net *)seq->private;
+ struct net *net = seq->private;
unsigned int h;
seq_printf(seq,
@@ -2807,8 +2817,9 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
fa->fa_info->fib_scope),
rtn_type(buf2, sizeof(buf2),
fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, " tos=%d", fa->fa_tos);
+ if (fa->fa_dscp)
+ seq_printf(seq, " tos=%d",
+ inet_dscp_to_dsfield(fa->fa_dscp));
seq_putc(seq, '\n');
}
}
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 8fcbc6258ec5..025a33c1b04d 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -9,13 +9,13 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <net/genetlink.h>
+#include <net/gro.h>
#include <net/gue.h>
#include <net/fou.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
-#include <net/xfrm.h>
#include <uapi/linux/fou.h>
#include <uapi/linux/genetlink.h>
@@ -246,17 +246,14 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
-out_unlock:
- rcu_read_unlock();
-
+out:
return pp;
}
@@ -268,19 +265,16 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
const struct net_offload *ops;
int err = -ENOSYS;
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
err = ops->callbacks.gro_complete(skb, nhoff);
skb_set_inner_mac_header(skb, nhoff);
-out_unlock:
- rcu_read_unlock();
-
+out:
return err;
}
@@ -438,17 +432,14 @@ next_proto:
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
- goto out_unlock;
+ goto out;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
flush = 0;
-out_unlock:
- rcu_read_unlock();
out:
skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
@@ -485,18 +476,16 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
return err;
}
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
skb_set_inner_mac_header(skb, nhoff + guehlen);
-out_unlock:
- rcu_read_unlock();
+out:
return err;
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 1121a9d5fed9..07073fa35205 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <net/protocol.h>
#include <net/gre.h>
+#include <net/gro.h>
static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
@@ -162,10 +163,9 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
type = greh->protocol;
- rcu_read_lock();
ptype = gro_find_receive_by_type(type);
if (!ptype)
- goto out_unlock;
+ goto out;
grehlen = GRE_HEADER_SECTION;
@@ -179,13 +179,13 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
if (skb_gro_header_hard(skb, hlen)) {
greh = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!greh))
- goto out_unlock;
+ goto out;
}
/* Don't bother verifying checksum if we're going to flush anyway. */
if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
if (skb_gro_checksum_simple_validate(skb))
- goto out_unlock;
+ goto out;
skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
null_compute_pseudo);
@@ -229,8 +229,6 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
flush = 0;
-out_unlock:
- rcu_read_unlock();
out:
skb_gro_flush_final(skb, pp, flush);
@@ -255,13 +253,10 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
if (greh->flags & GRE_CSUM)
grehlen += GRE_HEADER_SECTION;
- rcu_read_lock();
ptype = gro_find_complete_by_type(type);
if (ptype)
err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
- rcu_read_unlock();
-
skb_set_inner_mac_header(skb, nhoff + grehlen);
return err;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index b7e277d8a84d..57c4f0d87a7a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -186,30 +186,20 @@ EXPORT_SYMBOL(icmp_err_convert);
*/
struct icmp_control {
- bool (*handler)(struct sk_buff *skb);
+ enum skb_drop_reason (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmp_sk(struct net *net)
-{
- return this_cpu_read(*net->ipv4.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
struct sock *sk;
- sk = icmp_sk(net);
+ sk = this_cpu_read(ipv4_icmp_sk);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
@@ -217,11 +207,13 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
*/
return NULL;
}
+ sock_net_set(sk, net);
return sk;
}
static inline void icmp_xmit_unlock(struct sock *sk)
{
+ sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
@@ -261,11 +253,12 @@ bool icmp_global_allow(void)
spin_lock(&icmp_global.lock);
delta = min_t(u32, now - icmp_global.stamp, HZ);
if (delta >= HZ / 50) {
- incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
+ incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
if (incr)
WRITE_ONCE(icmp_global.stamp, now);
}
- credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
+ credit = min_t(u32, icmp_global.credit + incr,
+ READ_ONCE(sysctl_icmp_msgs_burst));
if (credit) {
/* We want to use a credit of one in average, but need to randomize
* it for security reasons.
@@ -289,7 +282,7 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code)
return true;
/* Limit if icmp type is enabled in ratemask. */
- if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ if (!((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask)))
return true;
return false;
@@ -327,7 +320,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
vif = l3mdev_master_ifindex(dst->dev);
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
- rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+ rc = inet_peer_xrlim_allow(peer,
+ READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
if (peer)
inet_putpeer(peer);
out:
@@ -350,7 +344,7 @@ void icmp_out_count(struct net *net, unsigned char type)
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
struct sk_buff *skb)
{
- struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ struct icmp_bxm *icmp_param = from;
__wsum csum;
csum = skb_copy_and_csum_bits(icmp_param->skb,
@@ -363,14 +357,13 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
return 0;
}
-static void icmp_push_reply(struct icmp_bxm *icmp_param,
+static void icmp_push_reply(struct sock *sk,
+ struct icmp_bxm *icmp_param,
struct flowi4 *fl4,
struct ipcm_cookie *ipc, struct rtable **rt)
{
- struct sock *sk;
struct sk_buff *skb;
- sk = icmp_sk(dev_net((*rt)->dst.dev));
if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
@@ -452,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
if (IS_ERR(rt))
goto out_unlock;
if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
- icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
@@ -701,7 +694,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_lock();
if (rt_is_input_route(rt) &&
- net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
+ READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
if (dev)
@@ -766,7 +759,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (!fl4.saddr)
fl4.saddr = htonl(INADDR_DUMMY);
- icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
@@ -848,8 +841,9 @@ static bool icmp_tag_validation(int proto)
* ICMP_PARAMETERPROB.
*/
-static bool icmp_unreach(struct sk_buff *skb)
+static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
const struct iphdr *iph;
struct icmphdr *icmph;
struct net *net;
@@ -869,8 +863,10 @@ static bool icmp_unreach(struct sk_buff *skb)
icmph = icmp_hdr(skb);
iph = (const struct iphdr *)skb->data;
- if (iph->ihl < 5) /* Mangled header, drop. */
+ if (iph->ihl < 5) { /* Mangled header, drop. */
+ reason = SKB_DROP_REASON_IP_INHDR;
goto out_err;
+ }
switch (icmph->type) {
case ICMP_DEST_UNREACH:
@@ -938,7 +934,7 @@ static bool icmp_unreach(struct sk_buff *skb)
* get the other vendor to fix their kit.
*/
- if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_ignore_bogus_error_responses) &&
inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
&ip_hdr(skb)->saddr,
@@ -950,10 +946,10 @@ static bool icmp_unreach(struct sk_buff *skb)
icmp_socket_deliver(skb, info);
out:
- return true;
+ return reason;
out_err:
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return false;
+ return reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
}
@@ -961,20 +957,20 @@ out_err:
* Handle ICMP_REDIRECT.
*/
-static bool icmp_redirect(struct sk_buff *skb)
+static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
- return false;
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
}
if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
/* there aught to be a stat */
- return false;
+ return SKB_DROP_REASON_NOMEM;
}
icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway));
- return true;
+ return SKB_NOT_DROPPED_YET;
}
/*
@@ -991,15 +987,15 @@ static bool icmp_redirect(struct sk_buff *skb)
* See also WRT handling of options once they are done and working.
*/
-static bool icmp_echo(struct sk_buff *skb)
+static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
{
struct icmp_bxm icmp_param;
struct net *net;
net = dev_net(skb_dst(skb)->dev);
/* should there be an ICMP stat for ignored echos? */
- if (net->ipv4.sysctl_icmp_echo_ignore_all)
- return true;
+ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
+ return SKB_NOT_DROPPED_YET;
icmp_param.data.icmph = *icmp_hdr(skb);
icmp_param.skb = skb;
@@ -1010,10 +1006,10 @@ static bool icmp_echo(struct sk_buff *skb)
if (icmp_param.data.icmph.type == ICMP_ECHO)
icmp_param.data.icmph.type = ICMP_ECHOREPLY;
else if (!icmp_build_probe(skb, &icmp_param.data.icmph))
- return true;
+ return SKB_NOT_DROPPED_YET;
icmp_reply(&icmp_param, skb);
- return true;
+ return SKB_NOT_DROPPED_YET;
}
/* Helper for icmp_echo and icmpv6_echo_reply.
@@ -1033,7 +1029,7 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
u16 ident_len;
u8 status;
- if (!net->ipv4.sysctl_icmp_echo_enable_probe)
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
return false;
/* We currently only support probing interfaces on the proxy node
@@ -1131,7 +1127,7 @@ EXPORT_SYMBOL_GPL(icmp_build_probe);
* MUST be accurate to a few minutes.
* MUST be updated at least at 15Hz.
*/
-static bool icmp_timestamp(struct sk_buff *skb)
+static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
{
struct icmp_bxm icmp_param;
/*
@@ -1156,17 +1152,17 @@ static bool icmp_timestamp(struct sk_buff *skb)
icmp_param.data_len = 0;
icmp_param.head_len = sizeof(struct icmphdr) + 12;
icmp_reply(&icmp_param, skb);
- return true;
+ return SKB_NOT_DROPPED_YET;
out_err:
__ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
- return false;
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
}
-static bool icmp_discard(struct sk_buff *skb)
+static enum skb_drop_reason icmp_discard(struct sk_buff *skb)
{
/* pretend it was a success */
- return true;
+ return SKB_NOT_DROPPED_YET;
}
/*
@@ -1174,18 +1170,20 @@ static bool icmp_discard(struct sk_buff *skb)
*/
int icmp_rcv(struct sk_buff *skb)
{
- struct icmphdr *icmph;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct rtable *rt = skb_rtable(skb);
struct net *net = dev_net(rt->dst.dev);
- bool success;
+ struct icmphdr *icmph;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
int nh;
if (!(sp && sp->xvec[sp->len - 1]->props.flags &
- XFRM_STATE_ICMP))
+ XFRM_STATE_ICMP)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
goto drop;
@@ -1193,8 +1191,11 @@ int icmp_rcv(struct sk_buff *skb)
nh = skb_network_offset(skb);
skb_set_network_header(skb, sizeof(*icmph));
- if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN,
+ skb)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
skb_set_network_header(skb, nh);
}
@@ -1216,13 +1217,13 @@ int icmp_rcv(struct sk_buff *skb)
/* We can't use icmp_pointers[].handler() because it is an array of
* size NR_ICMP_TYPES + 1 (19 elements) and PROBE has code 42.
*/
- success = icmp_echo(skb);
- goto success_check;
+ reason = icmp_echo(skb);
+ goto reason_check;
}
if (icmph->type == ICMP_EXT_ECHOREPLY) {
- success = ping_rcv(skb);
- goto success_check;
+ reason = ping_rcv(skb);
+ goto reason_check;
}
/*
@@ -1231,8 +1232,10 @@ int icmp_rcv(struct sk_buff *skb)
* RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
* discarded.
*/
- if (icmph->type > NR_ICMP_TYPES)
+ if (icmph->type > NR_ICMP_TYPES) {
+ reason = SKB_DROP_REASON_UNHANDLED_PROTO;
goto error;
+ }
/*
* Parse the ICMP message
@@ -1247,28 +1250,31 @@ int icmp_rcv(struct sk_buff *skb)
*/
if ((icmph->type == ICMP_ECHO ||
icmph->type == ICMP_TIMESTAMP) &&
- net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
+ READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_broadcasts)) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
goto error;
}
if (icmph->type != ICMP_ECHO &&
icmph->type != ICMP_TIMESTAMP &&
icmph->type != ICMP_ADDRESS &&
icmph->type != ICMP_ADDRESSREPLY) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
goto error;
}
}
- success = icmp_pointers[icmph->type].handler(skb);
-success_check:
- if (success) {
+ reason = icmp_pointers[icmph->type].handler(skb);
+reason_check:
+ if (!reason) {
consume_skb(skb);
return NET_RX_SUCCESS;
}
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
csum_error:
+ reason = SKB_DROP_REASON_ICMP_CSUM;
__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
@@ -1434,46 +1440,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
},
};
-static void __net_exit icmp_sk_exit(struct net *net)
-{
- int i;
-
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
- free_percpu(net->ipv4.icmp_sk);
- net->ipv4.icmp_sk = NULL;
-}
-
static int __net_init icmp_sk_init(struct net *net)
{
- int i, err;
-
- net->ipv4.icmp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.icmp_sk)
- return -ENOMEM;
-
- for_each_possible_cpu(i) {
- struct sock *sk;
-
- err = inet_ctl_sock_create(&sk, PF_INET,
- SOCK_RAW, IPPROTO_ICMP, net);
- if (err < 0)
- goto fail;
-
- *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
-
- /* Enough space for 2 64K ICMP packets, including
- * sk_buff/skb_shared_info struct overhead.
- */
- sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
-
- /*
- * Speedup sock_wfree()
- */
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
- }
-
/* Control parameters for ECHO replies. */
net->ipv4.sysctl_icmp_echo_ignore_all = 0;
net->ipv4.sysctl_icmp_echo_enable_probe = 0;
@@ -1499,18 +1467,36 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
return 0;
-
-fail:
- icmp_sk_exit(net);
- return err;
}
static struct pernet_operations __net_initdata icmp_sk_ops = {
.init = icmp_sk_init,
- .exit = icmp_sk_exit,
};
int __init icmp_init(void)
{
+ int err, i;
+
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, &init_net);
+ if (err < 0)
+ return err;
+
+ per_cpu(ipv4_icmp_sk, i) = sk;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff/skb_shared_info struct overhead.
+ */
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+ /*
+ * Speedup sock_wfree()
+ */
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+ }
return register_pernet_subsys(&icmp_sk_ops);
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d2e2b3d18c66..b65d074d9620 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2403,9 +2403,10 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
/* decrease mem now to avoid the memleak warning */
atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
&sk->sk_omem_alloc);
- kfree_rcu(psl, rcu);
}
rcu_assign_pointer(pmc->sflist, newpsl);
+ if (psl)
+ kfree_rcu(psl, rcu);
psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
@@ -2507,11 +2508,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
/* decrease mem now to avoid the memleak warning */
atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
&sk->sk_omem_alloc);
- kfree_rcu(psl, rcu);
- } else
+ } else {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
+ }
rcu_assign_pointer(pmc->sflist, newpsl);
+ if (psl)
+ kfree_rcu(psl, rcu);
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
@@ -2558,7 +2561,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
if (!psl) {
- len = 0;
count = 0;
} else {
count = psl->sl_count;
@@ -2837,7 +2839,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
seq_puts(seq,
"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
else {
- struct ip_mc_list *im = (struct ip_mc_list *)v;
+ struct ip_mc_list *im = v;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
char *querier;
long delta;
@@ -2981,7 +2983,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
{
- struct ip_sf_list *psf = (struct ip_sf_list *)v;
+ struct ip_sf_list *psf = v;
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (v == SEQ_START_TOKEN) {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 62a67fdc344c..53f5f956d948 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -155,10 +155,14 @@ static int inet_csk_bind_conflict(const struct sock *sk,
*/
sk_for_each_bound(sk2, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ int bound_dev_if2;
+
+ if (sk == sk2)
+ continue;
+ bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
+ if ((!sk->sk_bound_dev_if ||
+ !bound_dev_if2 ||
+ sk->sk_bound_dev_if == bound_dev_if2)) {
if (reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
if ((!relax ||
@@ -866,12 +870,9 @@ static void reqsk_timer_handler(struct timer_list *t)
(!resend ||
!inet_rtx_syn_ack(sk_listener, req) ||
inet_rsk(req)->acked)) {
- unsigned long timeo;
-
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
- timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
- mod_timer(&req->rsk_timer, jiffies + timeo);
+ mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));
if (!nreq)
return;
@@ -1035,7 +1036,7 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, int backlog)
+int inet_csk_listen_start(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
@@ -1046,6 +1047,9 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
+ if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
+ sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c8fa6e7f7d12..b812eb36f0e3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -261,6 +261,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_state = sk->sk_state;
r->idiag_timer = 0;
r->idiag_retrans = 0;
+ r->idiag_expires = 0;
if (inet_diag_msg_attrs_fill(sk, skb, r, ext,
sk_user_ns(NETLINK_CB(cb->skb).sk),
@@ -314,9 +315,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_retrans = icsk->icsk_probes_out;
r->idiag_expires =
jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
- } else {
- r->idiag_timer = 0;
- r->idiag_expires = 0;
}
if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
@@ -1030,12 +1028,13 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
goto skip_listen_ht;
- for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+ for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
num = 0;
- ilb = &hashinfo->listening_hash[i];
+ ilb = &hashinfo->lhash2[i];
+
spin_lock(&ilb->lock);
sk_nulls_for_each(sk, node, &ilb->nulls_head) {
struct inet_sock *inet = inet_sk(sk);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 05cd198d7a6b..c9f9ac5013a7 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -235,9 +235,9 @@ void inet_frag_kill(struct inet_frag_queue *fq)
/* The RCU read lock provides a memory barrier
* guaranteeing that if fqdir->dead is false then
* the hash table destruction will not start until
- * after we unlock. Paired with inet_frags_exit_net().
+ * after we unlock. Paired with fqdir_pre_exit().
*/
- if (!fqdir->dead) {
+ if (!READ_ONCE(fqdir->dead)) {
rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
fqdir->f->rhash_params);
refcount_dec(&fq->refcnt);
@@ -352,9 +352,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
+ /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
+ long high_thresh = READ_ONCE(fqdir->high_thresh);
struct inet_frag_queue *fq = NULL, *prev;
- if (!fqdir->high_thresh || frag_mem_limit(fqdir) > fqdir->high_thresh)
+ if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
return NULL;
rcu_read_lock();
@@ -508,7 +510,7 @@ EXPORT_SYMBOL(inet_frag_reasm_prepare);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
void *reasm_data, bool try_coalesce)
{
- struct sk_buff **nextp = (struct sk_buff **)reasm_data;
+ struct sk_buff **nextp = reasm_data;
struct rb_node *rbn;
struct sk_buff *fp;
int sum_truesize;
@@ -570,6 +572,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
skb_mark_not_on_list(head);
head->prev = NULL;
head->tstamp = q->stamp;
+ head->mono_delivery_time = q->mono_delivery_time;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 75737267746f..b9d995b5ce24 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -193,42 +193,6 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
return inet_lhash2_bucket(h, hash);
}
-static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
-{
- struct inet_listen_hashbucket *ilb2;
-
- if (!h->lhash2)
- return;
-
- ilb2 = inet_lhash2_bucket_sk(h, sk);
-
- spin_lock(&ilb2->lock);
- if (sk->sk_reuseport && sk->sk_family == AF_INET6)
- hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
- &ilb2->head);
- else
- hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
- &ilb2->head);
- ilb2->count++;
- spin_unlock(&ilb2->lock);
-}
-
-static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
-{
- struct inet_listen_hashbucket *ilb2;
-
- if (!h->lhash2 ||
- WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
- return;
-
- ilb2 = inet_lhash2_bucket_sk(h, sk);
-
- spin_lock(&ilb2->lock);
- hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
- ilb2->count--;
- spin_unlock(&ilb2->lock);
-}
-
static inline int compute_score(struct sock *sk, struct net *net,
const unsigned short hnum, const __be32 daddr,
const int dif, const int sdif)
@@ -282,12 +246,11 @@ static struct sock *inet_lhash2_lookup(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
- struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
+ struct hlist_nulls_node *node;
int score, hiscore = 0;
- inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
- sk = (struct sock *)icsk;
+ sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
result = lookup_reuseport(net, sk, skb, doff,
@@ -307,7 +270,7 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
__be32 saddr, __be16 sport,
- __be32 daddr, u16 hnum)
+ __be32 daddr, u16 hnum, const int dif)
{
struct sock *sk, *reuse_sk;
bool no_reuseport;
@@ -315,8 +278,8 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
if (hashinfo != &tcp_hashinfo)
return NULL; /* only TCP is supported */
- no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP,
- saddr, sport, daddr, hnum, &sk);
+ no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
+ daddr, hnum, dif, &sk);
if (no_reuseport || IS_ERR_OR_NULL(sk))
return sk;
@@ -340,7 +303,7 @@ struct sock *__inet_lookup_listener(struct net *net,
/* Lookup redirect from BPF */
if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
- saddr, sport, daddr, hnum);
+ saddr, sport, daddr, hnum, dif);
if (result)
goto done;
}
@@ -410,13 +373,11 @@ begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
continue;
- if (likely(INET_MATCH(sk, net, acookie,
- saddr, daddr, ports, dif, sdif))) {
+ if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
goto out;
- if (unlikely(!INET_MATCH(sk, net, acookie,
- saddr, daddr, ports,
- dif, sdif))) {
+ if (unlikely(!inet_match(net, sk, acookie,
+ ports, dif, sdif))) {
sock_gen_put(sk);
goto begin;
}
@@ -465,8 +426,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
if (sk2->sk_hash != hash)
continue;
- if (likely(INET_MATCH(sk2, net, acookie,
- saddr, daddr, ports, dif, sdif))) {
+ if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
if (twsk_unique(sk, sk2, twp))
@@ -504,7 +464,7 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static u32 inet_sk_port_offset(const struct sock *sk)
+static u64 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -532,16 +492,14 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk,
if (esk->sk_hash != sk->sk_hash)
continue;
if (sk->sk_family == AF_INET) {
- if (unlikely(INET_MATCH(esk, net, acookie,
- sk->sk_daddr,
- sk->sk_rcv_saddr,
+ if (unlikely(inet_match(net, esk, acookie,
ports, dif, sdif))) {
return true;
}
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- if (unlikely(INET6_MATCH(esk, net,
+ if (unlikely(inet6_match(net, esk,
&sk->sk_v6_daddr,
&sk->sk_v6_rcv_saddr,
ports, dif, sdif))) {
@@ -633,33 +591,33 @@ static int inet_reuseport_add_sock(struct sock *sk,
int __inet_hash(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_listen_hashbucket *ilb;
+ struct inet_listen_hashbucket *ilb2;
int err = 0;
if (sk->sk_state != TCP_LISTEN) {
+ local_bh_disable();
inet_ehash_nolisten(sk, osk, NULL);
+ local_bh_enable();
return 0;
}
WARN_ON(!sk_unhashed(sk));
- ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
- spin_lock(&ilb->lock);
+ spin_lock(&ilb2->lock);
if (sk->sk_reuseport) {
- err = inet_reuseport_add_sock(sk, ilb);
+ err = inet_reuseport_add_sock(sk, ilb2);
if (err)
goto unlock;
}
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
sk->sk_family == AF_INET6)
- __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head);
+ __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
else
- __sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
- inet_hash2(hashinfo, sk);
- ilb->count++;
+ __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
- spin_unlock(&ilb->lock);
+ spin_unlock(&ilb2->lock);
return err;
}
@@ -669,11 +627,8 @@ int inet_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
@@ -682,32 +637,41 @@ EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_listen_hashbucket *ilb = NULL;
- spinlock_t *lock;
if (sk_unhashed(sk))
return;
if (sk->sk_state == TCP_LISTEN) {
- ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &ilb->lock;
+ struct inet_listen_hashbucket *ilb2;
+
+ ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
+ /* Don't disable bottom halves while acquiring the lock to
+ * avoid circular locking dependency on PREEMPT_RT.
+ */
+ spin_lock(&ilb2->lock);
+ if (sk_unhashed(sk)) {
+ spin_unlock(&ilb2->lock);
+ return;
+ }
+
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_stop_listen_sock(sk);
+
+ __sk_nulls_del_node_init_rcu(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock(&ilb2->lock);
} else {
- lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
- }
- spin_lock_bh(lock);
- if (sk_unhashed(sk))
- goto unlock;
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
- if (rcu_access_pointer(sk->sk_reuseport_cb))
- reuseport_stop_listen_sock(sk);
- if (ilb) {
- inet_unhash2(hashinfo, sk);
- ilb->count--;
+ spin_lock_bh(lock);
+ if (sk_unhashed(sk)) {
+ spin_unlock_bh(lock);
+ return;
+ }
+ __sk_nulls_del_node_init_rcu(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock_bh(lock);
}
- __sk_nulls_del_node_init_rcu(sk);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-unlock:
- spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
@@ -715,15 +679,17 @@ EXPORT_SYMBOL_GPL(inet_unhash);
* Note that we use 32bit integers (vs RFC 'short integers')
* because 2^16 is not a multiple of num_ephemeral and this
* property might be used by clever attacker.
- * RFC claims using TABLE_LENGTH=10 buckets gives an improvement,
- * we use 256 instead to really give more isolation and
- * privacy, this only consumes 1 KB of kernel memory.
+ * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
+ * attacks were since demonstrated, thus we use 65536 instead to really
+ * give more isolation and privacy, at the expense of 256kB of kernel
+ * memory.
*/
-#define INET_TABLE_PERTURB_SHIFT 8
-static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT];
+#define INET_TABLE_PERTURB_SHIFT 16
+#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT)
+static u32 *table_perturb;
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
- struct sock *sk, u32 port_offset,
+ struct sock *sk, u64 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16, struct inet_timewait_sock **))
{
@@ -763,10 +729,13 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
if (likely(remaining > 1))
remaining &= ~1U;
- net_get_random_once(table_perturb, sizeof(table_perturb));
- index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT);
+ net_get_random_once(table_perturb,
+ INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
+ index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
+
+ offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
+ offset %= remaining;
- offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining;
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/
@@ -820,11 +789,12 @@ next_port:
return -EADDRNOTAVAIL;
ok:
- /* If our first attempt found a candidate, skip next candidate
- * in 1/16 of cases to add some noise.
+ /* Here we want to add a little bit of randomness to the next source
+ * port that will be chosen. We use a max() with a random here so that
+ * on low contention the randomness is maximal and on high contention
+ * it may be inexistent.
*/
- if (!i && !(prandom_u32() % 16))
- i = 2;
+ i = max_t(int, i, (prandom_u32() & 7) * 2);
WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
/* Head lock still held and bh's disabled */
@@ -848,7 +818,7 @@ ok:
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- u32 port_offset = 0;
+ u64 port_offset = 0;
if (!inet_sk(sk)->inet_num)
port_offset = inet_sk_port_offset(sk);
@@ -857,29 +827,14 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
-void inet_hashinfo_init(struct inet_hashinfo *h)
-{
- int i;
-
- for (i = 0; i < INET_LHTABLE_SIZE; i++) {
- spin_lock_init(&h->listening_hash[i].lock);
- INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head,
- i + LISTENING_NULLS_BASE);
- h->listening_hash[i].count = 0;
- }
-
- h->lhash2 = NULL;
-}
-EXPORT_SYMBOL_GPL(inet_hashinfo_init);
-
static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
int i;
for (i = 0; i <= h->lhash2_mask; i++) {
spin_lock_init(&h->lhash2[i].lock);
- INIT_HLIST_HEAD(&h->lhash2[i].head);
- h->lhash2[i].count = 0;
+ INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
+ i + LISTENING_NULLS_BASE);
}
}
@@ -898,6 +853,14 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
low_limit,
high_limit);
init_hashinfo_lhash2(h);
+
+ /* this one is used for source ports of outgoing connections */
+ table_perturb = alloc_large_system_hash("Table-perturb",
+ sizeof(*table_perturb),
+ INET_TABLE_PERTURB_SIZE,
+ 0, 0, NULL, NULL,
+ INET_TABLE_PERTURB_SIZE,
+ INET_TABLE_PERTURB_SIZE);
}
int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 437afe392e66..47ccc343c9fb 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -59,7 +59,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
- atomic_dec(&tw->tw_dr->tw_count);
+ if (refcount_dec_and_test(&tw->tw_dr->tw_refcount))
+ kfree(tw->tw_dr);
+
inet_twsk_put(tw);
}
@@ -145,10 +147,6 @@ static void tw_timer_handler(struct timer_list *t)
{
struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
- if (tw->tw_kill)
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
- else
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
@@ -158,7 +156,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
{
struct inet_timewait_sock *tw;
- if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+ if (refcount_read(&dr->tw_refcount) - 1 >=
+ READ_ONCE(dr->sysctl_max_tw_buckets))
return NULL;
tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
@@ -244,10 +243,13 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
* of PAWS.
*/
- tw->tw_kill = timeo <= 4*HZ;
if (!rearm) {
+ bool kill = timeo <= 4*HZ;
+
+ __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
+ LINUX_MIB_TIMEWAITED);
BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
- atomic_inc(&tw->tw_dr->tw_count);
+ refcount_inc(&tw->tw_dr->tw_refcount);
} else {
mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index da21dfce24d7..e9fed83e9b3c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -141,16 +141,20 @@ static void inet_peer_gc(struct inet_peer_base *base,
struct inet_peer *gc_stack[],
unsigned int gc_cnt)
{
+ int peer_threshold, peer_maxttl, peer_minttl;
struct inet_peer *p;
__u32 delta, ttl;
int i;
- if (base->total >= inet_peer_threshold)
+ peer_threshold = READ_ONCE(inet_peer_threshold);
+ peer_maxttl = READ_ONCE(inet_peer_maxttl);
+ peer_minttl = READ_ONCE(inet_peer_minttl);
+
+ if (base->total >= peer_threshold)
ttl = 0; /* be aggressive */
else
- ttl = inet_peer_maxttl
- - (inet_peer_maxttl - inet_peer_minttl) / HZ *
- base->total / inet_peer_threshold * HZ;
+ ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ *
+ base->total / peer_threshold * HZ;
for (i = 0; i < gc_cnt; i++) {
p = gc_stack[i];
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 00ec819f949b..e3aa436a1bdf 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -79,7 +79,7 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(opt->optlen))
ip_forward_options(skb);
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
return dst_output(net, sk, skb);
}
@@ -90,6 +90,7 @@ int ip_forward(struct sk_buff *skb)
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
struct net *net;
+ SKB_DR(reason);
/* that should never happen */
if (skb->pkt_type != PACKET_HOST)
@@ -101,8 +102,10 @@ int ip_forward(struct sk_buff *skb)
if (skb_warn_if_lro(skb))
goto drop;
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
+ }
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
@@ -118,8 +121,10 @@ int ip_forward(struct sk_buff *skb)
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
- if (!xfrm4_route_forward(skb))
+ if (!xfrm4_route_forward(skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
+ }
rt = skb_rtable(skb);
@@ -132,6 +137,7 @@ int ip_forward(struct sk_buff *skb)
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
+ SKB_DR_SET(reason, PKT_TOO_BIG);
goto drop;
}
@@ -169,7 +175,8 @@ too_many_hops:
/* Tell the sender its packet died... */
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+ SKB_DR_SET(reason, IP_INHDR);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cfeb8890f94e..fb153569889e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -144,7 +144,8 @@ static void ip_expire(struct timer_list *t)
rcu_read_lock();
- if (qp->q.fqdir->dead)
+ /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
+ if (READ_ONCE(qp->q.fqdir->dead))
goto out_rcu_unlock;
spin_lock(&qp->q.lock);
@@ -348,6 +349,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->iif = dev->ifindex;
qp->q.stamp = skb->tstamp;
+ qp->q.mono_delivery_time = skb->mono_delivery_time;
qp->q.meat += skb->len;
qp->ecn |= ecn;
add_frag_mem_limit(qp->q.fqdir, skb->truesize);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2ac2b95c5694..5c58e21f724e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -459,14 +459,12 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
-
- if (tunnel->parms.o_flags & TUNNEL_SEQ)
- tunnel->o_seqno++;
+ __be16 flags = tunnel->parms.o_flags;
/* Push GRE header. */
gre_build_header(skb, tunnel->tun_hlen,
- tunnel->parms.o_flags, proto, tunnel->parms.o_key,
- htonl(tunnel->o_seqno));
+ flags, proto, tunnel->parms.o_key,
+ (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
@@ -504,7 +502,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
gre_build_header(skb, tunnel_hlen, flags, proto,
tunnel_id_to_key32(tun_info->key.tun_id),
- (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+ (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
@@ -526,7 +524,6 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
int tunnel_hlen;
int version;
int nhoff;
- int thoff;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -560,10 +557,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
(ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
truncate = true;
- thoff = skb_transport_header(skb) - skb_mac_header(skb);
- if (skb->protocol == htons(ETH_P_IPV6) &&
- (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
- truncate = true;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_header(skb) - skb_mac_header(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
+ }
if (version == 1) {
erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
@@ -581,7 +584,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
}
gre_build_header(skb, 8, TUNNEL_SEQ,
- proto, 0, htonl(tunnel->o_seqno++));
+ proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));
ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
@@ -604,7 +607,8 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
key = &info->key;
ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
- tunnel_id_to_key32(key->tun_id), key->tos, 0,
+ tunnel_id_to_key32(key->tun_id),
+ key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
skb->mark, skb_get_hash(skb));
rt = ip_route_output_key(dev_net(dev), &fl4);
if (IS_ERR(rt))
@@ -630,21 +634,20 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
}
if (dev->header_ops) {
- const int pull_len = tunnel->hlen + sizeof(struct iphdr);
-
if (skb_cow_head(skb, 0))
goto free_skb;
tnl_params = (const struct iphdr *)skb->data;
- if (pull_len > skb_transport_offset(skb))
- goto free_skb;
-
/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
* to gre header.
*/
- skb_pull(skb, pull_len);
+ skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
skb_reset_mac_header(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_start(skb) < skb->data)
+ goto free_skb;
} else {
if (skb_cow_head(skb, dev->needed_headroom))
goto free_skb;
@@ -749,6 +752,7 @@ free_skb:
static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ __be16 flags;
int len;
len = tunnel->tun_hlen;
@@ -764,19 +768,15 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
if (set_mtu)
dev->mtu = max_t(int, dev->mtu - len, 68);
- if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
- if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
- tunnel->encap.type == TUNNEL_ENCAP_NONE) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- } else {
- dev->features &= ~NETIF_F_GSO_SOFTWARE;
- dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
- }
- dev->features |= NETIF_F_LLTX;
- } else {
+ flags = tunnel->parms.o_flags;
+
+ if (flags & TUNNEL_SEQ ||
+ (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
+ dev->features &= ~NETIF_F_GSO_SOFTWARE;
dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
- dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
+ } else {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
}
}
@@ -950,6 +950,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
static void __gre_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel;
+ __be16 flags;
tunnel = netdev_priv(dev);
tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -958,25 +959,21 @@ static void __gre_tunnel_init(struct net_device *dev)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
- dev->features |= GRE_FEATURES;
+ dev->features |= GRE_FEATURES | NETIF_F_LLTX;
dev->hw_features |= GRE_FEATURES;
- if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP offload with GRE SEQ is not supported, nor
- * can we support 2 levels of outer headers requiring
- * an update.
- */
- if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
- (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- }
+ flags = tunnel->parms.o_flags;
- /* Can use a lockless transmit, unless we generate
- * output sequences
- */
- dev->features |= NETIF_F_LLTX;
- }
+ /* TCP offload with GRE SEQ is not supported, nor can we support 2
+ * levels of outer headers requiring an update.
+ */
+ if (flags & TUNNEL_SEQ)
+ return;
+ if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ return;
+
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
}
static int ipgre_tunnel_init(struct net_device *dev)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3a025c011971..b1165f717cd1 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -196,7 +196,8 @@ resubmit:
if (ipprot) {
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_XFRM_POLICY);
return;
}
nf_reset_ct(skb);
@@ -215,7 +216,7 @@ resubmit:
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
} else {
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
@@ -225,6 +226,7 @@ resubmit:
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ skb_clear_delivery_time(skb);
__skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
@@ -318,8 +320,10 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
+ int err, drop_reason;
struct rtable *rt;
- int err;
+
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (ip_can_use_hint(skb, iph, hint)) {
err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
@@ -396,19 +400,23 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
* so-called "hole-196" attack) so do it for both.
*/
if (in_dev &&
- IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+ IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
+ drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
goto drop;
+ }
}
return NET_RX_SUCCESS;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return NET_RX_DROP;
drop_error:
- if (err == -EXDEV)
+ if (err == -EXDEV) {
+ drop_reason = SKB_DROP_REASON_IP_RPFILTER;
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+ }
goto drop;
}
@@ -436,13 +444,17 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
+ int drop_reason;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
- if (skb->pkt_type == PACKET_OTHERHOST)
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
+ drop_reason = SKB_DROP_REASON_OTHERHOST;
goto drop;
+ }
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
@@ -452,6 +464,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
goto out;
}
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
@@ -488,6 +501,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
len = ntohs(iph->tot_len);
if (skb->len < len) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
@@ -516,11 +530,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
return skb;
csum_error:
+ drop_reason = SKB_DROP_REASON_IP_CSUM;
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
+ if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ drop_reason = SKB_DROP_REASON_IP_INHDR;
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out:
return NULL;
}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index da1b5038bdfd..a9e22a098872 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -42,7 +42,7 @@
*/
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
- __be32 daddr, struct rtable *rt, int is_frag)
+ __be32 daddr, struct rtable *rt)
{
unsigned char *iph = skb_network_header(skb);
@@ -53,28 +53,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
if (opt->srr)
memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
- if (!is_frag) {
- if (opt->rr_needaddr)
- ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
- if (opt->ts_needaddr)
- ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
- if (opt->ts_needtime) {
- __be32 midtime;
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
+ if (opt->ts_needtime) {
+ __be32 midtime;
- midtime = inet_current_timestamp();
- memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
- }
- return;
- }
- if (opt->rr) {
- memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]);
- opt->rr = 0;
- opt->rr_needaddr = 0;
- }
- if (opt->ts) {
- memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]);
- opt->ts = 0;
- opt->ts_needaddr = opt->ts_needtime = 0;
+ midtime = inet_current_timestamp();
+ memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9bca57ef8b83..00b4bf26fd93 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -162,17 +162,24 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- if (ip_dont_fragment(sk, &rt->dst)) {
+ /* Do not bother generating IPID for small packets (eg SYNACK) */
+ if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
iph->frag_off = htons(IP_DF);
iph->id = 0;
} else {
iph->frag_off = 0;
- __ip_select_ident(net, iph, 1);
+ /* TCP packets here are SYNACK with fat IPv4/TCP options.
+ * Avoid using the hashed IP ident generator.
+ */
+ if (sk->sk_protocol == IPPROTO_TCP)
+ iph->id = (__force __be16)prandom_u32();
+ else
+ __ip_select_ident(net, iph, 1);
}
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
- ip_options_build(skb, &opt->opt, daddr, rt, 0);
+ ip_options_build(skb, &opt->opt, daddr, rt);
}
skb->priority = sk->sk_priority;
@@ -226,7 +233,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
return -EINVAL;
}
@@ -310,7 +317,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
case NET_XMIT_CN:
return __ip_finish_output(net, sk, skb) ? : ret;
default:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
return ret;
}
}
@@ -330,7 +337,7 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk,
case NET_XMIT_SUCCESS:
break;
default:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
return ret;
}
@@ -512,7 +519,7 @@ packet_routed:
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
- ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
}
ip_select_ident_segs(net, skb, sk,
@@ -529,7 +536,7 @@ packet_routed:
no_route:
rcu_read_unlock();
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
return -EHOSTUNREACH;
}
EXPORT_SYMBOL(__ip_queue_xmit);
@@ -672,7 +679,6 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
struct sk_buff *skb2;
struct iphdr *iph;
- len = state->left;
/* IF: it doesn't fit, use 'mtu' - the data space left */
if (len > state->mtu)
len = state->mtu;
@@ -755,6 +761,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
{
struct iphdr *iph;
struct sk_buff *skb2;
+ bool mono_delivery_time = skb->mono_delivery_time;
struct rtable *rt = skb_rtable(skb);
unsigned int mtu, hlen, ll_rs;
struct ip_fraglist_iter iter;
@@ -826,18 +833,27 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
/* Everything is OK. Generate! */
ip_fraglist_init(skb, iph, hlen, &iter);
- if (iter.frag)
- ip_options_fragment(iter.frag);
-
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
if (iter.frag) {
+ bool first_frag = (iter.offset == 0);
+
IPCB(iter.frag)->flags = IPCB(skb)->flags;
ip_fraglist_prepare(skb, &iter);
+ if (first_frag && IPCB(skb)->opt.optlen) {
+ /* ipcb->opt is not populated for frags
+ * coming from __ip_make_skb(),
+ * ip_options_fragment() needs optlen
+ */
+ IPCB(iter.frag)->opt.optlen =
+ IPCB(skb)->opt.optlen;
+ ip_options_fragment(iter.frag);
+ ip_send_check(iter.iph);
+ }
}
- skb->tstamp = tstamp;
+ skb_set_delivery_time(skb, tstamp, mono_delivery_time);
err = output(net, sk, skb);
if (!err)
@@ -893,7 +909,7 @@ slow_path:
/*
* Put this fragment into the sending queue.
*/
- skb2->tstamp = tstamp;
+ skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
err = output(net, sk, skb2);
if (err)
goto fail;
@@ -976,7 +992,7 @@ static int __ip_append_data(struct sock *sk,
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
- tskey = sk->sk_tskey++;
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1526,7 +1542,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (opt) {
iph->ihl += opt->optlen >> 2;
- ip_options_build(skb, opt, cork->addr, rt, 0);
+ ip_options_build(skb, opt, cork->addr, rt);
}
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
@@ -1712,6 +1728,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
+ nskb->mono_delivery_time = !!transmit_time;
ip_push_pending_frames(sk, &fl4);
}
out:
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 38d29b175ca6..445a9ecaefa1 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -576,7 +576,7 @@ out:
return err;
}
-static void __ip_sock_set_tos(struct sock *sk, int val)
+void __ip_sock_set_tos(struct sock *sk, int val)
{
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 5a473319d3a5..94017a8c3994 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -294,8 +294,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
- RT_TOS(iph->tos), tunnel->parms.link,
- tunnel->fwmark, 0);
+ RT_TOS(iph->tos), dev_net(dev),
+ tunnel->parms.link, tunnel->fwmark, 0);
rt = ip_route_output_key(tunnel->net, &fl4);
if (!IS_ERR(rt)) {
@@ -570,7 +570,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
- 0, skb->mark, skb_get_hash(skb));
+ dev_net(dev), 0, skb->mark, skb_get_hash(skb));
if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
goto tx_error;
@@ -726,7 +726,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
- tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
+ tunnel->parms.o_key, RT_TOS(tos),
+ dev_net(dev), tunnel->parms.link,
tunnel->fwmark, skb_get_hash(skb));
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6b2dc7b2b612..cc1caab4a654 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -410,7 +410,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
u32 mtu = dst_mtu(encap_dst) - headroom;
if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
- (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu))
+ (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu))
return 0;
skb_dst_update_pmtu_no_confirm(skb, mtu);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2dda856ca260..13e6329784fb 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -195,10 +195,6 @@ static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
return 1;
}
-static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
- FRA_GENERIC_POLICY,
-};
-
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh, struct nlattr **tb,
struct netlink_ext_ack *extack)
@@ -231,7 +227,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
.compare = ipmr_rule_compare,
.fill = ipmr_rule_fill,
.nlgroup = RTNLGRP_IPV4_RULE,
- .policy = ipmr_rule_policy,
.owner = THIS_MODULE,
};
@@ -261,7 +256,9 @@ static int __net_init ipmr_rules_init(struct net *net)
return 0;
err2:
+ rtnl_lock();
ipmr_free_table(mrt);
+ rtnl_unlock();
err1:
fib_rules_unregister(ops);
return err;
@@ -271,13 +268,12 @@ static void __net_exit ipmr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
- rtnl_lock();
+ ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
list_del(&mrt->list);
ipmr_free_table(mrt);
}
fib_rules_unregister(net->ipv4.mr_rules_ops);
- rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -333,10 +329,9 @@ static int __net_init ipmr_rules_init(struct net *net)
static void __net_exit ipmr_rules_exit(struct net *net)
{
- rtnl_lock();
+ ASSERT_RTNL();
ipmr_free_table(net->ipv4.mrt);
net->ipv4.mrt = NULL;
- rtnl_unlock();
}
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
@@ -361,7 +356,7 @@ static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
const struct mfc_cache_cmp_arg *cmparg = arg->key;
- struct mfc_cache *c = (struct mfc_cache *)ptr;
+ const struct mfc_cache *c = ptr;
return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
cmparg->mfc_origin != c->mfc_origin;
@@ -696,7 +691,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
- dev_put(dev);
+ dev_put_track(dev, &v->dev_tracker);
return 0;
}
@@ -896,6 +891,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
/* And finish update writing critical data */
write_lock_bh(&mrt_lock);
v->dev = dev;
+ netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
if (v->flags & VIFF_REGISTER)
mrt->mroute_reg_vif_num = vifi;
if (vifi+1 > mrt->maxvif)
@@ -3079,7 +3075,9 @@ static int __net_init ipmr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
+ rtnl_lock();
ipmr_rules_exit(net);
+ rtnl_unlock();
#endif
ipmr_rules_fail:
ipmr_notifier_exit(net);
@@ -3094,12 +3092,22 @@ static void __net_exit ipmr_net_exit(struct net *net)
remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_notifier_exit(net);
- ipmr_rules_exit(net);
+}
+
+static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ipmr_rules_exit(net);
+ rtnl_unlock();
}
static struct pernet_operations ipmr_net_ops = {
.init = ipmr_net_init,
.exit = ipmr_net_exit,
+ .exit_batch = ipmr_net_exit_batch,
};
int __init ip_mr_init(void)
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index aff707988e23..bd135165482a 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -45,8 +45,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(iph->tos);
fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
- if (!fl4.flowi4_oif)
- fl4.flowi4_oif = l3mdev_master_ifindex(dev);
+ fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_flags = flags;
fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 63cb953bd019..aab384126f61 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -58,14 +58,6 @@ config NF_TABLES_ARP
endif # NF_TABLES
-config NF_FLOW_TABLE_IPV4
- tristate "Netfilter flow table IPv4 module"
- depends on NF_FLOW_TABLE
- help
- This option adds the flow table IPv4 support.
-
- To compile it as a module, choose M here.
-
config NF_DUP_IPV4
tristate "Netfilter IPv4 packet duplication to alternate destination"
depends on !NF_CONNTRACK || NF_CONNTRACK
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index f38fb1368ddb..93bad1184251 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -24,9 +24,6 @@ obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
-# flow table support
-obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
-
# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 8fd1aba8af31..f8e176c77d1c 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -520,8 +520,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
if (IS_ERR(config))
return PTR_ERR(config);
}
- } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN))
+ } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) {
+ clusterip_config_entry_put(config);
+ clusterip_config_put(config);
return -EINVAL;
+ }
ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
@@ -773,7 +776,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- struct clusterip_config *c = PDE_DATA(inode);
+ struct clusterip_config *c = pde_data(inode);
sf->private = c;
@@ -785,7 +788,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
static int clusterip_proc_release(struct inode *inode, struct file *file)
{
- struct clusterip_config *c = PDE_DATA(inode);
+ struct clusterip_config *c = pde_data(inode);
int ret;
ret = seq_release(inode, file);
@@ -799,7 +802,7 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *ofs)
{
- struct clusterip_config *c = PDE_DATA(file_inode(file));
+ struct clusterip_config *c = pde_data(file_inode(file));
#define PROC_WRITELEN 10
char buffer[PROC_WRITELEN+1];
unsigned long nodenum;
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
deleted file mode 100644
index aba65fe90345..000000000000
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_tables.h>
-
-static struct nf_flowtable_type flowtable_ipv4 = {
- .family = NFPROTO_IPV4,
- .init = nf_flow_table_init,
- .setup = nf_flow_table_offload_setup,
- .action = nf_flow_rule_route_ipv4,
- .free = nf_flow_table_free,
- .hook = nf_flow_offload_ip_hook,
- .owner = THIS_MODULE,
-};
-
-static int __init nf_flow_ipv4_module_init(void)
-{
- nft_register_flowtable_type(&flowtable_ipv4);
-
- return 0;
-}
-
-static void __exit nf_flow_ipv4_module_exit(void)
-{
- nft_unregister_flowtable_type(&flowtable_ipv4);
-}
-
-module_init(nf_flow_ipv4_module_init);
-module_exit(nf_flow_ipv4_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
-MODULE_DESCRIPTION("Netfilter flow table support");
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 3e2685c120c7..76a411ae9fe6 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -580,7 +580,7 @@ static struct nf_ct_helper_expectfn callforwarding_nat = {
};
/****************************************************************************/
-static int __init init(void)
+static int __init nf_nat_h323_init(void)
{
BUG_ON(set_h245_addr_hook != NULL);
BUG_ON(set_h225_addr_hook != NULL);
@@ -607,7 +607,7 @@ static int __init init(void)
}
/****************************************************************************/
-static void __exit fini(void)
+static void __exit nf_nat_h323_fini(void)
{
RCU_INIT_POINTER(set_h245_addr_hook, NULL);
RCU_INIT_POINTER(set_h225_addr_hook, NULL);
@@ -624,8 +624,8 @@ static void __exit fini(void)
}
/****************************************************************************/
-module_init(init);
-module_exit(fini);
+module_init(nf_nat_h323_init);
+module_exit(nf_nat_h323_fini);
MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
MODULE_DESCRIPTION("H.323 NAT helper");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 3f248a19faa3..fab357cc8559 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -295,28 +295,24 @@ pptp_inbound_pkt(struct sk_buff *skb,
return NF_ACCEPT;
}
+static const struct nf_nat_pptp_hook pptp_hooks = {
+ .outbound = pptp_outbound_pkt,
+ .inbound = pptp_inbound_pkt,
+ .exp_gre = pptp_exp_gre,
+ .expectfn = pptp_nat_expected,
+};
+
static int __init nf_nat_helper_pptp_init(void)
{
- BUG_ON(nf_nat_pptp_hook_outbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_inbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+ WARN_ON(nf_nat_pptp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks);
- BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
return 0;
}
static void __exit nf_nat_helper_pptp_fini(void)
{
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, NULL);
synchronize_rcu();
}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 4eed5afca392..918c61fda0f3 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -80,6 +80,7 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
struct iphdr *niph;
struct icmphdr *icmph;
unsigned int len;
+ int dataoff;
__wsum csum;
u8 proto;
@@ -99,10 +100,11 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
return NULL;
+ dataoff = ip_hdrlen(oldskb);
proto = ip_hdr(oldskb)->protocol;
if (!skb_csum_unnecessary(oldskb) &&
- nf_reject_verify_csum(proto) &&
+ nf_reject_verify_csum(oldskb, dataoff, proto) &&
nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
return NULL;
@@ -311,6 +313,7 @@ EXPORT_SYMBOL_GPL(nf_send_reset);
void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
{
struct iphdr *iph = ip_hdr(skb_in);
+ int dataoff = ip_hdrlen(skb_in);
u8 proto = iph->protocol;
if (iph->frag_off & htons(IP_OFFSET))
@@ -320,12 +323,13 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
nf_reject_fill_skb_dst(skb_in) < 0)
return;
- if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) {
+ if (skb_csum_unnecessary(skb_in) ||
+ !nf_reject_verify_csum(skb_in, dataoff, proto)) {
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
return;
}
- if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
+ if (nf_ip_checksum(skb_in, hook, dataoff, proto) == 0)
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach);
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index aeb631760eb9..0bcd6aee6000 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -75,6 +75,7 @@ static const struct nft_expr_ops nft_dup_ipv4_ops = {
.eval = nft_dup_ipv4_eval,
.init = nft_dup_ipv4_init,
.dump = nft_dup_ipv4_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = {
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 03df986217b7..b75cac69bd7e 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -112,6 +112,10 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
fl4.daddr = iph->daddr;
fl4.saddr = get_saddr(iph->saddr);
} else {
+ if (nft_hook(pkt) == NF_INET_FORWARD &&
+ priv->flags & NFTA_FIB_F_IIF)
+ fl4.flowi4_iif = nft_out(pkt)->ifindex;
+
fl4.daddr = iph->saddr;
fl4.saddr = get_saddr(iph->daddr);
}
@@ -152,6 +156,7 @@ static const struct nft_expr_ops nft_fib4_type_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops nft_fib4_ops = {
@@ -161,6 +166,7 @@ static const struct nft_expr_ops nft_fib4_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops *
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index 55fc23a8f7a7..6cb213bb7256 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -45,6 +45,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = {
.init = nft_reject_init,
.dump = nft_reject_dump,
.validate = nft_reject_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 5dbd4b5505eb..853a75a8fbaf 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -8,6 +8,7 @@
#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
@@ -1857,7 +1858,7 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
/* __ip6_del_rt does a release, so do a hold here */
fib6_info_hold(f6i);
ipv6_stub->ip6_del_rt(net, f6i,
- !net->ipv4.sysctl_nexthop_compat_mode);
+ !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
}
}
@@ -1918,9 +1919,6 @@ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
if (!replaced_nh->is_group)
return;
- /* new dsts must use only the new nexthop group */
- synchronize_net();
-
nhg = rtnl_dereference(replaced_nh->nh_grp);
for (i = 0; i < nhg->num_nh; i++) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
@@ -2002,9 +2000,10 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old,
rcu_assign_pointer(old->nh_grp, newg);
+ /* Make sure concurrent readers are not using 'oldg' anymore. */
+ synchronize_net();
+
if (newg->resilient) {
- /* Make sure concurrent readers are not using 'oldg' anymore. */
- synchronize_net();
rcu_assign_pointer(oldg->res_table, tmp_table);
rcu_assign_pointer(oldg->spare->res_table, tmp_table);
}
@@ -2362,7 +2361,8 @@ out:
if (!rc) {
nh_base_seq_inc(net);
nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
- if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode)
+ if (replace_notify &&
+ READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
}
@@ -3734,12 +3734,16 @@ out:
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);
-static void __net_exit nexthop_net_exit(struct net *net)
+static void __net_exit nexthop_net_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- flush_all_nexthops(net);
+ list_for_each_entry(net, net_list, exit_list) {
+ flush_all_nexthops(net);
+ kfree(net->nexthop.devhash);
+ }
rtnl_unlock();
- kfree(net->nexthop.devhash);
}
static int __net_init nexthop_net_init(struct net *net)
@@ -3757,7 +3761,7 @@ static int __net_init nexthop_net_init(struct net *net)
static struct pernet_operations nexthop_net_ops = {
.init = nexthop_net_init,
- .exit = nexthop_net_exit,
+ .exit_batch = nexthop_net_exit_batch,
};
static int __init nexthop_init(void)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 1e44a43acfe2..3c6101def7d6 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -172,16 +172,22 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
struct sock *sk = NULL;
struct inet_sock *isk;
struct hlist_nulls_node *hnode;
- int dif = skb->dev->ifindex;
+ int dif, sdif;
if (skb->protocol == htons(ETH_P_IP)) {
+ dif = inet_iif(skb);
+ sdif = inet_sdif(skb);
pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
(int)ident, &ip_hdr(skb)->daddr, dif);
#if IS_ENABLED(CONFIG_IPV6)
} else if (skb->protocol == htons(ETH_P_IPV6)) {
+ dif = inet6_iif(skb);
+ sdif = inet6_sdif(skb);
pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
(int)ident, &ipv6_hdr(skb)->daddr, dif);
#endif
+ } else {
+ return NULL;
}
read_lock_bh(&ping_table.lock);
@@ -220,7 +226,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
continue;
}
- if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif)
continue;
sock_hold(sk);
@@ -298,6 +305,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
struct net *net = sock_net(sk);
if (sk->sk_family == AF_INET) {
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ u32 tb_id = RT_TABLE_LOCAL;
int chk_addr_ret;
if (addr_len < sizeof(*addr))
@@ -312,14 +320,15 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
- chk_addr_ret = RTN_LOCAL;
- else
- chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+ return 0;
+
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
- if ((!inet_can_nonlocal_bind(net, isk) &&
- chk_addr_ret != RTN_LOCAL) ||
- chk_addr_ret == RTN_MULTICAST ||
- chk_addr_ret == RTN_BROADCAST)
+ if (chk_addr_ret == RTN_MULTICAST ||
+ chk_addr_ret == RTN_BROADCAST ||
+ (chk_addr_ret != RTN_LOCAL &&
+ !inet_can_nonlocal_bind(net, isk)))
return -EADDRNOTAVAIL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -352,6 +361,14 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
return -ENODEV;
}
}
+
+ if (!dev && sk->sk_bound_dev_if) {
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ }
has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
scoped);
rcu_read_unlock();
@@ -587,7 +604,7 @@ EXPORT_SYMBOL_GPL(ping_err);
int ping_getfrag(void *from, char *to,
int offset, int fraglen, int odd, struct sk_buff *skb)
{
- struct pingfakehdr *pfh = (struct pingfakehdr *)from;
+ struct pingfakehdr *pfh = from;
if (offset == 0) {
fraglen -= sizeof(struct icmphdr);
@@ -841,8 +858,8 @@ do_confirm:
goto out;
}
-int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len)
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
{
struct inet_sock *isk = inet_sk(sk);
int family = sk->sk_family;
@@ -858,7 +875,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
if (flags & MSG_ERRQUEUE)
return inet_recv_error(sk, msg, len, addr_len);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -931,16 +948,24 @@ out:
}
EXPORT_SYMBOL_GPL(ping_recvmsg);
-int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
+ struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
+
pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
inet_sk(sk), inet_sk(sk)->inet_num, skb);
- if (sock_queue_rcv_skb(sk, skb) < 0) {
- kfree_skb(skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ kfree_skb_reason(skb, reason);
pr_debug("ping_queue_rcv_skb -> failed\n");
- return -1;
+ return reason;
}
- return 0;
+ return SKB_NOT_DROPPED_YET;
+}
+
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ return __ping_queue_rcv_skb(sk, skb) ? -1 : 0;
}
EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
@@ -949,12 +974,12 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
* All we need to do is get the socket.
*/
-bool ping_rcv(struct sk_buff *skb)
+enum skb_drop_reason ping_rcv(struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NO_SOCKET;
struct sock *sk;
struct net *net = dev_net(skb->dev);
struct icmphdr *icmph = icmp_hdr(skb);
- bool rc = false;
/* We assume the packet has already been checked by icmp_rcv */
@@ -969,15 +994,17 @@ bool ping_rcv(struct sk_buff *skb)
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
pr_debug("rcv on socket %p\n", sk);
- if (skb2 && !ping_queue_rcv_skb(sk, skb2))
- rc = true;
+ if (skb2)
+ reason = __ping_queue_rcv_skb(sk, skb2);
+ else
+ reason = SKB_DROP_REASON_NOMEM;
sock_put(sk);
}
- if (!rc)
+ if (reason)
pr_debug("no socket, dropping\n");
- return rc;
+ return reason;
}
EXPORT_SYMBOL_GPL(ping_rcv);
@@ -998,6 +1025,7 @@ struct proto ping_prot = {
.hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
+ .put_port = ping_unhash,
.obj_size = sizeof(struct inet_sock),
};
EXPORT_SYMBOL(ping_prot);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f30273afb539..28836071f0a6 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,8 +59,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
- proto_memory_allocated(&tcp_prot));
+ refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1,
+ sockets, proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
proto_memory_allocated(&udp_prot));
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bb446e60cf58..bbd717805b10 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -99,8 +99,8 @@ int raw_hash_sk(struct sock *sk)
write_lock_bh(&h->lock);
sk_add_node(sk, head);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&h->lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
@@ -717,30 +717,34 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ struct net *net = sock_net(sk);
u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
+ lock_sock(sk);
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
if (sk->sk_bound_dev_if)
- tb_id = l3mdev_fib_table_by_index(sock_net(sk),
- sk->sk_bound_dev_if) ? : tb_id;
+ tb_id = l3mdev_fib_table_by_index(net,
+ sk->sk_bound_dev_if) ? : tb_id;
- chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
- tb_id);
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
ret = -EADDRNOTAVAIL;
- if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
+
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
sk_dst_reset(sk);
ret = 0;
-out: return ret;
+out:
+ release_sock(sk);
+ return ret;
}
/*
@@ -749,7 +753,7 @@ out: return ret;
*/
static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
@@ -765,7 +769,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out;
}
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -779,7 +783,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (err)
goto done;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (sin) {
@@ -970,7 +974,7 @@ struct proto raw_prot = {
static struct sock *raw_get_first(struct seq_file *seq)
{
struct sock *sk;
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
struct raw_iter_state *state = raw_seq_private(seq);
for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
@@ -986,7 +990,7 @@ found:
static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
{
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
struct raw_iter_state *state = raw_seq_private(seq);
do {
@@ -1015,7 +1019,7 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(&h->lock)
{
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
read_lock(&h->lock);
return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -1038,7 +1042,7 @@ EXPORT_SYMBOL_GPL(raw_seq_next);
void raw_seq_stop(struct seq_file *seq, void *v)
__releases(&h->lock)
{
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
read_unlock(&h->lock);
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0b4103b1e622..356f535f3443 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -84,6 +84,7 @@
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
@@ -110,15 +111,15 @@
#define RT_GC_TIMEOUT (300*HZ)
+#define DEFAULT_MIN_PMTU (512 + 20 + 20)
+#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
+#define DEFAULT_MIN_ADVMSS 256
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
-static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
-static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
-static int ip_rt_min_advmss __read_mostly = 256;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
@@ -457,7 +458,7 @@ static u32 *ip_tstamps __read_mostly;
* if one generator is seldom used. This makes hard for an attacker
* to infer how many packets were sent between two points in time.
*/
-u32 ip_idents_reserve(u32 hash, int segs)
+static u32 ip_idents_reserve(u32 hash, int segs)
{
u32 bucket, old, now = (u32)jiffies;
atomic_t *p_id;
@@ -478,7 +479,6 @@ u32 ip_idents_reserve(u32 hash, int segs)
*/
return atomic_add_return(segs + delta, p_id) - segs;
}
-EXPORT_SYMBOL(ip_idents_reserve);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
@@ -498,24 +498,34 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
+static void ip_rt_fix_tos(struct flowi4 *fl4)
+{
+ __u8 tos = RT_FL_TOS(fl4);
+
+ fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+ if (tos & RTO_ONLINK)
+ fl4->flowi4_scope = RT_SCOPE_LINK;
+}
+
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
- const struct sock *sk,
- const struct iphdr *iph,
- int oif, u8 tos,
- u8 prot, u32 mark, int flow_flags)
+ const struct sock *sk, const struct iphdr *iph,
+ int oif, __u8 tos, u8 prot, u32 mark,
+ int flow_flags)
{
+ __u8 scope = RT_SCOPE_UNIVERSE;
+
if (sk) {
const struct inet_sock *inet = inet_sk(sk);
oif = sk->sk_bound_dev_if;
mark = sk->sk_mark;
- tos = RT_CONN_FLAGS(sk);
+ tos = ip_sock_rt_tos(sk);
+ scope = ip_sock_rt_scope(sk);
prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
}
- flowi4_init_output(fl4, oif, mark, tos,
- RT_SCOPE_UNIVERSE, prot,
- flow_flags,
- iph->daddr, iph->saddr, 0, 0,
+
+ flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
+ prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
sock_net_uid(net, sk));
}
@@ -525,9 +535,9 @@ static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
- u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
+ __u8 tos = iph->tos;
__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
@@ -543,7 +553,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
- RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
+ ip_sock_rt_scope(sk),
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
@@ -602,7 +613,7 @@ static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
static u32 fnhe_hashfun(__be32 daddr)
{
- static siphash_key_t fnhe_hash_key __read_mostly;
+ static siphash_aligned_key_t fnhe_hash_key;
u64 hval;
net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
@@ -816,9 +827,9 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
- u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
+ __u8 tos = iph->tos;
rt = (struct rtable *) dst;
@@ -935,6 +946,7 @@ static int ip_error(struct sk_buff *skb)
struct inet_peer *peer;
unsigned long now;
struct net *net;
+ SKB_DR(reason);
bool send;
int code;
@@ -954,10 +966,12 @@ static int ip_error(struct sk_buff *skb)
if (!IN_DEV_FORWARD(in_dev)) {
switch (rt->dst.error) {
case EHOSTUNREACH:
+ SKB_DR_SET(reason, IP_INADDRERRORS);
__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
break;
case ENETUNREACH:
+ SKB_DR_SET(reason, IP_INNOROUTES);
__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
}
@@ -973,6 +987,7 @@ static int ip_error(struct sk_buff *skb)
break;
case ENETUNREACH:
code = ICMP_NET_UNREACH;
+ SKB_DR_SET(reason, IP_INNOROUTES);
__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
case EACCES:
@@ -999,7 +1014,7 @@ static int ip_error(struct sk_buff *skb)
if (send)
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
-out: kfree_skb(skb);
+out: kfree_skb_reason(skb, reason);
return 0;
}
@@ -1018,13 +1033,13 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
if (old_mtu < mtu)
return;
- if (mtu < ip_rt_min_pmtu) {
+ if (mtu < net->ipv4.ip_rt_min_pmtu) {
lock = true;
- mtu = min(old_mtu, ip_rt_min_pmtu);
+ mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
}
if (rt->rt_pmtu == mtu && !lock &&
- time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
+ time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
return;
rcu_read_lock();
@@ -1034,7 +1049,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
fib_select_path(net, &res, fl4, NULL);
nhc = FIB_RES_NHC(res);
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
- jiffies + ip_rt_mtu_expires);
+ jiffies + net->ipv4.ip_rt_mtu_expires);
}
rcu_read_unlock();
}
@@ -1063,8 +1078,8 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
struct rtable *rt;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
- __build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, 0);
+ __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
+ 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1152,8 +1167,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, 0, 0);
+ __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
@@ -1297,9 +1311,10 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
+ struct net *net = dev_net(dst->dev);
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
- ip_rt_min_advmss);
+ net->ipv4.ip_rt_min_advmss);
return min(advmss, IPV4_MAX_PMTU - header_size);
}
@@ -1484,6 +1499,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
struct uncached_list {
spinlock_t lock;
struct list_head head;
+ struct list_head quarantine;
};
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
@@ -1505,7 +1521,7 @@ void rt_del_uncached_list(struct rtable *rt)
struct uncached_list *ul = rt->rt_uncached_list;
spin_lock_bh(&ul->lock);
- list_del(&rt->rt_uncached);
+ list_del_init(&rt->rt_uncached);
spin_unlock_bh(&ul->lock);
}
}
@@ -1520,19 +1536,24 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
void rt_flush_dev(struct net_device *dev)
{
- struct rtable *rt;
+ struct rtable *rt, *safe;
int cpu;
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+ if (list_empty(&ul->head))
+ continue;
+
spin_lock_bh(&ul->lock);
- list_for_each_entry(rt, &ul->head, rt_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
rt->dst.dev = blackhole_netdev;
- dev_hold(rt->dst.dev);
- dev_put(dev);
+ dev_replace_track(dev, blackhole_netdev,
+ &rt->dst.dev_tracker,
+ GFP_ATOMIC);
+ list_move(&rt->rt_uncached, &ul->quarantine);
}
spin_unlock_bh(&ul->lock);
}
@@ -1706,6 +1727,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct in_device *in_dev = __in_dev_get_rcu(dev);
unsigned int flags = RTCF_MULTICAST;
struct rtable *rth;
+ bool no_policy;
u32 itag = 0;
int err;
@@ -1716,8 +1738,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (our)
flags |= RTCF_LOCAL;
+ no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
+ if (no_policy)
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
- IN_DEV_ORCONF(in_dev, NOPOLICY), false);
+ no_policy, false);
if (!rth)
return -ENOBUFS;
@@ -1733,6 +1759,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#endif
RT_CACHE_STAT_INC(in_slow_mc);
+ skb_dst_drop(skb);
skb_dst_set(skb, &rth->dst);
return 0;
}
@@ -1775,7 +1802,7 @@ static int __mkroute_input(struct sk_buff *skb,
struct rtable *rth;
int err;
struct in_device *out_dev;
- bool do_cache;
+ bool do_cache, no_policy;
u32 itag = 0;
/* get a working reference to the output device */
@@ -1820,6 +1847,10 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
+ no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
+ if (no_policy)
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
fnhe = find_exception(nhc, daddr);
if (do_cache) {
if (fnhe)
@@ -1832,8 +1863,7 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
- rth = rt_dst_alloc(out_dev->dev, 0, res->type,
- IN_DEV_ORCONF(in_dev, NOPOLICY),
+ rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
IN_DEV_ORCONF(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
@@ -2208,6 +2238,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct rtable *rth;
struct flowi4 fl4;
bool do_cache = true;
+ bool no_policy;
/* IP on this device is disabled. */
@@ -2256,6 +2287,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
/*
* Now we are ready to route packet.
*/
+ fl4.flowi4_l3mdev = 0;
fl4.flowi4_oif = 0;
fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
@@ -2325,6 +2357,10 @@ brd_input:
RT_CACHE_STAT_INC(in_brd);
local_input:
+ no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
+ if (no_policy)
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
do_cache &= res->fi && !itag;
if (do_cache) {
struct fib_nh_common *nhc = FIB_RES_NHC(*res);
@@ -2339,7 +2375,7 @@ local_input:
rth = rt_dst_alloc(ip_rt_get_dev(net, res),
flags | RTCF_LOCAL, res->type,
- IN_DEV_ORCONF(in_dev, NOPOLICY), false);
+ no_policy, false);
if (!rth)
goto e_nobufs;
@@ -2601,7 +2637,6 @@ add:
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
const struct sk_buff *skb)
{
- __u8 tos = RT_FL_TOS(fl4);
struct fib_result res = {
.type = RTN_UNSPEC,
.fi = NULL,
@@ -2611,9 +2646,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
struct rtable *rth;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
- fl4->flowi4_tos = tos & IPTOS_RT_MASK;
- fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
- RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+ ip_rt_fix_tos(fl4);
rcu_read_lock();
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
@@ -2731,8 +2764,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
res->fi = NULL;
res->table = NULL;
if (fl4->flowi4_oif &&
- (ipv4_is_multicast(fl4->daddr) ||
- !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
+ (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
/* Apparently, routing tables are wrong. Assume,
* that the destination is on link.
*
@@ -2819,7 +2851,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->output = dst_discard_out;
new->dev = net->loopback_dev;
- dev_hold(new->dev);
+ dev_hold_track(new->dev, &new->dev_tracker, GFP_ATOMIC);
rt->rt_is_input = ort->rt_is_input;
rt->rt_iif = ort->rt_iif;
@@ -3377,7 +3409,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fri.tb_id = table_id;
fri.dst = res.prefix;
fri.dst_len = res.prefixlen;
- fri.tos = fl4.flowi4_tos;
+ fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
fri.type = rt->rt_type;
fri.offload = 0;
fri.trap = 0;
@@ -3390,11 +3422,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (fa->fa_slen == slen &&
fa->tb_id == fri.tb_id &&
- fa->fa_tos == fri.tos &&
+ fa->fa_dscp == fri.dscp &&
fa->fa_info == res.fi &&
fa->fa_type == fri.type) {
- fri.offload = fa->offload;
- fri.trap = fa->trap;
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
break;
}
}
@@ -3533,40 +3565,40 @@ static struct ctl_table ipv4_route_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "mtu_expires",
- .data = &ip_rt_mtu_expires,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "min_pmtu",
- .data = &ip_rt_min_pmtu,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &ip_min_valid_pmtu,
- },
- {
- .procname = "min_adv_mss",
- .data = &ip_rt_min_advmss,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
{ }
};
static const char ipv4_route_flush_procname[] = "flush";
-static struct ctl_table ipv4_route_flush_table[] = {
+static struct ctl_table ipv4_route_netns_table[] = {
{
.procname = ipv4_route_flush_procname,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = ipv4_sysctl_rtcache_flush,
},
+ {
+ .procname = "min_pmtu",
+ .data = &init_net.ipv4.ip_rt_min_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_min_valid_pmtu,
+ },
+ {
+ .procname = "mtu_expires",
+ .data = &init_net.ipv4.ip_rt_mtu_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "min_adv_mss",
+ .data = &init_net.ipv4.ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ },
};
@@ -3574,9 +3606,11 @@ static __net_init int sysctl_route_net_init(struct net *net)
{
struct ctl_table *tbl;
- tbl = ipv4_route_flush_table;
+ tbl = ipv4_route_netns_table;
if (!net_eq(net, &init_net)) {
- tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
+ int i;
+
+ tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
if (!tbl)
goto err_dup;
@@ -3585,6 +3619,12 @@ static __net_init int sysctl_route_net_init(struct net *net)
if (tbl[0].procname != ipv4_route_flush_procname)
tbl[0].procname = NULL;
}
+
+ /* Update the variables to point into the current struct net
+ * except for the first element flush
+ */
+ for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
+ tbl[i].data += (void *)net - (void *)&init_net;
}
tbl[0].extra1 = net;
@@ -3594,7 +3634,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
return 0;
err_reg:
- if (tbl != ipv4_route_flush_table)
+ if (tbl != ipv4_route_netns_table)
kfree(tbl);
err_dup:
return -ENOMEM;
@@ -3606,7 +3646,7 @@ static __net_exit void sysctl_route_net_exit(struct net *net)
tbl = net->ipv4.route_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.route_hdr);
- BUG_ON(tbl == ipv4_route_flush_table);
+ BUG_ON(tbl == ipv4_route_netns_table);
kfree(tbl);
}
@@ -3616,6 +3656,19 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
};
#endif
+static __net_init int netns_ip_rt_init(struct net *net)
+{
+ /* Set default value for namespaceified sysctls */
+ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
+ net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+ net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
+ return 0;
+}
+
+static struct pernet_operations __net_initdata ip_rt_ops = {
+ .init = netns_ip_rt_init,
+};
+
static __net_init int rt_genid_init(struct net *net)
{
atomic_set(&net->ipv4.rt_genid, 0);
@@ -3683,6 +3736,7 @@ int __init ip_rt_init(void)
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
INIT_LIST_HEAD(&ul->head);
+ INIT_LIST_HEAD(&ul->quarantine);
spin_lock_init(&ul->lock);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -3721,6 +3775,7 @@ int __init ip_rt_init(void)
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
#endif
+ register_pernet_subsys(&ip_rt_ops);
register_pernet_subsys(&rt_genid_ops);
register_pernet_subsys(&ipv4_inetpeer_ops);
return 0;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 8696dc343ad2..b387c4835155 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -14,7 +14,7 @@
#include <net/tcp.h>
#include <net/route.h>
-static siphash_key_t syncookie_secret[2] __read_mostly;
+static siphash_aligned_key_t syncookie_secret[2];
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -273,7 +273,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
if (!ecn_ok)
return false;
- if (net->ipv4.sysctl_tcp_ecn)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_ecn))
return true;
return dst_feature(dst, RTAX_FEATURE_ECN);
@@ -281,6 +281,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
EXPORT_SYMBOL(cookie_ecn_ok);
struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+ const struct tcp_request_sock_ops *af_ops,
struct sock *sk,
struct sk_buff *skb)
{
@@ -297,6 +298,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
return NULL;
treq = tcp_rsk(req);
+
+ /* treq->af_specific might be used to perform TCP_MD5 lookup */
+ treq->af_specific = af_ops;
+
treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
#if IS_ENABLED(CONFIG_MPTCP)
treq->is_mptcp = sk_is_mptcp(sk);
@@ -364,7 +369,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb);
+ req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, skb);
if (!req)
goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 97eb54774924..108fd86f2718 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -20,10 +20,6 @@
#include <net/protocol.h>
#include <net/netevent.h>
-static int two = 2;
-static int three __maybe_unused = 3;
-static int four = 4;
-static int thousand = 1000;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -589,12 +585,22 @@ static struct ctl_table ipv4_table[] = {
};
static struct ctl_table ipv4_net_table[] = {
+ /* tcp_max_tw_buckets must be first in this table. */
+ {
+ .procname = "tcp_max_tw_buckets",
+/* .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{
.procname = "icmp_echo_ignore_all",
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_echo_enable_probe",
@@ -611,6 +617,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_ignore_bogus_error_responses",
@@ -618,6 +626,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_errors_use_inbound_ifaddr",
@@ -625,6 +635,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_ratelimit",
@@ -664,6 +676,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "tcp_ecn_fallback",
@@ -671,6 +685,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "ip_dynaddr",
@@ -998,14 +1014,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
- },
- {
- .procname = "tcp_max_tw_buckets",
- .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "tcp_max_syn_backlog",
@@ -1058,7 +1067,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_fib_multipath_hash_policy,
.extra1 = SYSCTL_ZERO,
- .extra2 = &three,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "fib_multipath_hash_fields",
@@ -1116,7 +1125,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &four,
+ .extra2 = SYSCTL_FOUR,
},
{
.procname = "tcp_recovery",
@@ -1271,6 +1280,13 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ONE,
},
{
+ .procname = "tcp_tso_rtt_log",
+ .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
.procname = "tcp_min_rtt_wlen",
.data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
@@ -1302,7 +1318,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &thousand,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
{
.procname = "tcp_pacing_ca_ratio",
@@ -1311,7 +1327,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &thousand,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
{
.procname = "tcp_wmem",
@@ -1383,7 +1399,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = SYSCTL_TWO,
},
{ }
};
@@ -1400,7 +1416,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!table)
goto err_alloc;
- for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+ /* skip first entry (sysctl_max_tw_buckets) */
+ for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
if (table[i].data) {
/* Update the variables to point into
* the current struct net
@@ -1415,6 +1432,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
}
}
+ table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets;
+
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
if (!net->ipv4.ipv4_hdr)
goto err_reg;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bbb3d39c69af..2222dfdde316 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,7 +292,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
+atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
#if IS_ENABLED(CONFIG_SMC)
@@ -303,7 +303,7 @@ EXPORT_SYMBOL(tcp_have_smc);
/*
* Current number of TCP sockets.
*/
-struct percpu_counter tcp_sockets_allocated;
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);
/*
@@ -429,7 +429,7 @@ void tcp_init_sock(struct sock *sk)
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
- tp->snd_cwnd = TCP_INIT_CWND;
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
@@ -456,7 +456,6 @@ void tcp_init_sock(struct sock *sk)
WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
sk_sockets_allocated_inc(sk);
- sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);
@@ -546,10 +545,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (state != TCP_SYN_SENT &&
(state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
+ u16 urg_data = READ_ONCE(tp->urg_data);
- if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
- !sock_flag(sk, SOCK_URGINLINE) &&
- tp->urg_data)
+ if (unlikely(urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+ !sock_flag(sk, SOCK_URGINLINE))
target++;
if (tcp_stream_is_readable(sk, target))
@@ -574,7 +574,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
} else
mask |= EPOLLOUT | EPOLLWRNORM;
- if (tp->urg_data & TCP_URG_VALID)
+ if (urg_data & TCP_URG_VALID)
mask |= EPOLLPRI;
} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
/* Active TCP fastopen socket with defer_connect
@@ -608,7 +608,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data &&
+ answ = READ_ONCE(tp->urg_data) &&
READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
@@ -688,7 +688,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
return skb->len < size_goal &&
sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
!tcp_rtx_queue_empty(sk) &&
- refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
+ refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
+ tcp_skb_can_collapse_to(skb);
}
void tcp_push(struct sock *sk, int flags, int mss_now,
@@ -893,8 +894,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return mss_now;
/* Note : tcp_tso_autosize() will eventually split this later */
- new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
- new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
/* We try hard to avoid divides here */
size_goal = tp->gso_segs * mss_now;
@@ -936,6 +936,22 @@ void tcp_remove_empty_skb(struct sock *sk)
}
}
+/* skb changing from pure zc to mixed, must charge zc */
+static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
+{
+ if (unlikely(skb_zcopy_pure(skb))) {
+ u32 extra = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
+
+ if (!sk_wmem_schedule(sk, extra))
+ return -ENOMEM;
+
+ sk_mem_charge(sk, extra);
+ skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+ }
+ return 0;
+}
+
static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
struct page *page, int offset, size_t *size)
{
@@ -971,7 +987,7 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (!sk_wmem_schedule(sk, copy))
+ if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
return NULL;
if (can_coalesce) {
@@ -1319,16 +1335,8 @@ new_segment:
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- /* skb changing from pure zc to mixed, must charge zc */
- if (unlikely(skb_zcopy_pure(skb))) {
- if (!sk_wmem_schedule(sk, skb->data_len))
- goto wait_for_space;
-
- sk_mem_charge(sk, skb->data_len);
- skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
- }
-
- if (!sk_wmem_schedule(sk, copy))
+ if (tcp_downgrade_zcopy_pure(sk, skb) ||
+ !sk_wmem_schedule(sk, copy))
goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
@@ -1466,7 +1474,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- tp->urg_data = TCP_URG_READ;
+ WRITE_ONCE(tp->urg_data, TCP_URG_READ);
/* Read urgent data. */
msg->msg_flags |= MSG_OOB;
@@ -1580,6 +1588,18 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ if (likely(skb->destructor == sock_rfree)) {
+ sock_rfree(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ return skb_attempt_defer_free(skb);
+ }
+ __kfree_skb(skb);
+}
+
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1599,7 +1619,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
}
return NULL;
}
@@ -1633,7 +1653,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
len = skb->len - offset;
/* Stop reading if we hit a patch of urgent data */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - seq;
if (urg_offset < len)
len = urg_offset;
@@ -1645,11 +1665,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
if (!copied)
copied = used;
break;
- } else if (used <= len) {
- seq += used;
- copied += used;
- offset += used;
}
+ if (WARN_ON_ONCE(used > len))
+ used = len;
+ seq += used;
+ copied += used;
+ offset += used;
+
/* If recv_actor drops the lock (e.g. TCP splice
* receive) the skb pointer might be invalid when
* getting here: tcp_collapse might have deleted it
@@ -1665,11 +1687,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
continue;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
WRITE_ONCE(tp->copied_seq, seq);
@@ -1836,8 +1858,7 @@ static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
}
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags,
- struct scm_timestamping_internal *tss,
+ int flags, struct scm_timestamping_internal *tss,
int *cmsg_flags);
static int receive_fallback_to_copy(struct sock *sk,
struct tcp_zerocopy_receive *zc, int inq,
@@ -1859,7 +1880,7 @@ static int receive_fallback_to_copy(struct sock *sk,
if (err)
return err;
- err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
+ err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
tss, &zc->msg_flags);
if (err < 0)
return err;
@@ -2275,8 +2296,7 @@ static int tcp_inq_hint(struct sock *sk)
*/
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags,
- struct scm_timestamping_internal *tss,
+ int flags, struct scm_timestamping_internal *tss,
int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2294,9 +2314,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
if (sk->sk_state == TCP_LISTEN)
goto out;
- if (tp->recvmsg_inq)
+ if (tp->recvmsg_inq) {
*cmsg_flags = TCP_CMSG_INQ;
- timeo = sock_rcvtimeo(sk, nonblock);
+ msg->msg_get_inq = 1;
+ }
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
@@ -2329,7 +2351,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
- if (tp->urg_data && tp->urg_seq == *seq) {
+ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
@@ -2372,10 +2394,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
break;
if (copied) {
- if (sk->sk_err ||
+ if (!timeo ||
+ sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
- !timeo ||
signal_pending(current))
break;
} else {
@@ -2409,13 +2431,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
}
}
- tcp_cleanup_rbuf(sk, copied);
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
- release_sock(sk);
- lock_sock(sk);
+ __sk_flush_backlog(sk);
} else {
+ tcp_cleanup_rbuf(sk, copied);
sk_wait_data(sk, &timeo, last);
}
@@ -2435,7 +2455,7 @@ found_ok_skb:
used = len;
/* Do we have urgent data here? */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
@@ -2469,8 +2489,8 @@ found_ok_skb:
tcp_rcv_space_adjust(sk);
skip_copy:
- if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
- tp->urg_data = 0;
+ if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+ WRITE_ONCE(tp->urg_data, 0);
tcp_fast_path_check(sk);
}
@@ -2485,14 +2505,14 @@ skip_copy:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
@@ -2516,10 +2536,10 @@ recv_sndq:
goto out;
}
-int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len)
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
{
- int cmsg_flags = 0, ret, inq;
+ int cmsg_flags = 0, ret;
struct scm_timestamping_internal tss;
if (unlikely(flags & MSG_ERRQUEUE))
@@ -2528,19 +2548,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk_can_busy_loop(sk) &&
skb_queue_empty_lockless(&sk->sk_receive_queue) &&
sk->sk_state == TCP_ESTABLISHED)
- sk_busy_loop(sk, nonblock);
+ sk_busy_loop(sk, flags & MSG_DONTWAIT);
lock_sock(sk);
- ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
- &cmsg_flags);
+ ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
release_sock(sk);
- if (cmsg_flags && ret >= 0) {
+ if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
tcp_recv_timestamp(msg, sk, &tss);
- if (cmsg_flags & TCP_CMSG_INQ) {
- inq = tcp_inq_hint(sk);
- put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+ if (msg->msg_get_inq) {
+ msg->msg_inq = tcp_inq_hint(sk);
+ if (cmsg_flags & TCP_CMSG_INQ)
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
+ sizeof(msg->msg_inq), &msg->msg_inq);
}
}
return ret;
@@ -2694,7 +2715,8 @@ static void tcp_orphan_update(struct timer_list *unused)
static bool tcp_too_many_orphans(int shift)
{
- return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
+ return READ_ONCE(tcp_orphan_cache) << shift >
+ READ_ONCE(sysctl_tcp_max_orphans);
}
bool tcp_check_oom(struct sock *sk, int shift)
@@ -2964,7 +2986,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
- tp->urg_data = 0;
+ WRITE_ONCE(tp->urg_data, 0);
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -2992,7 +3014,7 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_rto_min = TCP_RTO_MIN;
icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- tp->snd_cwnd = TCP_INIT_CWND;
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
tp->delivered = 0;
@@ -3012,8 +3034,7 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
- dst_release(sk->sk_rx_dst);
- sk->sk_rx_dst = NULL;
+ dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
tp->segs_in = 0;
@@ -3059,7 +3080,6 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
}
-
sk_error_report(sk);
return 0;
}
@@ -3177,7 +3197,7 @@ static void tcp_enable_tx_delay(void)
* TCP_CORK can be set together with TCP_NODELAY and it is stronger than
* TCP_NODELAY.
*/
-static void __tcp_sock_set_cork(struct sock *sk, bool on)
+void __tcp_sock_set_cork(struct sock *sk, bool on)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -3205,7 +3225,7 @@ EXPORT_SYMBOL(tcp_sock_set_cork);
* However, when TCP_NODELAY is set we make an explicit push, which overrides
* even TCP_CORK for currently queued segments.
*/
-static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+void __tcp_sock_set_nodelay(struct sock *sk, bool on)
{
if (on) {
tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
@@ -3704,7 +3724,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_max_pacing_rate = rate64;
info->tcpi_reordering = tp->reordering;
- info->tcpi_snd_cwnd = tp->snd_cwnd;
+ info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
if (info->tcpi_state == TCP_LISTEN) {
/* listeners aliased fields :
@@ -3774,10 +3794,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
tcp_get_info_chrono_stats(tp, info);
info->tcpi_segs_out = tp->segs_out;
- info->tcpi_segs_in = tp->segs_in;
+
+ /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+ info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+ info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
info->tcpi_min_rtt = tcp_min_rtt(tp);
- info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
@@ -3873,7 +3895,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
rate64 = tcp_compute_delivery_rate(tp);
nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
- nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+ nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
@@ -4391,6 +4413,73 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke
}
EXPORT_SYMBOL(tcp_md5_hash_key);
+/* Called with rcu_read_lock() */
+enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int dif, int sdif)
+{
+ /*
+ * This gets called for each TCP segment that arrives
+ * so we want to be efficient.
+ * We have 3 drop cases:
+ * o No MD5 hash and one expected.
+ * o MD5 hash and we're not expecting one.
+ * o MD5 hash and its wrong.
+ */
+ const __u8 *hash_location = NULL;
+ struct tcp_md5sig_key *hash_expected;
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int genhash, l3index;
+ u8 newhash[16];
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
+
+ hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family);
+ hash_location = tcp_parse_md5sig_option(th);
+
+ /* We've parsed the options - do we have a hash? */
+ if (!hash_expected && !hash_location)
+ return SKB_NOT_DROPPED_YET;
+
+ if (hash_expected && !hash_location) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ return SKB_DROP_REASON_TCP_MD5NOTFOUND;
+ }
+
+ if (!hash_expected && hash_location) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
+ }
+
+ /* check the signature */
+ genhash = tp->af_specific->calc_md5_hash(newhash, hash_expected,
+ NULL, skb);
+
+ if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
+ if (family == AF_INET) {
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
+ saddr, ntohs(th->source),
+ daddr, ntohs(th->dest),
+ genhash ? " tcp_v4_calc_md5_hash failed"
+ : "", l3index);
+ } else {
+ net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
+ genhash ? "failed" : "mismatch",
+ saddr, ntohs(th->source),
+ daddr, ntohs(th->dest), l3index);
+ }
+ return SKB_DROP_REASON_TCP_MD5FAILURE;
+ }
+ return SKB_NOT_DROPPED_YET;
+}
+EXPORT_SYMBOL(tcp_inbound_md5_hash);
+
#endif
void tcp_done(struct sock *sk)
@@ -4507,7 +4596,6 @@ void __init tcp_init(void)
timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
- inet_hashinfo_init(&tcp_hashinfo);
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
thash_entries, 21, /* one slot per 2 MB*/
0, 64 * 1024);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index ec5550089b4d..075e744bfb48 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -276,7 +276,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
} else { /* no RTT sample yet */
rtt_us = USEC_PER_MSEC; /* use nominal default RTT */
}
- bw = (u64)tp->snd_cwnd * BW_UNIT;
+ bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
do_div(bw, rtt_us);
sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}
@@ -310,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
*/
bytes = min_t(unsigned long,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
- GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
+ GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
return min(segs, 0x7FU);
@@ -323,9 +323,9 @@ static void bbr_save_cwnd(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
- bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
+ bbr->prior_cwnd = tcp_snd_cwnd(tp); /* this cwnd is good enough */
else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
- bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+ bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
}
static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
@@ -482,7 +482,7 @@ static bool bbr_set_cwnd_to_recover_or_restore(
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
- u32 cwnd = tp->snd_cwnd;
+ u32 cwnd = tcp_snd_cwnd(tp);
/* An ACK for P pkts should release at most 2*P packets. We do this
* in two steps. First, here we deduct the number of lost packets.
@@ -520,7 +520,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 cwnd = tp->snd_cwnd, target_cwnd = 0;
+ u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;
if (!acked)
goto done; /* no packet fully ACKed; just apply caps */
@@ -544,9 +544,9 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
cwnd = max(cwnd, bbr_cwnd_min_target);
done:
- tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */
if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
- tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
}
/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
@@ -856,7 +856,7 @@ static void bbr_update_ack_aggregation(struct sock *sk,
bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
bbr->ack_epoch_acked + rs->acked_sacked);
extra_acked = bbr->ack_epoch_acked - expected_acked;
- extra_acked = min(extra_acked, tp->snd_cwnd);
+ extra_acked = min(extra_acked, tcp_snd_cwnd(tp));
if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
}
@@ -914,7 +914,7 @@ static void bbr_check_probe_rtt_done(struct sock *sk)
return;
bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */
- tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
bbr_reset_mode(sk);
}
@@ -1093,7 +1093,7 @@ static u32 bbr_undo_cwnd(struct sock *sk)
bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
bbr->full_bw_cnt = 0;
bbr_reset_lt_bw_sampling(sk);
- return tcp_sk(sk)->snd_cwnd;
+ return tcp_snd_cwnd(tcp_sk(sk));
}
/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
@@ -1154,7 +1154,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.set_state = bbr_set_state,
};
-BTF_SET_START(tcp_bbr_kfunc_ids)
+BTF_SET_START(tcp_bbr_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, bbr_init)
@@ -1167,25 +1167,27 @@ BTF_ID(func, bbr_min_tso_segs)
BTF_ID(func, bbr_set_state)
#endif
#endif
-BTF_SET_END(tcp_bbr_kfunc_ids)
+BTF_SET_END(tcp_bbr_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_bbr_check_kfunc_ids,
+};
static int __init bbr_register(void)
{
int ret;
BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
- ret = tcp_register_congestion_control(&tcp_bbr_cong_ops);
- if (ret)
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&tcp_bbr_cong_ops);
}
static void __exit bbr_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set);
tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index f5f588b1f6e9..58358bf92e1b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!acked)
return;
}
- bictcp_update(ca, tp->snd_cwnd);
+ bictcp_update(ca, tcp_snd_cwnd(tp));
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
@@ -166,16 +166,16 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
/* Wmax and fast convergence */
- if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
- ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
/ (2 * BICTCP_BETA_SCALE);
else
- ca->last_max_cwnd = tp->snd_cwnd;
+ ca->last_max_cwnd = tcp_snd_cwnd(tp);
- if (tp->snd_cwnd <= low_window)
- return max(tp->snd_cwnd >> 1U, 2U);
+ if (tcp_snd_cwnd(tp) <= low_window)
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
else
- return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+ return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index f70aa0932bd6..0d3f68bb51c0 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -138,10 +138,9 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
struct sk_psock *psock = sk_psock_get(sk);
int ret;
- if (unlikely(!psock)) {
- sk_msg_free(sk, msg);
- return 0;
- }
+ if (unlikely(!psock))
+ return -EPIPE;
+
ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
tcp_bpf_push_locked(sk, msg, bytes, flags, false);
sk_psock_put(sk, psock);
@@ -175,7 +174,6 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
static int tcp_bpf_recvmsg_parser(struct sock *sk,
struct msghdr *msg,
size_t len,
- int nonblock,
int flags,
int *addr_len)
{
@@ -187,7 +185,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
psock = sk_psock_get(sk);
if (unlikely(!psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
lock_sock(sk);
msg_bytes_ready:
@@ -196,19 +194,46 @@ msg_bytes_ready:
long timeo;
int data;
- timeo = sock_rcvtimeo(sk, nonblock);
+ if (sock_flag(sk, SOCK_DONE))
+ goto out;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ goto out;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto out;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ copied = -ENOTCONN;
+ goto out;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ if (!timeo) {
+ copied = -EAGAIN;
+ goto out;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ goto out;
+ }
+
data = tcp_msg_wait_data(sk, psock, timeo);
if (data && !sk_psock_queue_empty(psock))
goto msg_bytes_ready;
copied = -EAGAIN;
}
+out:
release_sock(sk);
sk_psock_put(sk, psock);
return copied;
}
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct sk_psock *psock;
int copied, ret;
@@ -218,11 +243,11 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
psock = sk_psock_get(sk);
if (unlikely(!psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
if (!skb_queue_empty(&sk->sk_receive_queue) &&
sk_psock_queue_empty(psock)) {
sk_psock_put(sk, psock);
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
}
lock_sock(sk);
msg_bytes_ready:
@@ -231,14 +256,14 @@ msg_bytes_ready:
long timeo;
int data;
- timeo = sock_rcvtimeo(sk, nonblock);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
data = tcp_msg_wait_data(sk, psock, timeo);
if (data) {
if (!sk_psock_queue_empty(psock))
goto msg_bytes_ready;
release_sock(sk);
sk_psock_put(sk, psock);
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
}
copied = -EAGAIN;
}
@@ -308,7 +333,7 @@ more_data:
cork = true;
psock->cork = NULL;
}
- sk_msg_return(sk, msg, tosend);
+ sk_msg_return(sk, msg, msg->sg.size);
release_sock(sk);
ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
@@ -348,8 +373,11 @@ more_data:
}
if (msg &&
msg->sg.data[msg->sg.start].page_link &&
- msg->sg.data[msg->sg.start].length)
+ msg->sg.data[msg->sg.start].length) {
+ if (eval == __SK_REDIRECT)
+ sk_mem_charge(sk, msg->sg.size);
goto more_data;
+ }
}
return ret;
}
@@ -583,9 +611,6 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
return 0;
}
- if (inet_csk_has_ulp(sk))
- return -EINVAL;
-
if (sk->sk_family == AF_INET6) {
if (tcp_bpf_assert_proto_ops(psock->sk_proto))
return -EINVAL;
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 709d23801823..ddc7ba0554bd 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -161,8 +161,8 @@ static void tcp_cdg_hystart_update(struct sock *sk)
LINUX_MIB_TCPHYSTARTTRAINDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
return;
}
}
@@ -180,8 +180,8 @@ static void tcp_cdg_hystart_update(struct sock *sk)
LINUX_MIB_TCPHYSTARTDELAYDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
}
}
@@ -252,7 +252,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
return false;
}
- ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd);
+ ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp));
ca->state = CDG_BACKOFF;
tcp_enter_cwr(sk);
return true;
@@ -285,14 +285,14 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
if (!tcp_is_cwnd_limited(sk)) {
- ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd);
+ ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp));
return;
}
- prior_snd_cwnd = tp->snd_cwnd;
+ prior_snd_cwnd = tcp_snd_cwnd(tp);
tcp_reno_cong_avoid(sk, ack, acked);
- incr = tp->snd_cwnd - prior_snd_cwnd;
+ incr = tcp_snd_cwnd(tp) - prior_snd_cwnd;
ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}
@@ -331,15 +331,15 @@ static u32 tcp_cdg_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (ca->state == CDG_BACKOFF)
- return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);
+ return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10);
if (ca->state == CDG_NONFULL && use_tolerance)
- return tp->snd_cwnd;
+ return tcp_snd_cwnd(tp);
- ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd);
+ ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp));
if (use_shadow)
- return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1);
- return max(2U, tp->snd_cwnd >> 1);
+ return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1);
+ return max(2U, tcp_snd_cwnd(tp) >> 1);
}
static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
@@ -357,7 +357,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
ca->gradients = gradients;
ca->rtt_seq = tp->snd_nxt;
- ca->shadow_wnd = tp->snd_cwnd;
+ ca->shadow_wnd = tcp_snd_cwnd(tp);
break;
case CA_EVENT_COMPLETE_CWR:
ca->state = CDG_UNKNOWN;
@@ -380,7 +380,7 @@ static void tcp_cdg_init(struct sock *sk)
ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
GFP_NOWAIT | __GFP_NOWARN);
ca->rtt_seq = tp->snd_nxt;
- ca->shadow_wnd = tp->snd_cwnd;
+ ca->shadow_wnd = tcp_snd_cwnd(tp);
}
static void tcp_cdg_release(struct sock *sk)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db5831e6c136..d3cae40749e8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
+#include <trace/events/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
@@ -33,6 +34,17 @@ struct tcp_congestion_ops *tcp_ca_find(const char *name)
return NULL;
}
+void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ trace_tcp_cong_state_set(sk, ca_state);
+
+ if (icsk->icsk_ca_ops->set_state)
+ icsk->icsk_ca_ops->set_state(sk, ca_state);
+ icsk->icsk_ca_state = ca_state;
+}
+
/* Must be called with rcu lock held */
static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
const char *name)
@@ -135,7 +147,6 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
return key;
}
-EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
@@ -151,7 +162,6 @@ char *tcp_ca_get_name_by_key(u32 key, char *buffer)
return ret;
}
-EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
@@ -395,10 +405,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
*/
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
+ u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);
- acked -= cwnd - tp->snd_cwnd;
- tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+ acked -= cwnd - tcp_snd_cwnd(tp);
+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));
return acked;
}
@@ -412,7 +422,7 @@ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
/* If credits accumulated at a higher w, apply them gently now. */
if (tp->snd_cwnd_cnt >= w) {
tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
}
tp->snd_cwnd_cnt += acked;
@@ -420,9 +430,9 @@ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
u32 delta = tp->snd_cwnd_cnt / w;
tp->snd_cwnd_cnt -= delta * w;
- tp->snd_cwnd += delta;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
}
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
@@ -447,7 +457,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
}
/* In dangerous area, increase slowly. */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
@@ -456,7 +466,7 @@ u32 tcp_reno_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd >> 1U, 2U);
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
@@ -464,7 +474,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd, tp->prior_cwnd);
+ return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index e07837e23b3f..68178e7280ce 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -334,7 +334,7 @@ static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!acked)
return;
}
- bictcp_update(ca, tp->snd_cwnd, acked);
+ bictcp_update(ca, tcp_snd_cwnd(tp), acked);
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
@@ -346,13 +346,13 @@ static u32 cubictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
/* Wmax and fast convergence */
- if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
- ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
/ (2 * BICTCP_BETA_SCALE);
else
- ca->last_max_cwnd = tp->snd_cwnd;
+ ca->last_max_cwnd = tcp_snd_cwnd(tp);
- return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+ return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
static void cubictcp_state(struct sock *sk, u8 new_state)
@@ -372,7 +372,7 @@ static void cubictcp_state(struct sock *sk, u8 new_state)
* We apply another 100% factor because @rate is doubled at this point.
* We cap the cushion to 1ms.
*/
-static u32 hystart_ack_delay(struct sock *sk)
+static u32 hystart_ack_delay(const struct sock *sk)
{
unsigned long rate;
@@ -380,7 +380,7 @@ static u32 hystart_ack_delay(struct sock *sk)
if (!rate)
return 0;
return min_t(u64, USEC_PER_MSEC,
- div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
+ div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate));
}
static void hystart_update(struct sock *sk, u32 delay)
@@ -413,13 +413,13 @@ static void hystart_update(struct sock *sk, u32 delay)
ca->found = 1;
pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
now - ca->round_start, threshold,
- ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
+ ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp));
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
}
}
@@ -438,8 +438,8 @@ static void hystart_update(struct sock *sk, u32 delay)
LINUX_MIB_TCPHYSTARTDELAYDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
}
}
@@ -469,7 +469,7 @@ static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
/* hystart triggers when cwnd is larger than some threshold */
if (!ca->found && tcp_in_slow_start(tp) && hystart &&
- tp->snd_cwnd >= hystart_low_window)
+ tcp_snd_cwnd(tp) >= hystart_low_window)
hystart_update(sk, delay);
}
@@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
.name = "cubic",
};
-BTF_SET_START(tcp_cubic_kfunc_ids)
+BTF_SET_START(tcp_cubic_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, cubictcp_init)
@@ -496,9 +496,12 @@ BTF_ID(func, cubictcp_cwnd_event)
BTF_ID(func, cubictcp_acked)
#endif
#endif
-BTF_SET_END(tcp_cubic_kfunc_ids)
+BTF_SET_END(tcp_cubic_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_cubic_check_kfunc_ids,
+};
static int __init cubictcp_register(void)
{
@@ -534,16 +537,14 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);
- ret = tcp_register_congestion_control(&cubictcp);
- if (ret)
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&cubictcp);
}
static void __exit cubictcp_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set);
tcp_unregister_congestion_control(&cubictcp);
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 0d7ab3cc7b61..ab034a4e9324 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -106,8 +106,8 @@ static u32 dctcp_ssthresh(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- ca->loss_cwnd = tp->snd_cwnd;
- return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+ ca->loss_cwnd = tcp_snd_cwnd(tp);
+ return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U);
}
static void dctcp_update_alpha(struct sock *sk, u32 flags)
@@ -148,8 +148,8 @@ static void dctcp_react_to_loss(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- ca->loss_cwnd = tp->snd_cwnd;
- tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
+ ca->loss_cwnd = tcp_snd_cwnd(tp);
+ tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
static void dctcp_state(struct sock *sk, u8 new_state)
@@ -211,8 +211,9 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
static u32 dctcp_cwnd_undo(struct sock *sk)
{
const struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+ return max(tcp_snd_cwnd(tp), ca->loss_cwnd);
}
static struct tcp_congestion_ops dctcp __read_mostly = {
@@ -238,7 +239,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.name = "dctcp-reno",
};
-BTF_SET_START(tcp_dctcp_kfunc_ids)
+BTF_SET_START(tcp_dctcp_check_kfunc_ids)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID(func, dctcp_init)
@@ -249,25 +250,27 @@ BTF_ID(func, dctcp_cwnd_undo)
BTF_ID(func, dctcp_state)
#endif
#endif
-BTF_SET_END(tcp_dctcp_kfunc_ids)
+BTF_SET_END(tcp_dctcp_check_kfunc_ids)
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set);
+static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .check_set = &tcp_dctcp_check_kfunc_ids,
+};
static int __init dctcp_register(void)
{
int ret;
BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
- ret = tcp_register_congestion_control(&dctcp);
- if (ret)
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
+ if (ret < 0)
return ret;
- register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
- return 0;
+ return tcp_register_congestion_control(&dctcp);
}
static void __exit dctcp_unregister(void)
{
- unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set);
tcp_unregister_congestion_control(&dctcp);
}
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 349069d6cd0a..c6de5ce79ad3 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -127,22 +127,22 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* snd_cwnd <=
* hstcp_aimd_vals[ca->ai].cwnd
*/
- if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
- while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+ if (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd) {
+ while (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd &&
ca->ai < HSTCP_AIMD_MAX - 1)
ca->ai++;
- } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
- while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
+ } else if (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd) {
+ while (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd)
ca->ai--;
}
/* Do additive increase */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) {
/* cwnd = cwnd + a(w) / cwnd */
tp->snd_cwnd_cnt += ca->ai + 1;
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd_cnt -= tp->snd_cwnd;
- tp->snd_cwnd++;
+ if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ tp->snd_cwnd_cnt -= tcp_snd_cwnd(tp);
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
}
}
}
@@ -154,7 +154,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
struct hstcp *ca = inet_csk_ca(sk);
/* Do multiplicative decrease */
- return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+ return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 55adcfcf96fe..52b1f2665dfa 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -124,7 +124,7 @@ static void measure_achieved_throughput(struct sock *sk,
ca->packetcount += sample->pkts_acked;
- if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
+ if (ca->packetcount >= tcp_snd_cwnd(tp) - (ca->alpha >> 7 ? : 1) &&
now - ca->lasttime >= ca->minRTT &&
ca->minRTT > 0) {
__u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
@@ -225,7 +225,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
const struct htcp *ca = inet_csk_ca(sk);
htcp_param_update(sk);
- return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+ return max((tcp_snd_cwnd(tp) * ca->beta) >> 7, 2U);
}
static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -242,9 +242,9 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
*/
- if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
+ if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tcp_snd_cwnd(tp)) {
+ if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tp->snd_cwnd_cnt = 0;
htcp_alpha_update(ca);
} else
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index be39327e04e6..abd7d91807e5 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -54,7 +54,7 @@ static void hybla_init(struct sock *sk)
ca->rho2_7ls = 0;
ca->snd_cwnd_cents = 0;
ca->hybla_en = true;
- tp->snd_cwnd = 2;
+ tcp_snd_cwnd_set(tp, 2);
tp->snd_cwnd_clamp = 65535;
/* 1st Rho measurement based on initial srtt */
@@ -62,7 +62,7 @@ static void hybla_init(struct sock *sk)
/* set minimum rtt as this is the 1st ever seen */
ca->minrtt_us = tp->srtt_us;
- tp->snd_cwnd = ca->rho;
+ tcp_snd_cwnd_set(tp, ca->rho);
}
static void hybla_state(struct sock *sk, u8 ca_state)
@@ -137,31 +137,31 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* as long as increment is estimated as (rho<<7)/window
* it already is <<7 and we can easily count its fractions.
*/
- increment = ca->rho2_7ls / tp->snd_cwnd;
+ increment = ca->rho2_7ls / tcp_snd_cwnd(tp);
if (increment < 128)
tp->snd_cwnd_cnt++;
}
odd = increment % 128;
- tp->snd_cwnd += increment >> 7;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + (increment >> 7));
ca->snd_cwnd_cents += odd;
/* check when fractions goes >=128 and increase cwnd by 1. */
while (ca->snd_cwnd_cents >= 128) {
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
ca->snd_cwnd_cents -= 128;
tp->snd_cwnd_cnt = 0;
}
/* check when cwnd has not been incremented for a while */
- if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
+ if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tp->snd_cwnd_cnt = 0;
}
/* clamp down slowstart cwnd to ssthresh value. */
if (is_slowstart)
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_ssthresh));
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 00e54873213e..c0c81a2c77fa 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -224,7 +224,7 @@ static void update_params(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
- if (tp->snd_cwnd < win_thresh) {
+ if (tcp_snd_cwnd(tp) < win_thresh) {
ca->alpha = ALPHA_BASE;
ca->beta = BETA_BASE;
} else if (ca->cnt_rtt > 0) {
@@ -284,9 +284,9 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* tp->snd_cwnd += alpha/tp->snd_cwnd
*/
delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
- if (delta >= tp->snd_cwnd) {
- tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
- (u32)tp->snd_cwnd_clamp);
+ if (delta >= tcp_snd_cwnd(tp)) {
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp) + delta / tcp_snd_cwnd(tp),
+ (u32)tp->snd_cwnd_clamp));
tp->snd_cwnd_cnt = 0;
}
}
@@ -296,9 +296,11 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
+ u32 decr;
/* Multiplicative decrease */
- return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
+ decr = (tcp_snd_cwnd(tp) * ca->beta) >> BETA_SHIFT;
+ return max(tcp_snd_cwnd(tp) - decr, 2U);
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 246ab7b5e857..3ec4edc37313 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -414,7 +414,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
per_mss = roundup_pow_of_two(per_mss) +
SKB_DATA_ALIGN(sizeof(struct sk_buff));
- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+ nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp));
nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
/* Fast Recovery (RFC 5681 3.2) :
@@ -909,12 +909,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
* end of slow start and should slow down.
*/
- if (tp->snd_cwnd < tp->snd_ssthresh / 2)
+ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2)
rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
else
rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
- rate *= max(tp->snd_cwnd, tp->packets_out);
+ rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
if (likely(tp->srtt_us))
do_div(rate, tp->srtt_us);
@@ -1660,6 +1660,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
(mss != tcp_skb_seglen(skb)))
goto out;
+ if (!tcp_skb_can_collapse(prev, skb))
+ goto out;
len = skb->len;
pcount = tcp_skb_pcount(skb);
if (tcp_skb_shift(prev, skb, pcount, len))
@@ -2145,12 +2147,12 @@ void tcp_enter_loss(struct sock *sk)
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->prior_cwnd = tp->snd_cwnd;
+ tp->prior_cwnd = tcp_snd_cwnd(tp);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
- tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
+ tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1);
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
@@ -2456,7 +2458,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
msg,
&inet->inet_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
+ tcp_snd_cwnd(tp), tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
@@ -2465,7 +2467,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
&sk->sk_v6_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
+ tcp_snd_cwnd(tp), tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
@@ -2490,7 +2492,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
+ tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk));
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2597,7 +2599,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tp->high_seq = tp->snd_nxt;
tp->tlp_high_seq = 0;
tp->snd_cwnd_cnt = 0;
- tp->prior_cwnd = tp->snd_cwnd;
+ tp->prior_cwnd = tcp_snd_cwnd(tp);
tp->prr_delivered = 0;
tp->prr_out = 0;
tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
@@ -2618,16 +2620,16 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) {
- sndcnt = min_t(int, delta,
- max_t(int, tp->prr_delivered - tp->prr_out,
- newly_acked_sacked) + 1);
} else {
- sndcnt = min(delta, newly_acked_sacked);
+ sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
+ newly_acked_sacked);
+ if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
+ sndcnt++;
+ sndcnt = min(delta, sndcnt);
}
/* Force a fast retransmit upon entering fast recovery */
sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
- tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+ tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
}
static inline void tcp_end_cwnd_reduction(struct sock *sk)
@@ -2640,7 +2642,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
- tp->snd_cwnd = tp->snd_ssthresh;
+ tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
tp->snd_cwnd_stamp = tcp_jiffies32;
}
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
@@ -2704,12 +2706,15 @@ static void tcp_mtup_probe_success(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ u64 val;
- /* FIXME: breaks with very large cwnd */
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = tp->snd_cwnd *
- tcp_mss_to_mtu(sk, tp->mss_cache) /
- icsk->icsk_mtup.probe_size;
+
+ val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache);
+ do_div(val, icsk->icsk_mtup.probe_size);
+ DEBUG_NET_WARN_ON_ONCE((u32)val != val);
+ tcp_snd_cwnd_set(tp, max_t(u32, 1U, val));
+
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_ssthresh = tcp_current_ssthresh(sk);
@@ -3032,7 +3037,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tcp_simple_retransmit(sk);
return;
}
@@ -3601,7 +3606,7 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
}
/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
+static void tcp_send_challenge_ack(struct sock *sk)
{
/* unprotected vars, we dont care of overwrites */
static u32 challenge_timestamp;
@@ -3763,8 +3768,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - tp->max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
- tcp_send_challenge_ack(sk, skb);
- return -1;
+ tcp_send_challenge_ack(sk);
+ return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
}
goto old_ack;
}
@@ -3773,7 +3778,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* this segment (RFC793 Section 3.9).
*/
if (after(ack, tp->snd_nxt))
- return -1;
+ return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3865,7 +3870,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_process_tlp_ack(sk, ack, flag);
if (tcp_ack_is_dubious(sk, flag)) {
- if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
+ if (!(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
num_dupack = 1;
/* Consider if pure acks were aggregated in tcp_add_backlog() */
if (!(flag & FLAG_DATA))
@@ -4672,7 +4678,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
{
bool res = tcp_try_coalesce(sk, to, from, fragstolen);
- /* In case tcp_drop() is called later, update to->gso_segs */
+ /* In case tcp_drop_reason() is called later, update to->gso_segs */
if (res) {
u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
max_t(u16, 1, skb_shinfo(from)->gso_segs);
@@ -4682,10 +4688,11 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
return res;
}
-static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason)
{
sk_drops_add(sk, skb);
- __kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
/* This one checks to see if we can put data from the
@@ -4715,7 +4722,7 @@ static void tcp_ofo_queue(struct sock *sk)
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP);
continue;
}
@@ -4771,7 +4778,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
sk->sk_data_ready(sk);
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM);
return;
}
@@ -4834,7 +4841,8 @@ coalesce_done:
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb,
+ SKB_DROP_REASON_TCP_OFOMERGE);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4853,7 +4861,8 @@ coalesce_done:
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop_reason(sk, skb1,
+ SKB_DROP_REASON_TCP_OFOMERGE);
goto merge_right;
}
} else if (tcp_ooo_try_coalesce(sk, skb1,
@@ -4881,7 +4890,7 @@ merge_right:
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE);
}
/* If there is no skb after us, we are the last_skb ! */
if (!skb1)
@@ -4980,6 +4989,7 @@ void tcp_data_ready(struct sock *sk)
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ enum skb_drop_reason reason;
bool fragstolen;
int eaten;
@@ -4998,6 +5008,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
skb_dst_drop(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
@@ -5006,6 +5017,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
+ reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
@@ -5015,6 +5027,7 @@ queue_and_out:
if (skb_queue_len(&sk->sk_receive_queue) == 0)
sk_forced_mem_schedule(sk, skb->truesize);
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ reason = SKB_DROP_REASON_PROTO_MEM;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
sk->sk_data_ready(sk);
goto drop;
@@ -5051,6 +5064,7 @@ queue_and_out:
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case. Force an immediate ack. */
+ reason = SKB_DROP_REASON_TCP_OLD_DATA;
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -5058,13 +5072,16 @@ out_of_window:
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
return;
}
/* Out of window. F.e. zero window probe. */
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tp->rcv_nxt + tcp_receive_window(tp))) {
+ reason = SKB_DROP_REASON_TCP_OVERWINDOW;
goto out_of_window;
+ }
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
@@ -5074,6 +5091,7 @@ drop:
* remembering D-SACK for its head made in previous line.
*/
if (!tcp_receive_window(tp)) {
+ reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
@@ -5315,7 +5333,8 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
goal -= rb_to_skb(node)->truesize;
- tcp_drop(sk, rb_to_skb(node));
+ tcp_drop_reason(sk, rb_to_skb(node),
+ SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
if (!prev || goal <= 0) {
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -5417,7 +5436,7 @@ static bool tcp_should_expand_sndbuf(struct sock *sk)
return false;
/* If we filled the congestion window, do not expand. */
- if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp))
return false;
return true;
@@ -5435,7 +5454,17 @@ static void tcp_new_space(struct sock *sk)
INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}
-static void tcp_check_space(struct sock *sk)
+/* Caller made space either from:
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
+ *
+ * We might be able to generate EPOLLOUT to the application if:
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
+ * small enough that tcp_stream_memory_free() decides it
+ * is time to generate EPOLLOUT.
+ */
+void tcp_check_space(struct sock *sk)
{
/* pairs with tcp_poll() */
smp_mb();
@@ -5591,7 +5620,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
}
}
- tp->urg_data = TCP_URG_NOTYET;
+ WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
@@ -5604,11 +5633,11 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
struct tcp_sock *tp = tcp_sk(sk);
/* Check if we get a new urgent pointer - normally not. */
- if (th->urg)
+ if (unlikely(th->urg))
tcp_check_urg(sk, th);
/* Do we wait for any urgent data? - normally not... */
- if (tp->urg_data == TCP_URG_NOTYET) {
+ if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
th->syn;
@@ -5617,7 +5646,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
u8 tmp;
if (skb_copy_bits(skb, ptr, &tmp, 1))
BUG();
- tp->urg_data = TCP_URG_VALID | tmp;
+ WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk);
}
@@ -5648,7 +5677,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
- bool rst_seq_match = false;
+ SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
@@ -5660,6 +5689,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
+ SKB_DR_SET(reason, TCP_RFC7323_PAWS);
goto discard;
}
/* Reset is accepted even if it did not pass PAWS. */
@@ -5681,8 +5711,9 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
} else if (tcp_reset_check(sk, skb)) {
- tcp_reset(sk, skb);
+ goto reset;
}
+ SKB_DR_SET(reason, TCP_INVALID_SEQUENCE);
goto discard;
}
@@ -5698,9 +5729,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* Send a challenge ACK
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
- tcp_reset_check(sk, skb)) {
- rst_seq_match = true;
- } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
+ tcp_reset_check(sk, skb))
+ goto reset;
+
+ if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
struct tcp_sack_block *sp = &tp->selective_acks[0];
int max_sack = sp[0].end_seq;
int this_sack;
@@ -5713,21 +5745,18 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
}
if (TCP_SKB_CB(skb)->seq == max_sack)
- rst_seq_match = true;
+ goto reset;
}
- if (rst_seq_match)
- tcp_reset(sk, skb);
- else {
- /* Disable TFO if RST is out-of-order
- * and no data has been received
- * for current active TFO socket
- */
- if (tp->syn_fastopen && !tp->data_segs_in &&
- sk->sk_state == TCP_ESTABLISHED)
- tcp_fastopen_active_disable(sk);
- tcp_send_challenge_ack(sk, skb);
- }
+ /* Disable TFO if RST is out-of-order
+ * and no data has been received
+ * for current active TFO socket
+ */
+ if (tp->syn_fastopen && !tp->data_segs_in &&
+ sk->sk_state == TCP_ESTABLISHED)
+ tcp_fastopen_active_disable(sk);
+ tcp_send_challenge_ack(sk);
+ SKB_DR_SET(reason, TCP_RESET);
goto discard;
}
@@ -5741,7 +5770,8 @@ syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
- tcp_send_challenge_ack(sk, skb);
+ tcp_send_challenge_ack(sk);
+ SKB_DR_SET(reason, TCP_INVALID_SYN);
goto discard;
}
@@ -5750,7 +5780,12 @@ syn_challenge:
return true;
discard:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
+ return false;
+
+reset:
+ tcp_reset(sk, skb);
+ __kfree_skb(skb);
return false;
}
@@ -5779,6 +5814,7 @@ discard:
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int len = skb->len;
@@ -5787,7 +5823,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
trace_tcp_probe(sk, skb);
tcp_mstamp_refresh(tp);
- if (unlikely(!sk->sk_rx_dst))
+ if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
@@ -5867,6 +5903,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
return;
} else { /* Header too small */
+ reason = SKB_DROP_REASON_PKT_TOO_SMALL;
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
@@ -5894,6 +5931,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
+ skb_dst_drop(skb);
__skb_pull(skb, tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
@@ -5922,8 +5960,10 @@ slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
- if (!th->ack && !th->rst && !th->syn)
+ if (!th->ack && !th->rst && !th->syn) {
+ reason = SKB_DROP_REASON_TCP_FLAGS;
goto discard;
+ }
/*
* Standard slow path.
@@ -5933,9 +5973,11 @@ slow_path:
return;
step5:
- if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
+ reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
+ if ((int)reason < 0) {
+ reason = -reason;
goto discard;
-
+ }
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
@@ -5949,12 +5991,13 @@ step5:
return;
csum_error:
+ reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
}
EXPORT_SYMBOL(tcp_rcv_established);
@@ -5974,9 +6017,9 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
* retransmission has occurred.
*/
if (tp->total_retrans > 1 && tp->undo_marker)
- tp->snd_cwnd = 1;
+ tcp_snd_cwnd_set(tp, 1);
else
- tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
+ tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk)));
tp->snd_cwnd_stamp = tcp_jiffies32;
bpf_skops_established(sk, bpf_op, skb);
@@ -6112,6 +6155,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
bool fastopen_fail;
+ SKB_DR(reason);
tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
@@ -6154,7 +6198,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (th->rst) {
tcp_reset(sk, skb);
- goto discard;
+consume:
+ __kfree_skb(skb);
+ return 0;
}
/* rfc793:
@@ -6164,9 +6210,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* See note below!
* --ANK(990513)
*/
- if (!th->syn)
+ if (!th->syn) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard_and_undo;
-
+ }
/* rfc793:
* "If the SYN bit is on ...
* are acceptable then ...
@@ -6243,13 +6290,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
-
-discard:
- tcp_drop(sk, skb);
- return 0;
- } else {
- tcp_send_ack(sk);
+ goto consume;
}
+ tcp_send_ack(sk);
return -1;
}
@@ -6261,15 +6304,16 @@ discard:
*
* Otherwise (no ACK) drop the segment and return."
*/
-
+ SKB_DR_SET(reason, TCP_RESET);
goto discard_and_undo;
}
/* PAWS check. */
if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
- tcp_paws_reject(&tp->rx_opt, 0))
+ tcp_paws_reject(&tp->rx_opt, 0)) {
+ SKB_DR_SET(reason, TCP_RFC7323_PAWS);
goto discard_and_undo;
-
+ }
if (th->syn) {
/* We see SYN without ACK. It is attempt of
* simultaneous connect with crossed SYNs.
@@ -6318,7 +6362,7 @@ discard:
*/
return -1;
#else
- goto discard;
+ goto consume;
#endif
}
/* "fifth, if neither of the SYN or RST bits is set then
@@ -6328,7 +6372,8 @@ discard:
discard_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
- goto discard;
+ tcp_drop_reason(sk, skb, reason);
+ return 0;
reset_and_undo:
tcp_clear_options(&tp->rx_opt);
@@ -6383,21 +6428,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
struct request_sock *req;
int queued = 0;
bool acceptable;
+ SKB_DR(reason);
switch (sk->sk_state) {
case TCP_CLOSE:
+ SKB_DR_SET(reason, TCP_CLOSE);
goto discard;
case TCP_LISTEN:
if (th->ack)
return 1;
- if (th->rst)
+ if (th->rst) {
+ SKB_DR_SET(reason, TCP_RESET);
goto discard;
-
+ }
if (th->syn) {
- if (th->fin)
+ if (th->fin) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
+ }
/* It is possible that we process SYN packets from backlog,
* so we need to make sure to disable BH and RCU right there.
*/
@@ -6412,6 +6462,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
consume_skb(skb);
return 0;
}
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
case TCP_SYN_SENT:
@@ -6438,13 +6489,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- if (!tcp_check_req(sk, skb, req, true, &req_stolen))
+ if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
+ SKB_DR_SET(reason, TCP_FASTOPEN);
goto discard;
+ }
}
- if (!th->ack && !th->rst && !th->syn)
+ if (!th->ack && !th->rst && !th->syn) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
-
+ }
if (!tcp_validate_incoming(sk, skb, th, 0))
return 0;
@@ -6456,7 +6510,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!acceptable) {
if (sk->sk_state == TCP_SYN_RECV)
return 1; /* send one RST */
- tcp_send_challenge_ack(sk, skb);
+ tcp_send_challenge_ack(sk);
+ SKB_DR_SET(reason, TCP_OLD_ACK);
goto discard;
}
switch (sk->sk_state) {
@@ -6550,7 +6605,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
- goto discard;
+ goto consume;
}
break;
}
@@ -6558,7 +6613,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- goto discard;
+ goto consume;
}
break;
@@ -6566,7 +6621,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
- goto discard;
+ goto consume;
}
break;
}
@@ -6617,9 +6672,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!queued) {
discard:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
}
return 0;
+
+consume:
+ __kfree_skb(skb);
+ return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
@@ -6670,7 +6729,7 @@ static void tcp_ecn_create_request(struct request_sock *req,
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
- ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
+ ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
@@ -6701,7 +6760,8 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
- ireq->smc_ok = rx_opt->smc_ok;
+ ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
+ tcp_sk(sk)->smc_hs_congested(sk));
#endif
}
@@ -6723,6 +6783,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));
ireq->ireq_family = sk_listener->sk_family;
+ req->timeout = TCP_TIMEOUT_INIT;
}
return req;
@@ -6939,9 +7000,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
- if (!want_cookie)
- inet_csk_reqsk_queue_hash_add(sk, req,
- tcp_timeout_init((struct sock *)req));
+ if (!want_cookie) {
+ req->timeout = tcp_timeout_init((struct sock *)req);
+ inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
+ }
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13d868c43284..da5a3c44c4fb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -91,6 +91,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
+static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
return secure_tcp_seq(ip_hdr(skb)->daddr,
@@ -206,7 +208,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct rtable *rt;
int err;
struct ip_options_rcu *inet_opt;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+ struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -227,9 +229,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
orig_dport = usin->sin_port;
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_TCP,
- orig_sport, orig_dport, sk);
+ sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
+ orig_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
@@ -810,7 +811,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ sock_net_set(ctl_sk, net);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
@@ -825,6 +827,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
transmit_time);
ctl_sk->sk_mark = 0;
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable();
@@ -908,7 +911,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ sock_net_set(ctl_sk, net);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -921,6 +925,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
transmit_time);
ctl_sk->sk_mark = 0;
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable();
}
@@ -1182,7 +1187,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
if (!md5sig)
return -ENOMEM;
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ sk_gso_disable(sk);
INIT_HLIST_HEAD(&md5sig->head);
rcu_assign_pointer(tp->md5sig_info, md5sig);
}
@@ -1202,8 +1207,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
key->l3index = l3index;
key->flags = flags;
memcpy(&key->addr, addr,
- (family == AF_INET6) ? sizeof(struct in6_addr) :
- sizeof(struct in_addr));
+ (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
+ sizeof(struct in_addr));
hlist_add_head_rcu(&key->node, &md5sig->head);
return 0;
}
@@ -1403,72 +1408,6 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
#endif
-/* Called with rcu_read_lock() */
-static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb,
- int dif, int sdif)
-{
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * This gets called for each TCP segment that arrives
- * so we want to be efficient.
- * We have 3 drop cases:
- * o No MD5 hash and one expected.
- * o MD5 hash and we're not expecting one.
- * o MD5 hash and its wrong.
- */
- const __u8 *hash_location = NULL;
- struct tcp_md5sig_key *hash_expected;
- const struct iphdr *iph = ip_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
- const union tcp_md5_addr *addr;
- unsigned char newhash[16];
- int genhash, l3index;
-
- /* sdif set, means packet ingressed via a device
- * in an L3 domain and dif is set to the l3mdev
- */
- l3index = sdif ? dif : 0;
-
- addr = (union tcp_md5_addr *)&iph->saddr;
- hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
- hash_location = tcp_parse_md5sig_option(th);
-
- /* We've parsed the options - do we have a hash? */
- if (!hash_expected && !hash_location)
- return false;
-
- if (hash_expected && !hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
- return true;
- }
-
- if (!hash_expected && hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- return true;
- }
-
- /* Okay, so this is hash_expected and hash_location -
- * so we need to calculate the checksum.
- */
- genhash = tcp_v4_md5_hash_skb(newhash,
- hash_expected,
- NULL, skb);
-
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
- &iph->saddr, ntohs(th->source),
- &iph->daddr, ntohs(th->dest),
- genhash ? " tcp_v4_calc_md5_hash failed"
- : "", l3index);
- return true;
- }
- return false;
-#endif
- return false;
-}
-
static void tcp_v4_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
@@ -1620,7 +1559,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
*/
tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
key->key, key->keylen, GFP_ATOMIC);
- sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
+ sk_gso_disable(newsk);
}
#endif
@@ -1698,10 +1637,14 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
struct sock *rsk;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- struct dst_entry *dst = sk->sk_rx_dst;
+ struct dst_entry *dst;
+
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
+ lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
@@ -1709,14 +1652,15 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
!INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
dst, 0)) {
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
dst_release(dst);
- sk->sk_rx_dst = NULL;
}
}
tcp_rcv_established(sk, skb);
return 0;
}
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (tcp_checksum_complete(skb))
goto csum_err;
@@ -1744,7 +1688,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
reset:
tcp_v4_send_reset(rsk, skb);
discard:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
@@ -1753,6 +1697,7 @@ discard:
return 0;
csum_err:
+ reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
@@ -1786,7 +1731,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk_fullsock(sk)) {
- struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, 0);
@@ -1798,10 +1743,10 @@ int tcp_v4_early_demux(struct sk_buff *skb)
return 0;
}
-bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
{
- u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
- u32 tail_gso_size, tail_gso_segs;
+ u32 limit, tail_gso_size, tail_gso_segs;
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
@@ -1825,6 +1770,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
if (unlikely(tcp_checksum_complete(skb))) {
bh_unlock_sock(sk);
trace_tcp_bad_csum(skb);
+ *reason = SKB_DROP_REASON_TCP_CSUM;
__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
return true;
@@ -1909,10 +1855,11 @@ no_coalesce:
* to reduce memory overhead, so add a little headroom here.
* Few sockets backlog are possibly concurrently non empty.
*/
- limit += 64*1024;
+ limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
+ *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
return true;
}
@@ -1963,6 +1910,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ enum skb_drop_reason drop_reason;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
@@ -1971,6 +1919,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
struct sock *sk;
int ret;
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1982,8 +1931,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
- if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
goto bad_packet;
+ }
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
@@ -2013,7 +1964,13 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+ else
+ drop_reason = tcp_inbound_md5_hash(sk, skb,
+ &iph->saddr, &iph->daddr,
+ AF_INET, dif, sdif);
+ if (unlikely(drop_reason)) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
@@ -2045,6 +2002,8 @@ process:
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
+ } else {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
}
if (!nsk) {
reqsk_put(req);
@@ -2060,6 +2019,7 @@ process:
}
goto discard_and_relse;
}
+ nf_reset_ct(skb);
if (nsk == sk) {
reqsk_put(req);
tcp_v4_restore_cb(skb);
@@ -2080,16 +2040,22 @@ process:
}
}
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto discard_and_relse;
+ }
- if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
+ drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
+ &iph->daddr, AF_INET, dif, sdif);
+ if (drop_reason)
goto discard_and_relse;
nf_reset_ct(skb);
- if (tcp_filter(sk, skb))
+ if (tcp_filter(sk, skb)) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto discard_and_relse;
+ }
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
@@ -2109,7 +2075,7 @@ process:
if (!sock_owned_by_user(sk)) {
ret = tcp_v4_do_rcv(sk, skb);
} else {
- if (tcp_add_backlog(sk, skb))
+ if (tcp_add_backlog(sk, skb, &drop_reason))
goto discard_and_relse;
}
bh_unlock_sock(sk);
@@ -2121,6 +2087,7 @@ put_and_return:
return ret;
no_tcp_socket:
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
@@ -2128,6 +2095,7 @@ no_tcp_socket:
if (tcp_checksum_complete(skb)) {
csum_error:
+ drop_reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
@@ -2137,8 +2105,9 @@ bad_packet:
}
discard_it:
+ SKB_DR_OR(drop_reason, NOT_SPECIFIED);
/* Discard frame. */
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return 0;
discard_and_relse:
@@ -2149,6 +2118,7 @@ discard_and_relse:
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
@@ -2201,7 +2171,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
struct dst_entry *dst = skb_dst(skb);
if (dst && dst_hold_safe(dst)) {
- sk->sk_rx_dst = dst;
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
sk->sk_rx_dst_ifindex = skb->skb_iif;
}
}
@@ -2318,16 +2288,15 @@ static void *listening_get_first(struct seq_file *seq)
st->offset = 0;
for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
struct inet_listen_hashbucket *ilb2;
- struct inet_connection_sock *icsk;
+ struct hlist_nulls_node *node;
struct sock *sk;
ilb2 = &tcp_hashinfo.lhash2[st->bucket];
- if (hlist_empty(&ilb2->head))
+ if (hlist_nulls_empty(&ilb2->nulls_head))
continue;
spin_lock(&ilb2->lock);
- inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
- sk = (struct sock *)icsk;
+ sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
if (seq_sk_match(seq, sk))
return sk;
}
@@ -2346,15 +2315,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
{
struct tcp_iter_state *st = seq->private;
struct inet_listen_hashbucket *ilb2;
- struct inet_connection_sock *icsk;
+ struct hlist_nulls_node *node;
struct sock *sk = cur;
++st->num;
++st->offset;
- icsk = inet_csk(sk);
- inet_lhash2_for_each_icsk_continue(icsk) {
- sk = (struct sock *)icsk;
+ sk = sk_nulls_next(sk);
+ sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk))
return sk;
}
@@ -2654,7 +2622,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
- tp->snd_cwnd,
+ tcp_snd_cwnd(tp),
state == TCP_LISTEN ?
fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
@@ -2763,16 +2731,15 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
{
struct bpf_tcp_iter_state *iter = seq->private;
struct tcp_iter_state *st = &iter->state;
- struct inet_connection_sock *icsk;
+ struct hlist_nulls_node *node;
unsigned int expected = 1;
struct sock *sk;
sock_hold(start_sk);
iter->batch[iter->end_sk++] = start_sk;
- icsk = inet_csk(start_sk);
- inet_lhash2_for_each_icsk_continue(icsk) {
- sk = (struct sock *)icsk;
+ sk = sk_nulls_next(start_sk);
+ sk_nulls_for_each_from(sk, node) {
if (seq_sk_match(seq, sk)) {
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
@@ -2991,7 +2958,7 @@ static unsigned short seq_file_family(const struct seq_file *seq)
#endif
/* Iterated from proc fs */
- afinfo = PDE_DATA(file_inode(seq->file));
+ afinfo = pde_data(file_inode(seq->file));
return afinfo->family;
}
@@ -3073,6 +3040,7 @@ struct proto tcp_prot = {
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = tcp_bpf_update_proto,
#endif
@@ -3099,41 +3067,18 @@ EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
- int cpu;
+ struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
if (net->ipv4.tcp_congestion_control)
bpf_module_put(net->ipv4.tcp_congestion_control,
net->ipv4.tcp_congestion_control->owner);
-
- for_each_possible_cpu(cpu)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
- free_percpu(net->ipv4.tcp_sk);
+ if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
+ kfree(tcp_death_row);
}
static int __net_init tcp_sk_init(struct net *net)
{
- int res, cpu, cnt;
-
- net->ipv4.tcp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.tcp_sk)
- return -ENOMEM;
-
- for_each_possible_cpu(cpu) {
- struct sock *sk;
-
- res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
- IPPROTO_TCP, net);
- if (res)
- goto fail;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-
- /* Please enforce IP_DF and IPID==0 for RST and
- * ACK sent in SYN-RECV and TIME-WAIT state.
- */
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
-
- *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
- }
+ int cnt;
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3160,9 +3105,13 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tw_reuse = 2;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
+ net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
+ if (!net->ipv4.tcp_death_row)
+ return -ENOMEM;
+ refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
- net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+ net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
+ net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
@@ -3188,6 +3137,7 @@ static int __net_init tcp_sk_init(struct net *net)
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
+ net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
net->ipv4.sysctl_tcp_autocorking = 1;
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
@@ -3217,10 +3167,6 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.tcp_congestion_control = &tcp_reno;
return 0;
-fail:
- tcp_sk_exit(net);
-
- return res;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
@@ -3314,6 +3260,24 @@ static void __init bpf_iter_register(void)
void __init tcp_v4_init(void)
{
+ int cpu, res;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, &init_net);
+ if (res)
+ panic("Failed to create the TCP control socket.\n");
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ /* Please enforce IP_DF and IPID==0 for RST and
+ * ACK sent in SYN-RECV and TIME-WAIT state.
+ */
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+ per_cpu(ipv4_tcp_sk, cpu) = sk;
+ }
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 82b36ec3f2f8..ae36780977d2 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -297,7 +297,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
lp->flag &= ~LP_WITHIN_THR;
pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
- tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+ tcp_snd_cwnd(tp), lp->remote_hz, lp->owd_min, lp->owd_max,
lp->sowd >> 3);
if (lp->flag & LP_WITHIN_THR)
@@ -313,12 +313,12 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
/* happened within inference
* drop snd_cwnd into 1 */
if (lp->flag & LP_WITHIN_INF)
- tp->snd_cwnd = 1U;
+ tcp_snd_cwnd_set(tp, 1U);
/* happened after inference
* cut snd_cwnd into half */
else
- tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp) >> 1U, 1U));
/* record this drop time */
lp->last_drop = now;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0588b004ddac..7029b0e98edb 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -388,15 +388,15 @@ void tcp_update_metrics(struct sock *sk)
if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
- if (val && (tp->snd_cwnd >> 1) > val)
+ if (val && (tcp_snd_cwnd(tp) >> 1) > val)
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
- tp->snd_cwnd >> 1);
+ tcp_snd_cwnd(tp) >> 1);
}
if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
val = tcp_metric_get(tm, TCP_METRIC_CWND);
- if (tp->snd_cwnd > val)
+ if (tcp_snd_cwnd(tp) > val)
tcp_metric_set(tm, TCP_METRIC_CWND,
- tp->snd_cwnd);
+ tcp_snd_cwnd(tp));
}
} else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
@@ -404,10 +404,10 @@ void tcp_update_metrics(struct sock *sk)
if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
val = tcp_metric_get(tm, TCP_METRIC_CWND);
- tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1);
}
} else {
/* Else slow start did not finish, cwnd is non-sense,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7c2d3ac2363a..6854bb1fb32b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -248,7 +248,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
struct inet_timewait_sock *tw;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+ struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
tw = inet_twsk_alloc(sk, tcp_death_row, state);
@@ -531,7 +531,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
- if (newtp->af_specific->md5_lookup(sk, newsk))
+ if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req)))
newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
@@ -583,7 +583,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -622,8 +622,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
!inet_rtx_syn_ack(sk, req)) {
unsigned long expires = jiffies;
- expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
- TCP_RTO_MAX);
+ expires += reqsk_timeout(req, TCP_RTO_MAX);
if (!fastopen)
mod_timer_pending(&req->rsk_timer, expires);
else
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index ab552356bdba..a60662f4bdf9 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -197,10 +197,10 @@ static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
if (ca->cwnd_growth_factor < 0) {
- cnt = tp->snd_cwnd << -ca->cwnd_growth_factor;
+ cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor;
tcp_cong_avoid_ai(tp, cnt, acked);
} else {
- cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor);
+ cnt = max(4U, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor);
tcp_cong_avoid_ai(tp, cnt, acked);
}
}
@@ -209,7 +209,7 @@ static u32 tcpnv_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
+ return max((tcp_snd_cwnd(tp) * nv_loss_dec_factor) >> 10, 2U);
}
static void tcpnv_state(struct sock *sk, u8 new_state)
@@ -257,7 +257,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
return;
/* Stop cwnd growth if we were in catch up mode */
- if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) {
+ if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) {
ca->nv_catchup = 0;
ca->nv_allow_cwnd_growth = 0;
}
@@ -371,7 +371,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
* if cwnd < max_win, grow cwnd
* else leave the same
*/
- if (tp->snd_cwnd > max_win) {
+ if (tcp_snd_cwnd(tp) > max_win) {
/* there is congestion, check that it is ok
* to make a CA decision
* 1. We should have at least nv_dec_eval_min_calls
@@ -398,20 +398,20 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
ca->nv_allow_cwnd_growth = 0;
tp->snd_ssthresh =
(nv_ssthresh_factor * max_win) >> 3;
- if (tp->snd_cwnd - max_win > 2) {
+ if (tcp_snd_cwnd(tp) - max_win > 2) {
/* gap > 2, we do exponential cwnd decrease */
int dec;
- dec = max(2U, ((tp->snd_cwnd - max_win) *
+ dec = max(2U, ((tcp_snd_cwnd(tp) - max_win) *
nv_cong_dec_mult) >> 7);
- tp->snd_cwnd -= dec;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - dec);
} else if (nv_cong_dec_mult > 0) {
- tp->snd_cwnd = max_win;
+ tcp_snd_cwnd_set(tp, max_win);
}
if (ca->cwnd_growth_factor > 0)
ca->cwnd_growth_factor = 0;
ca->nv_no_cong_cnt = 0;
- } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) {
+ } else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) {
/* There is no congestion, grow cwnd if allowed*/
if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls)
return;
@@ -444,8 +444,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
* (it wasn't before, if it is now is because nv
* decreased it).
*/
- if (tp->snd_cwnd < nv_min_cwnd)
- tp->snd_cwnd = nv_min_cwnd;
+ if (tcp_snd_cwnd(tp) < nv_min_cwnd)
+ tcp_snd_cwnd_set(tp, nv_min_cwnd);
}
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index fc61cd3fea65..30abde86db45 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -8,6 +8,7 @@
#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
+#include <net/gro.h>
#include <net/tcp.h>
#include <net/protocol.h>
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2e6e5a70168e..11aa0ab10bba 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -82,6 +82,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
+ tcp_check_space(sk);
}
/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
@@ -142,7 +143,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
- u32 cwnd = tp->snd_cwnd;
+ u32 cwnd = tcp_snd_cwnd(tp);
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -151,7 +152,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
- tp->snd_cwnd = max(cwnd, restart_cwnd);
+ tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_cwnd_used = 0;
}
@@ -323,7 +324,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
- bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
if (!use_ecn) {
@@ -345,7 +346,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
/* tp->ecn_flags are cleared at a later point in time when
* SYN ACK is ultimatively being received.
*/
@@ -444,12 +445,13 @@ struct tcp_out_options {
struct mptcp_out_options mptcp;
};
-static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
+static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
+ struct tcp_sock *tp,
struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
if (unlikely(OPTION_MPTCP & opts->options))
- mptcp_write_options(ptr, tp, &opts->mptcp);
+ mptcp_write_options(th, ptr, tp, &opts->mptcp);
#endif
}
@@ -605,9 +607,10 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
* At least SACK_PERM as the first option is known to lead to a disaster
* (but it may well be that other scenarios fail similarly).
*/
-static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
struct tcp_out_options *opts)
{
+ __be32 *ptr = (__be32 *)(th + 1);
u16 options = opts->options; /* mungable copy */
if (unlikely(OPTION_MD5 & options)) {
@@ -701,7 +704,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
smc_options_write(ptr, &options);
- mptcp_options_write(ptr, tp, opts);
+ mptcp_options_write(th, ptr, tp, opts);
}
static void smc_set_option(const struct tcp_sock *tp,
@@ -1013,7 +1016,7 @@ static void tcp_tsq_write(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tp->lost_out > tp->retrans_out &&
- tp->snd_cwnd > tcp_packets_in_flight(tp)) {
+ tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
tcp_mstamp_refresh(tp);
tcp_xmit_retransmit_queue(sk);
}
@@ -1253,7 +1256,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
if (clone_it) {
oskb = skb;
@@ -1354,12 +1357,12 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
th->window = htons(min(tp->rcv_wnd, 65535U));
}
- tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ tcp_options_write(th, tp, &opts);
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ sk_gso_disable(sk);
tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, skb);
}
@@ -1550,7 +1553,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
* SO_SNDBUF values.
* Also allow first and last skb in retransmit queue to be split.
*/
- limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
+ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
skb != tcp_rtx_queue_head(sk) &&
@@ -1589,7 +1592,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
skb_split(skb, buff, len);
- buff->tstamp = skb->tstamp;
+ skb_set_delivery_time(buff, skb->tstamp, true);
tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
@@ -1860,9 +1863,9 @@ static void tcp_cwnd_application_limited(struct sock *sk)
/* Limited by application or receiver window. */
u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 win_used = max(tp->snd_cwnd_used, init_win);
- if (win_used < tp->snd_cwnd) {
+ if (win_used < tcp_snd_cwnd(tp)) {
tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+ tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
}
tp->snd_cwnd_used = 0;
}
@@ -1951,25 +1954,34 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
}
/* Return how many segs we'd like on a TSO packet,
- * to send one TSO packet per ms
+ * depending on current pacing rate, and how close the peer is.
+ *
+ * Rationale is:
+ * - For close peers, we rather send bigger packets to reduce
+ * cpu costs, because occasional losses will be repaired fast.
+ * - For long distance/rtt flows, we would like to get ACK clocking
+ * with 1 ACK per ms.
+ *
+ * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We we cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
*/
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs)
{
- u32 bytes, segs;
+ unsigned long bytes;
+ u32 r;
- bytes = min_t(unsigned long,
- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+ bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
- /* Goal is to send at least one packet per ms,
- * not one big TSO packet every 100 ms.
- * This preserves ACK clocking and is consistent
- * with tcp_tso_should_defer() heuristic.
- */
- segs = max_t(u32, bytes / mss_now, min_tso_segs);
+ r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log;
+ if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
+ bytes += sk->sk_gso_max_size >> r;
- return segs;
+ bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
+
+ return max_t(u32, bytes / mss_now, min_tso_segs);
}
/* Return the number of segments we want in the skb we are transmitting.
@@ -2034,7 +2046,7 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
return 1;
in_flight = tcp_packets_in_flight(tp);
- cwnd = tp->snd_cwnd;
+ cwnd = tcp_snd_cwnd(tp);
if (in_flight >= cwnd)
return 0;
@@ -2187,12 +2199,12 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
in_flight = tcp_packets_in_flight(tp);
BUG_ON(tcp_skb_pcount(skb) <= 1);
- BUG_ON(tp->snd_cwnd <= in_flight);
+ BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* From in_flight test above, we know that cwnd > in_flight. */
- cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
limit = min(send_win, cong_win);
@@ -2206,7 +2218,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
- u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+ u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
/* If at least some fraction of a window is available,
* just use it.
@@ -2336,7 +2348,7 @@ static int tcp_mtu_probe(struct sock *sk)
if (likely(!icsk->icsk_mtup.enabled ||
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
+ tcp_snd_cwnd(tp) < 11 ||
tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
@@ -2372,7 +2384,7 @@ static int tcp_mtu_probe(struct sock *sk)
return 0;
/* Do we need to wait to drain cwnd? With none in flight, don't stall */
- if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
+ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
if (!tcp_packets_in_flight(tp))
return -1;
else
@@ -2441,7 +2453,7 @@ static int tcp_mtu_probe(struct sock *sk)
if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
/* Decrement cwnd here because we are sending
* effectively two packets. */
- tp->snd_cwnd--;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
tcp_event_new_data_sent(sk, nskb);
icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
@@ -2616,7 +2628,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
- skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
@@ -2698,7 +2711,7 @@ repair:
else
tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
- is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
if (likely(sent_pkts || is_cwnd_limited))
tcp_cwnd_validate(sk, is_cwnd_limited);
@@ -2808,7 +2821,7 @@ void tcp_send_loss_probe(struct sock *sk)
if (unlikely(!skb)) {
WARN_ONCE(tp->packets_out,
"invalid inflight: %u state %u cwnd %u mss %d\n",
- tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
+ tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss);
inet_csk(sk)->icsk_pending = 0;
return;
}
@@ -3292,7 +3305,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (!hole)
tp->retransmit_skb_hint = skb;
- segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+ segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
if (segs <= 0)
break;
sacked = TCP_SKB_CB(skb)->sacked;
@@ -3541,11 +3554,12 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
- skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
+ skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
+ true);
else
#endif
{
- skb->skb_mstamp_ns = now;
+ skb_set_delivery_time(skb, now, true);
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
}
@@ -3579,7 +3593,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), NULL, &opts);
+ tcp_options_write(th, NULL, &opts);
th->doff = (tcp_header_size >> 2);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
@@ -3594,7 +3608,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
synack_type, &opts);
- skb->skb_mstamp_ns = now;
+ skb_set_delivery_time(skb, now, true);
tcp_add_tx_delay(skb, tp);
return skb;
@@ -3719,6 +3733,7 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
*/
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
int space, err = 0;
@@ -3733,8 +3748,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
* private TCP options. The cost is reduced data space in SYN :(
*/
tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+ /* Sync mss_cache after updating the mss_clamp */
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
space = min_t(size_t, space, fo->size);
@@ -3771,7 +3788,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
+ skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true);
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
@@ -4092,12 +4109,14 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
struct flowi fl;
int res;
- tcp_rsk(req)->txhash = net_tx_rndhash();
+ /* Paired with WRITE_ONCE() in sock_setsockopt() */
+ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+ tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
NULL);
if (!res) {
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
if (unlikely(tcp_passive_fastopen(sk)))
tcp_sk(sk)->total_retrans++;
trace_tcp_retransmit_synack(sk, req);
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index fbab921670cc..a8f6d9d06f2e 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -74,27 +74,32 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
*
* If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
* called multiple times. We favor the information from the most recently
- * sent skb, i.e., the skb with the highest prior_delivered count.
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
*/
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ u64 tx_tstamp;
if (!scb->tx.delivered_mstamp)
return;
+ tx_tstamp = tcp_skb_timestamp_us(skb);
if (!rs->prior_delivered ||
- after(scb->tx.delivered, rs->prior_delivered)) {
+ tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+ scb->end_seq, rs->last_end_seq)) {
rs->prior_delivered_ce = scb->tx.delivered_ce;
rs->prior_delivered = scb->tx.delivered;
rs->prior_mstamp = scb->tx.delivered_mstamp;
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->last_end_seq = scb->end_seq;
/* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = tcp_skb_timestamp_us(skb);
+ tp->first_tx_mstamp = tx_tstamp;
/* Find the duration of the "send phase" of this window: */
rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
scb->tx.first_tx_mstamp);
@@ -195,7 +200,7 @@ void tcp_rate_check_app_limited(struct sock *sk)
/* Nothing in sending host's qdisc queues or NIC tx queue. */
sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
/* We are not limited by CWND. */
- tcp_packets_in_flight(tp) < tp->snd_cwnd &&
+ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
/* All lost packets have been retransmitted. */
tp->lost_out <= tp->retrans_out)
tp->app_limited =
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index fd113f6226ef..48f30e7209f2 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,11 +2,6 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
-{
- return t1 > t2 || (t1 == t2 && after(seq1, seq2));
-}
-
static u32 tcp_rack_reo_wnd(const struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -77,9 +72,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (!tcp_rack_sent_after(tp->rack.mstamp,
- tcp_skb_timestamp_us(skb),
- tp->rack.end_seq, scb->end_seq))
+ if (!tcp_skb_sent_after(tp->rack.mstamp,
+ tcp_skb_timestamp_us(skb),
+ tp->rack.end_seq, scb->end_seq))
break;
/* A packet is lost if it has not been s/acked beyond
@@ -140,8 +135,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
}
tp->rack.advanced = 1;
tp->rack.rtt_us = rtt_us;
- if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
- end_seq, tp->rack.end_seq)) {
+ if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+ end_seq, tp->rack.end_seq)) {
tp->rack.mstamp = xmit_time;
tp->rack.end_seq = end_seq;
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 5842081bc8a2..862b96248a92 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -27,7 +27,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!acked)
return;
}
- tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT),
acked);
}
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
+ return max(tcp_snd_cwnd(tp) - (tcp_snd_cwnd(tp)>>TCP_SCALABLE_MD_SCALE), 2U);
}
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c8003c8aad2c..786848ad37ea 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -159,7 +159,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
{
- return min(tp->snd_ssthresh, tp->snd_cwnd);
+ return min(tp->snd_ssthresh, tcp_snd_cwnd(tp));
}
static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -217,14 +217,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* This is:
* (actual rate in segments) * baseRTT
*/
- target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+ target_cwnd = (u64)tcp_snd_cwnd(tp) * vegas->baseRTT;
do_div(target_cwnd, rtt);
/* Calculate the difference between the window we had,
* and the window we would like to have. This quantity
* is the "Diff" from the Arizona Vegas papers.
*/
- diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
+ diff = tcp_snd_cwnd(tp) * (rtt-vegas->baseRTT) / vegas->baseRTT;
if (diff > gamma && tcp_in_slow_start(tp)) {
/* Going too fast. Time to slow down
@@ -238,7 +238,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* truncation robs us of full link
* utilization.
*/
- tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp),
+ (u32)target_cwnd + 1));
tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
} else if (tcp_in_slow_start(tp)) {
@@ -254,14 +255,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* The old window was too fast, so
* we slow down.
*/
- tp->snd_cwnd--;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
tp->snd_ssthresh
= tcp_vegas_ssthresh(tp);
} else if (diff < alpha) {
/* We don't have enough extra packets
* in the network, so speed up.
*/
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
} else {
/* Sending just as fast as we
* should be.
@@ -269,10 +270,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
}
- if (tp->snd_cwnd < 2)
- tp->snd_cwnd = 2;
- else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
- tp->snd_cwnd = tp->snd_cwnd_clamp;
+ if (tcp_snd_cwnd(tp) < 2)
+ tcp_snd_cwnd_set(tp, 2);
+ else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
tp->snd_ssthresh = tcp_current_ssthresh(sk);
}
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index cd50a61c9976..366ff6f214b2 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -146,11 +146,11 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rtt = veno->minrtt;
- target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
+ target_cwnd = (u64)tcp_snd_cwnd(tp) * veno->basertt;
target_cwnd <<= V_PARAM_SHIFT;
do_div(target_cwnd, rtt);
- veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+ veno->diff = (tcp_snd_cwnd(tp) << V_PARAM_SHIFT) - target_cwnd;
if (tcp_in_slow_start(tp)) {
/* Slow start. */
@@ -164,15 +164,15 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* In the "non-congestive state", increase cwnd
* every rtt.
*/
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
} else {
/* In the "congestive state", increase cwnd
* every other rtt.
*/
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
if (veno->inc &&
- tp->snd_cwnd < tp->snd_cwnd_clamp) {
- tp->snd_cwnd++;
+ tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) {
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
veno->inc = 0;
} else
veno->inc = 1;
@@ -181,10 +181,10 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tp->snd_cwnd_cnt += acked;
}
done:
- if (tp->snd_cwnd < 2)
- tp->snd_cwnd = 2;
- else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
- tp->snd_cwnd = tp->snd_cwnd_clamp;
+ if (tcp_snd_cwnd(tp) < 2)
+ tcp_snd_cwnd_set(tp, 2);
+ else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
}
/* Wipe the slate clean for the next rtt. */
/* veno->cntrtt = 0; */
@@ -199,10 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
- return max(tp->snd_cwnd * 4 / 5, 2U);
+ return max(tcp_snd_cwnd(tp) * 4 / 5, 2U);
else
/* in "congestive state", cut cwnd by 1/2 */
- return max(tp->snd_cwnd >> 1U, 2U);
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
static struct tcp_congestion_ops tcp_veno __read_mostly = {
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index b2e05c4cea00..c6e97141eef2 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -244,7 +244,8 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
switch (event) {
case CA_EVENT_COMPLETE_CWR:
- tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
break;
case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 07c4c93b9fdb..18b07ff5d20e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -71,11 +71,11 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!yeah->doing_reno_now) {
/* Scalable */
- tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT),
acked);
} else {
/* Reno */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
@@ -130,7 +130,7 @@ do_vegas:
/* Compute excess number of packets above bandwidth
* Avoid doing full 64 bit divide.
*/
- bw = tp->snd_cwnd;
+ bw = tcp_snd_cwnd(tp);
bw *= rtt - yeah->vegas.baseRTT;
do_div(bw, rtt);
queue = bw;
@@ -138,20 +138,20 @@ do_vegas:
if (queue > TCP_YEAH_ALPHA ||
rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
if (queue > TCP_YEAH_ALPHA &&
- tp->snd_cwnd > yeah->reno_count) {
+ tcp_snd_cwnd(tp) > yeah->reno_count) {
u32 reduction = min(queue / TCP_YEAH_GAMMA ,
- tp->snd_cwnd >> TCP_YEAH_EPSILON);
+ tcp_snd_cwnd(tp) >> TCP_YEAH_EPSILON);
- tp->snd_cwnd -= reduction;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - reduction);
- tp->snd_cwnd = max(tp->snd_cwnd,
- yeah->reno_count);
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp),
+ yeah->reno_count));
- tp->snd_ssthresh = tp->snd_cwnd;
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
if (yeah->reno_count <= 2)
- yeah->reno_count = max(tp->snd_cwnd>>1, 2U);
+ yeah->reno_count = max(tcp_snd_cwnd(tp)>>1, 2U);
else
yeah->reno_count++;
@@ -176,7 +176,7 @@ do_vegas:
*/
yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt;
yeah->vegas.beg_snd_nxt = tp->snd_nxt;
- yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;
+ yeah->vegas.beg_snd_cwnd = tcp_snd_cwnd(tp);
/* Wipe the slate clean for the next RTT. */
yeah->vegas.cntRTT = 0;
@@ -193,16 +193,16 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
if (yeah->doing_reno_now < TCP_YEAH_RHO) {
reduction = yeah->lastQ;
- reduction = min(reduction, max(tp->snd_cwnd>>1, 2U));
+ reduction = min(reduction, max(tcp_snd_cwnd(tp)>>1, 2U));
- reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+ reduction = max(reduction, tcp_snd_cwnd(tp) >> TCP_YEAH_DELTA);
} else
- reduction = max(tp->snd_cwnd>>1, 2U);
+ reduction = max(tcp_snd_cwnd(tp)>>1, 2U);
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- return max_t(int, tp->snd_cwnd - reduction, 2);
+ return max_t(int, tcp_snd_cwnd(tp) - reduction, 2);
}
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 23b05e28490b..aa9f2ec3dc46 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -74,6 +74,7 @@
#define pr_fmt(fmt) "UDP: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/memblock.h>
@@ -122,7 +123,7 @@ EXPORT_SYMBOL(udp_table);
long sysctl_udp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_udp_mem);
-atomic_long_t udp_memory_allocated;
+atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
@@ -459,7 +460,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
struct udp_table *udptable,
struct sk_buff *skb,
__be32 saddr, __be16 sport,
- __be32 daddr, u16 hnum)
+ __be32 daddr, u16 hnum, const int dif)
{
struct sock *sk, *reuse_sk;
bool no_reuseport;
@@ -467,8 +468,8 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
if (udptable != &udp_table)
return NULL; /* only UDP is supported */
- no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP,
- saddr, sport, daddr, hnum, &sk);
+ no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
+ daddr, hnum, dif, &sk);
if (no_reuseport || IS_ERR_OR_NULL(sk))
return sk;
@@ -504,7 +505,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
/* Lookup redirect from BPF */
if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
sk = udp4_lookup_run_bpf(net, udptable, skb,
- saddr, sport, daddr, hnum);
+ saddr, sport, daddr, hnum, dif);
if (sk) {
result = sk;
goto done;
@@ -1725,7 +1726,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
EXPORT_SYMBOL(udp_ioctl);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
- int noblock, int *off, int *err)
+ int *off, int *err)
{
struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
struct sk_buff_head *queue;
@@ -1734,7 +1735,6 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
int error;
queue = &udp_sk(sk)->reader_queue;
- flags |= noblock ? MSG_DONTWAIT : 0;
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
struct sk_buff *skb;
@@ -1804,7 +1804,7 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
struct sk_buff *skb;
int err, used;
- skb = skb_recv_udp(sk, 0, 1, &err);
+ skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
if (!skb)
return err;
@@ -1842,8 +1842,8 @@ EXPORT_SYMBOL(udp_read_sock);
* return it, otherwise we block.
*/
-int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len)
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
@@ -1858,7 +1858,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
try_again:
off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
+ skb = __skb_recv_udp(sk, flags, &off, &err);
if (!skb)
return err;
@@ -1909,7 +1909,7 @@ try_again:
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS, is_udplite);
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (sin) {
@@ -2092,16 +2092,20 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
+ int drop_reason;
/* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
+ if (rc == -ENOMEM) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
- else
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
}
@@ -2119,14 +2123,17 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
*/
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
nf_reset_ct(skb);
if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
@@ -2203,8 +2210,10 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
goto drop;
+ }
udp_csum_pull_header(skb);
@@ -2212,11 +2221,12 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
return __udp_queue_rcv_skb(sk, skb);
csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return -1;
}
@@ -2250,7 +2260,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old;
if (dst_hold_safe(dst)) {
- old = xchg(&sk->sk_rx_dst, dst);
+ old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst);
dst_release(old);
return old != dst;
}
@@ -2410,6 +2420,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
bool refcounted;
+ int drop_reason;
+
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/*
* Validate the packet.
@@ -2440,7 +2453,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct dst_entry *dst = skb_dst(skb);
int ret;
- if (unlikely(sk->sk_rx_dst != dst))
+ if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
@@ -2465,6 +2478,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp_lib_checksum_complete(skb))
goto csum_error;
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -2472,10 +2486,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
* Hmm. We got an UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return 0;
short_packet:
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source),
@@ -2488,6 +2503,7 @@ csum_error:
* RFC1122: OK. Discards the bad packet silently (as far as
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
@@ -2495,7 +2511,7 @@ csum_error:
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return 0;
}
@@ -2547,8 +2563,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
struct sock *sk;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- if (INET_MATCH(sk, net, acookie, rmt_addr,
- loc_addr, ports, dif, sdif))
+ if (inet_match(net, sk, acookie, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
@@ -2599,7 +2614,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_efree;
- dst = READ_ONCE(sk->sk_rx_dst);
+ dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, 0);
@@ -2926,6 +2941,7 @@ struct proto udp_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
+ .put_port = udp_lib_unhash,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
@@ -2952,7 +2968,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
- afinfo = PDE_DATA(file_inode(seq->file));
+ afinfo = pde_data(file_inode(seq->file));
for (state->bucket = start; state->bucket <= afinfo->udp_table->mask;
++state->bucket) {
@@ -2985,7 +3001,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
- afinfo = PDE_DATA(file_inode(seq->file));
+ afinfo = pde_data(file_inode(seq->file));
do {
sk = sk_next(sk);
@@ -3042,7 +3058,7 @@ void udp_seq_stop(struct seq_file *seq, void *v)
if (state->bpf_seq_afinfo)
afinfo = state->bpf_seq_afinfo;
else
- afinfo = PDE_DATA(file_inode(seq->file));
+ afinfo = pde_data(file_inode(seq->file));
if (state->bucket <= afinfo->udp_table->mask)
spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
@@ -3075,7 +3091,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
{
seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_puts(seq, " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode ref pointer drops");
else {
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index bbe6569c9ad3..ff15918b7bdc 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -11,14 +11,13 @@
static struct proto *udpv6_prot_saved __read_mostly;
static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
- return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags,
- addr_len);
+ return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len);
#endif
- return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len);
+ return udp_prot.recvmsg(sk, msg, len, flags, addr_len);
}
static bool udp_sk_has_data(struct sock *sk)
@@ -61,7 +60,7 @@ static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
}
static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct sk_psock *psock;
int copied, ret;
@@ -71,10 +70,10 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
psock = sk_psock_get(sk);
if (unlikely(!psock))
- return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return sk_udp_recvmsg(sk, msg, len, flags, addr_len);
if (!psock_has_data(psock)) {
- ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
goto out;
}
@@ -84,12 +83,12 @@ msg_bytes_ready:
long timeo;
int data;
- timeo = sock_rcvtimeo(sk, nonblock);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
data = udp_msg_wait_data(sk, psock, timeo);
if (data) {
if (psock_has_data(psock))
goto msg_bytes_ready;
- ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
goto out;
}
copied = -EAGAIN;
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 2878d8285caf..4ba7a88a1b1d 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -17,8 +17,8 @@ int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
int udp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
-int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len);
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
void udp_destroy_sock(struct sock *sk);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 86d32a1e62ac..6d1a4bec2614 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -7,6 +7,7 @@
*/
#include <linux/skbuff.h>
+#include <net/gro.h>
#include <net/udp.h>
#include <net/protocol.h>
#include <net/inet_common.h>
@@ -424,6 +425,33 @@ out:
return segs;
}
+static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk owenrship - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+
+
#define UDP_GRO_CNT_MAX 64
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
struct sk_buff *skb)
@@ -600,13 +628,11 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
inet_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
- rcu_read_lock();
if (static_branch_unlikely(&udp_encap_needed_key))
sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
pp = udp_gro_receive(head, skb, uh, sk);
- rcu_read_unlock();
return pp;
flush:
@@ -641,7 +667,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen;
- rcu_read_lock();
sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
udp4_lib_lookup_skb, skb, uh->source, uh->dest);
if (sk && udp_sk(sk)->gro_complete) {
@@ -662,7 +687,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
} else {
err = udp_gro_complete_segment(skb);
}
- rcu_read_unlock();
if (skb->remcsum_offload)
skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index b91003538d87..bc3a043a5d5c 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -846,7 +846,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
list_for_each_entry(node, &info->shared->devices, list)
if (node->dev == dev)
break;
- if (node->dev != dev)
+ if (list_entry_is_head(node, &info->shared->devices, list))
return;
list_del(&node->list);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 9ebd54752e03..6fde0b184791 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -28,13 +28,11 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = daddr->a4;
fl4->flowi4_tos = tos;
- fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
+ fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(net, oif);
fl4->flowi4_mark = mark;
if (saddr)
fl4->saddr = saddr->a4;
- fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF;
-
rt = __ip_route_output_key(net, fl4);
if (!IS_ERR(rt))
return &rt->dst;
@@ -77,7 +75,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_iif = fl4->flowi4_iif;
xdst->u.dst.dev = dev;
- dev_hold(dev);
+ dev_hold_track(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 2fe5860c21d6..b146ce88c5d0 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -304,4 +304,3 @@ void __init xfrm4_protocol_init(void)
{
xfrm_input_register_afinfo(&xfrm4_input_afinfo);
}
-EXPORT_SYMBOL(xfrm4_protocol_init);