aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig9
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/cipso_ipv4.c8
-rw-r--r--net/ipv4/esp4.c14
-rw-r--r--net/ipv4/fib_trie.c3
-rw-r--r--net/ipv4/fou.c423
-rw-r--r--net/ipv4/geneve.c50
-rw-r--r--net/ipv4/gre_offload.c3
-rw-r--r--net/ipv4/icmp.c51
-rw-r--r--net/ipv4/igmp.c34
-rw-r--r--net/ipv4/inet_fragment.c2
-rw-r--r--net/ipv4/ip_fragment.c19
-rw-r--r--net/ipv4/ip_gre.c11
-rw-r--r--net/ipv4/ip_output.c16
-rw-r--r--net/ipv4/ip_sockglue.c28
-rw-r--r--net/ipv4/ip_tunnel.c122
-rw-r--r--net/ipv4/ipconfig.c19
-rw-r--r--net/ipv4/ipip.c2
-rw-r--r--net/ipv4/netfilter/Kconfig9
-rw-r--r--net/ipv4/netfilter/Makefile1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c6
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c53
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c10
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c24
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c12
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c1
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c77
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c7
-rw-r--r--net/ipv4/ping.c13
-rw-r--r--net/ipv4/proc.c11
-rw-r--r--net/ipv4/raw.c109
-rw-r--r--net/ipv4/syncookies.c86
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c75
-rw-r--r--net/ipv4/tcp_cong.c2
-rw-r--r--net/ipv4/tcp_cubic.c31
-rw-r--r--net/ipv4/tcp_input.c49
-rw-r--r--net/ipv4/tcp_ipv4.c8
-rw-r--r--net/ipv4/tcp_memcontrol.c87
-rw-r--r--net/ipv4/tcp_offload.c2
-rw-r--r--net/ipv4/tcp_output.c154
-rw-r--r--net/ipv4/tcp_timer.c18
-rw-r--r--net/ipv4/udp.c210
-rw-r--r--net/ipv4/udp_offload.c69
45 files changed, 1300 insertions, 649 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e682b48e0709..bd2901604842 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -322,6 +322,15 @@ config NET_FOU
network mechanisms and optimizations for UDP (such as ECMP
and RSS) can be leveraged to provide better service.
+config NET_FOU_IP_TUNNELS
+ bool "IP: FOU encapsulation of IP tunnels"
+ depends on NET_IPIP || NET_IPGRE || IPV6_SIT
+ select NET_FOU
+ ---help---
+ Allow configuration of FOU or GUE encapsulation for IP tunnels.
+ When this option is enabled IP tunnels can be configured to use
+ FOU or GUE encapsulation.
+
config GENEVE
tristate "Generic Network Virtualization Encapsulation (Geneve)"
depends on INET
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e67da4e6c324..a44773c8346c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1222,7 +1222,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_TCPV6 |
SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_MPLS |
+ SKB_GSO_TUNNEL_REMCSUM |
0)))
goto out;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 16acb59d665e..205e1472aa78 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1292,7 +1292,7 @@ static int arp_proc_init(void);
void __init arp_init(void)
{
- neigh_table_init(&arp_tbl);
+ neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
dev_add_pack(&arp_packet_type);
arp_proc_init();
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 4715f25dfe03..5160c710f2eb 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -50,7 +50,7 @@
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
#include <linux/atomic.h>
-#include <asm/bug.h>
+#include <linux/bug.h>
#include <asm/unaligned.h>
/* List of available DOI definitions */
@@ -72,6 +72,7 @@ struct cipso_v4_map_cache_bkt {
u32 size;
struct list_head list;
};
+
struct cipso_v4_map_cache_entry {
u32 hash;
unsigned char *key;
@@ -82,7 +83,8 @@ struct cipso_v4_map_cache_entry {
u32 activity;
struct list_head list;
};
-static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL;
+
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache;
/* Restricted bitmap (tag #1) flags */
int cipso_v4_rbm_optfmt = 0;
@@ -539,7 +541,7 @@ doi_add_return:
/**
* cipso_v4_doi_free - Frees a DOI definition
- * @entry: the entry's RCU field
+ * @doi_def: the DOI definition
*
* Description:
* This function frees all of the memory associated with a DOI definition.
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 360b565918c4..60173d4d3a0e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -392,8 +392,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
if (elen <= 0)
goto out;
- if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
goto out;
+
nfrags = err;
assoclen = sizeof(*esph);
@@ -601,12 +603,12 @@ static int esp_init_authenc(struct xfrm_state *x)
BUG_ON(!aalg_desc);
err = -EINVAL;
- if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
crypto_aead_authsize(aead)) {
- NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_aead_authsize(aead),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ pr_info("ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_aead_authsize(aead),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
goto free_key;
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e9cb2588e416..18bcaf2ff2fd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1143,8 +1143,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
- tp = tn;
}
+
+ tp = tn;
}
if (tp && tp->pos + tp->bits > 32)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 606c520ffd5a..b986298a7ba3 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -38,21 +38,17 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
}
-static int fou_udp_encap_recv_deliver(struct sk_buff *skb,
- u8 protocol, size_t len)
+static void fou_recv_pull(struct sk_buff *skb, size_t len)
{
struct iphdr *iph = ip_hdr(skb);
/* Remove 'len' bytes from the packet (UDP header and
- * FOU header if present), modify the protocol to the one
- * we found, and then call rcv_encap.
+ * FOU header if present).
*/
iph->tot_len = htons(ntohs(iph->tot_len) - len);
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
-
- return -protocol;
}
static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
@@ -62,16 +58,56 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
- return fou_udp_encap_recv_deliver(skb, fou->protocol,
- sizeof(struct udphdr));
+ fou_recv_pull(skb, sizeof(struct udphdr));
+
+ return -fou->protocol;
+}
+
+static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
+ void *data, size_t hdrlen, u8 ipproto)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+ __wsum delta;
+
+ if (skb->remcsum_offload) {
+ /* Already processed in GRO path */
+ skb->remcsum_offload = 0;
+ return guehdr;
+ }
+
+ if (!pskb_may_pull(skb, plen))
+ return NULL;
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE))
+ __skb_checksum_complete(skb);
+
+ delta = remcsum_adjust((void *)guehdr + hdrlen,
+ skb->csum, start, offset);
+
+ /* Adjust skb->csum since we changed the packet */
+ skb->csum = csum_add(skb->csum, delta);
+
+ return guehdr;
+}
+
+static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr)
+{
+ /* No support yet */
+ kfree_skb(skb);
+ return 0;
}
static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
{
struct fou *fou = fou_from_sock(sk);
- size_t len;
+ size_t len, optlen, hdrlen;
struct guehdr *guehdr;
- struct udphdr *uh;
+ void *data;
+ u16 doffset = 0;
if (!fou)
return 1;
@@ -80,25 +116,58 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!pskb_may_pull(skb, len))
goto drop;
- uh = udp_hdr(skb);
- guehdr = (struct guehdr *)&uh[1];
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
- len += guehdr->hlen << 2;
if (!pskb_may_pull(skb, len))
goto drop;
- uh = udp_hdr(skb);
- guehdr = (struct guehdr *)&uh[1];
+ /* guehdr may change after pull */
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
- if (guehdr->version != 0)
- goto drop;
+ hdrlen = sizeof(struct guehdr) + optlen;
- if (guehdr->flags) {
- /* No support yet */
+ if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
goto drop;
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+
+ /* Pull csum through the guehdr now . This can be used if
+ * there is a remote checksum offload.
+ */
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_remcsum(skb, guehdr, data + doffset,
+ hdrlen, guehdr->proto_ctype);
+ if (!guehdr)
+ goto drop;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
}
- return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len);
+ if (unlikely(guehdr->control))
+ return gue_control_message(skb, guehdr);
+
+ __skb_pull(skb, sizeof(struct udphdr) + hdrlen);
+ skb_reset_transport_header(skb);
+
+ return -guehdr->proto_ctype;
+
drop:
kfree_skb(skb);
return 0;
@@ -149,6 +218,41 @@ out_unlock:
return err;
}
+static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
+ struct guehdr *guehdr, void *data,
+ size_t hdrlen, u8 ipproto)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+ __wsum delta;
+
+ if (skb->remcsum_offload)
+ return guehdr;
+
+ if (!NAPI_GRO_CB(skb)->csum_valid)
+ return NULL;
+
+ /* Pull checksum that will be written */
+ if (skb_gro_header_hard(skb, off + plen)) {
+ guehdr = skb_gro_header_slow(skb, off + plen, off);
+ if (!guehdr)
+ return NULL;
+ }
+
+ delta = remcsum_adjust((void *)guehdr + hdrlen,
+ NAPI_GRO_CB(skb)->csum, start, offset);
+
+ /* Adjust skb->csum since we changed the packet */
+ skb->csum = csum_add(skb->csum, delta);
+ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+
+ skb->remcsum_offload = 1;
+
+ return guehdr;
+}
+
static struct sk_buff **gue_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
@@ -156,38 +260,64 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
const struct net_offload *ops;
struct sk_buff **pp = NULL;
struct sk_buff *p;
- u8 proto;
struct guehdr *guehdr;
- unsigned int hlen, guehlen;
- unsigned int off;
+ size_t len, optlen, hdrlen, off;
+ void *data;
+ u16 doffset = 0;
int flush = 1;
off = skb_gro_offset(skb);
- hlen = off + sizeof(*guehdr);
+ len = off + sizeof(*guehdr);
+
guehdr = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- guehdr = skb_gro_header_slow(skb, hlen, off);
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
if (unlikely(!guehdr))
goto out;
}
- proto = guehdr->next_hdr;
+ optlen = guehdr->hlen << 2;
+ len += optlen;
- rcu_read_lock();
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
- if (WARN_ON(!ops || !ops->callbacks.gro_receive))
- goto out_unlock;
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
- guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+ if (unlikely(guehdr->control) || guehdr->version != 0 ||
+ validate_gue_flags(guehdr, optlen))
+ goto out;
- hlen = off + guehlen;
- if (skb_gro_header_hard(skb, hlen)) {
- guehdr = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!guehdr))
- goto out_unlock;
+ hdrlen = sizeof(*guehdr) + optlen;
+
+ /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr,
+ * this is needed if there is a remote checkcsum offload.
+ */
+ skb_gro_postpull_rcsum(skb, guehdr, hdrlen);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_gro_remcsum(skb, off, guehdr,
+ data + doffset, hdrlen,
+ guehdr->proto_ctype);
+ if (!guehdr)
+ goto out;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
}
+ skb_gro_pull(skb, hdrlen);
+
flush = 0;
for (p = *head; p; p = p->next) {
@@ -199,7 +329,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
guehdr2 = (struct guehdr *)(p->data + off);
/* Compare base GUE header to be equal (covers
- * hlen, version, next_hdr, and flags.
+ * hlen, version, proto_ctype, and flags.
*/
if (guehdr->word != guehdr2->word) {
NAPI_GRO_CB(p)->same_flow = 0;
@@ -214,10 +344,11 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
}
}
- skb_gro_pull(skb, guehlen);
-
- /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
- skb_gro_postpull_rcsum(skb, guehdr, guehlen);
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[guehdr->proto_ctype]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+ goto out_unlock;
pp = ops->callbacks.gro_receive(head, skb);
@@ -238,7 +369,7 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff)
u8 proto;
int err = -ENOENT;
- proto = guehdr->next_hdr;
+ proto = guehdr->proto_ctype;
guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
@@ -489,6 +620,200 @@ static const struct genl_ops fou_nl_ops[] = {
},
};
+size_t fou_encap_hlen(struct ip_tunnel_encap *e)
+{
+ return sizeof(struct udphdr);
+}
+EXPORT_SYMBOL(fou_encap_hlen);
+
+size_t gue_encap_hlen(struct ip_tunnel_encap *e)
+{
+ size_t len;
+ bool need_priv = false;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+
+ if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) {
+ len += GUE_PLEN_REMCSUM;
+ need_priv = true;
+ }
+
+ len += need_priv ? GUE_LEN_PRIV : 0;
+
+ return len;
+}
+EXPORT_SYMBOL(gue_encap_hlen);
+
+static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi4 *fl4, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ uh->check = 0;
+ udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
+ fl4->saddr, fl4->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(fou_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ struct guehdr *guehdr;
+ size_t hdrlen, optlen = 0;
+ __be16 sport;
+ void *data;
+ bool need_priv = false;
+
+ if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ csum = false;
+ optlen += GUE_PLEN_REMCSUM;
+ type |= SKB_GSO_TUNNEL_REMCSUM;
+ need_priv = true;
+ }
+
+ optlen += need_priv ? GUE_LEN_PRIV : 0;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Get source port (based on flow hash) before skb_push */
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
+
+ guehdr = (struct guehdr *)skb->data;
+
+ guehdr->control = 0;
+ guehdr->version = 0;
+ guehdr->hlen = optlen >> 2;
+ guehdr->flags = 0;
+ guehdr->proto_ctype = *protocol;
+
+ data = &guehdr[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+
+ guehdr->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (type & SKB_GSO_TUNNEL_REMCSUM) {
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd = data;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
+ }
+
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(gue_build_header);
+
+#ifdef CONFIG_NET_FOU_IP_TUNNELS
+
+static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou_build_header,
+};
+
+static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue_build_header,
+};
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou ops\n");
+ return ret;
+ }
+
+ ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue ops\n");
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
static int __init fou_init(void)
{
int ret;
@@ -496,6 +821,14 @@ static int __init fou_init(void)
ret = genl_register_family_with_ops(&fou_nl_family,
fou_nl_ops);
+ if (ret < 0)
+ goto exit;
+
+ ret = ip_tunnel_encap_add_fou_ops();
+ if (ret < 0)
+ genl_unregister_family(&fou_nl_family);
+
+exit:
return ret;
}
@@ -503,6 +836,8 @@ static void __exit fou_fini(void)
{
struct fou *fou, *next;
+ ip_tunnel_encap_del_fou_ops();
+
genl_unregister_family(&fou_nl_family);
/* Close all the FOU sockets */
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index dedb21e99914..394a200f93c1 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -104,7 +104,7 @@ static void geneve_build_header(struct genevehdr *geneveh,
memcpy(geneveh->options, options, options_len);
}
-/* Transmit a fully formated Geneve frame.
+/* Transmit a fully formatted Geneve frame.
*
* When calling this function. The skb->data should point
* to the geneve header which is fully formed.
@@ -122,25 +122,23 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
int err;
skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
err = skb_cow_head(skb, min_headroom);
- if (unlikely(err))
+ if (unlikely(err)) {
+ kfree_skb(skb);
return err;
-
- if (vlan_tx_tag_present(skb)) {
- if (unlikely(!__vlan_put_tag(skb,
- skb->vlan_proto,
- vlan_tx_tag_get(skb)))) {
- err = -ENOMEM;
- return err;
- }
- skb->vlan_tci = 0;
}
+ skb = vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb))
+ return -ENOMEM;
+
gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
@@ -165,6 +163,15 @@ static void geneve_notify_add_rx_port(struct geneve_sock *gs)
}
}
+static void geneve_notify_del_rx_port(struct geneve_sock *gs)
+{
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+
+ if (sa_family == AF_INET)
+ udp_del_offload(&gs->udp_offloads);
+}
+
/* Callback from net/ipv4/udp.c to receive packets */
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
@@ -293,6 +300,7 @@ struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
geneve_rcv_t *rcv, void *data,
bool no_share, bool ipv6)
{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
struct geneve_sock *gs;
gs = geneve_socket_create(net, port, rcv, data, ipv6);
@@ -302,15 +310,15 @@ struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
if (no_share) /* Return error if sharing is not allowed. */
return ERR_PTR(-EINVAL);
+ spin_lock(&gn->sock_lock);
gs = geneve_find_sock(net, port);
- if (gs) {
- if (gs->rcv == rcv)
- atomic_inc(&gs->refcnt);
- else
+ if (gs && ((gs->rcv != rcv) ||
+ !atomic_add_unless(&gs->refcnt, 1, 0)))
gs = ERR_PTR(-EBUSY);
- } else {
+ spin_unlock(&gn->sock_lock);
+
+ if (!gs)
gs = ERR_PTR(-EINVAL);
- }
return gs;
}
@@ -318,9 +326,17 @@ EXPORT_SYMBOL_GPL(geneve_sock_add);
void geneve_sock_release(struct geneve_sock *gs)
{
+ struct net *net = sock_net(gs->sock->sk);
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+
if (!atomic_dec_and_test(&gs->refcnt))
return;
+ spin_lock(&gn->sock_lock);
+ hlist_del_rcu(&gs->hlist);
+ geneve_notify_del_rx_port(gs);
+ spin_unlock(&gn->sock_lock);
+
queue_work(geneve_wq, &gs->del_work);
}
EXPORT_SYMBOL_GPL(geneve_sock_release);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index bb5947b0ce2d..51973ddc05a6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -247,6 +247,9 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
rcu_read_unlock();
+
+ skb_set_inner_mac_header(skb, nhoff + grehlen);
+
return err;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5882f584910e..36f5584d93c5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert);
*/
struct icmp_control {
- void (*handler)(struct sk_buff *skb);
+ bool (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
@@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto)
* ICMP_PARAMETERPROB.
*/
-static void icmp_unreach(struct sk_buff *skb)
+static bool icmp_unreach(struct sk_buff *skb)
{
const struct iphdr *iph;
struct icmphdr *icmph;
@@ -784,8 +784,8 @@ static void icmp_unreach(struct sk_buff *skb)
*/
switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
default:
- LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
- &iph->daddr);
+ net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
+ &iph->daddr);
break;
case 2:
goto out;
@@ -798,8 +798,8 @@ static void icmp_unreach(struct sk_buff *skb)
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"),
- &iph->daddr);
+ net_dbg_ratelimited("%pI4: Source Route Failed\n",
+ &iph->daddr);
break;
default:
break;
@@ -839,10 +839,10 @@ static void icmp_unreach(struct sk_buff *skb)
icmp_socket_deliver(skb, info);
out:
- return;
+ return true;
out_err:
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
- goto out;
+ return false;
}
@@ -850,17 +850,20 @@ out_err:
* Handle ICMP_REDIRECT.
*/
-static void icmp_redirect(struct sk_buff *skb)
+static bool icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
- return;
+ return false;
}
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- return;
+ if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
+ /* there aught to be a stat */
+ return false;
+ }
icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+ return true;
}
/*
@@ -875,7 +878,7 @@ static void icmp_redirect(struct sk_buff *skb)
* See also WRT handling of options once they are done and working.
*/
-static void icmp_echo(struct sk_buff *skb)
+static bool icmp_echo(struct sk_buff *skb)
{
struct net *net;
@@ -891,6 +894,8 @@ static void icmp_echo(struct sk_buff *skb)
icmp_param.head_len = sizeof(struct icmphdr);
icmp_reply(&icmp_param, skb);
}
+ /* should there be an ICMP stat for ignored echos? */
+ return true;
}
/*
@@ -900,7 +905,7 @@ static void icmp_echo(struct sk_buff *skb)
* MUST be accurate to a few minutes.
* MUST be updated at least at 15Hz.
*/
-static void icmp_timestamp(struct sk_buff *skb)
+static bool icmp_timestamp(struct sk_buff *skb)
{
struct timespec tv;
struct icmp_bxm icmp_param;
@@ -927,15 +932,17 @@ static void icmp_timestamp(struct sk_buff *skb)
icmp_param.data_len = 0;
icmp_param.head_len = sizeof(struct icmphdr) + 12;
icmp_reply(&icmp_param, skb);
-out:
- return;
+ return true;
+
out_err:
ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
- goto out;
+ return false;
}
-static void icmp_discard(struct sk_buff *skb)
+static bool icmp_discard(struct sk_buff *skb)
{
+ /* pretend it was a success */
+ return true;
}
/*
@@ -946,6 +953,7 @@ int icmp_rcv(struct sk_buff *skb)
struct icmphdr *icmph;
struct rtable *rt = skb_rtable(skb);
struct net *net = dev_net(rt->dst.dev);
+ bool success;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
@@ -1012,7 +1020,12 @@ int icmp_rcv(struct sk_buff *skb)
}
}
- icmp_pointers[icmph->type].handler(skb);
+ success = icmp_pointers[icmph->type].handler(skb);
+
+ if (success) {
+ consume_skb(skb);
+ return 0;
+ }
drop:
kfree_skb(skb);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index bb15d0e03d4f..666cf364df86 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -112,17 +112,17 @@
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
-#define IGMP_V1_Router_Present_Timeout (400*HZ)
-#define IGMP_V2_Router_Present_Timeout (400*HZ)
-#define IGMP_V2_Unsolicited_Report_Interval (10*HZ)
-#define IGMP_V3_Unsolicited_Report_Interval (1*HZ)
-#define IGMP_Query_Response_Interval (10*HZ)
-#define IGMP_Query_Robustness_Variable 2
+#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ)
+#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ)
+#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
+#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
+#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
+#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2
-#define IGMP_Initial_Report_Delay (1)
+#define IGMP_INITIAL_REPORT_DELAY (1)
-/* IGMP_Initial_Report_Delay is not from IGMP specs!
+/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs!
* IGMP specs require to report membership immediately after
* joining a group, but we delay the first report by a
* small interval. It seems more natural and still does not
@@ -878,15 +878,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (ih->code == 0) {
/* Alas, old v1 router presents here. */
- max_delay = IGMP_Query_Response_Interval;
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
in_dev->mr_v1_seen = jiffies +
- IGMP_V1_Router_Present_Timeout;
+ IGMP_V1_ROUTER_PRESENT_TIMEOUT;
group = 0;
} else {
/* v2 router present */
max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
in_dev->mr_v2_seen = jiffies +
- IGMP_V2_Router_Present_Timeout;
+ IGMP_V2_ROUTER_PRESENT_TIMEOUT;
}
/* cancel the interface change timer */
in_dev->mr_ifc_count = 0;
@@ -898,7 +898,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
return true; /* ignore bogus packet; freed by caller */
} else if (IGMP_V1_SEEN(in_dev)) {
/* This is a v3 query with v1 queriers present */
- max_delay = IGMP_Query_Response_Interval;
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
group = 0;
} else if (IGMP_V2_SEEN(in_dev)) {
/* this is a v3 query with v2 queriers present;
@@ -1217,7 +1217,7 @@ static void igmp_group_added(struct ip_mc_list *im)
return;
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
spin_lock_bh(&im->lock);
- igmp_start_timer(im, IGMP_Initial_Report_Delay);
+ igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
spin_unlock_bh(&im->lock);
return;
}
@@ -1540,7 +1540,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
#ifdef CONFIG_IP_MULTICAST
-int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable;
+int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
#endif
static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
@@ -2686,11 +2686,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (v == SEQ_START_TOKEN) {
- seq_printf(seq,
- "%3s %6s "
- "%10s %10s %6s %6s\n", "Idx",
- "Device", "MCA",
- "SRC", "INC", "EXC");
+ seq_puts(seq, "Idx Device MCA SRC INC EXC\n");
} else {
seq_printf(seq,
"%3d %6.6s 0x%08x "
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 19419b60cb37..e7920352646a 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -458,6 +458,6 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
". Dropping fragment.\n";
if (PTR_ERR(q) == -ENOBUFS)
- LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
+ net_dbg_ratelimited("%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 2811cc18701a..e5b6d0ddcb58 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -80,7 +80,7 @@ struct ipq {
struct inet_peer *peer;
};
-static inline u8 ip4_frag_ecn(u8 tos)
+static u8 ip4_frag_ecn(u8 tos)
{
return 1 << (tos & INET_ECN_MASK);
}
@@ -148,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
}
-static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
+static void ip4_frag_free(struct inet_frag_queue *q)
{
struct ipq *qp;
@@ -160,7 +160,7 @@ static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
/* Destruction primitives. */
-static __inline__ void ipq_put(struct ipq *ipq)
+static void ipq_put(struct ipq *ipq)
{
inet_frag_put(&ipq->q, &ip4_frags);
}
@@ -236,7 +236,7 @@ out:
/* Find the correct entry in the "incomplete datagrams" queue for
* this IP datagram, and create new one, if nothing is found.
*/
-static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
struct inet_frag_queue *q;
struct ip4_create_arg arg;
@@ -256,7 +256,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
}
/* Is the fragment too far ahead to be part of ipq? */
-static inline int ip_frag_too_far(struct ipq *qp)
+static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
unsigned int max = sysctl_ipfrag_max_dist;
@@ -618,8 +618,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
return 0;
out_nomem:
- LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
- qp);
+ net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
err = -ENOMEM;
goto out_fail;
out_oversize:
@@ -795,16 +794,16 @@ static void __init ip4_frags_ctl_register(void)
register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
#else
-static inline int ip4_frags_ns_ctl_register(struct net *net)
+static int ip4_frags_ns_ctl_register(struct net *net)
{
return 0;
}
-static inline void ip4_frags_ns_ctl_unregister(struct net *net)
+static void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
-static inline void __init ip4_frags_ctl_register(void)
+static void __init ip4_frags_ctl_register(void)
{
}
#endif
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 12055fdbe716..4f4bf5b99686 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -252,10 +252,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params;
- skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
- if (IS_ERR(skb))
- goto out;
-
if (dev->header_ops) {
/* Need space for new headers */
if (skb_cow_head(skb, dev->needed_headroom -
@@ -268,6 +264,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
* to gre header.
*/
skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ skb_reset_mac_header(skb);
} else {
if (skb_cow_head(skb, dev->needed_headroom))
goto free_skb;
@@ -275,6 +272,10 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
tnl_params = &tunnel->parms.iph;
}
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ goto out;
+
__gre_xmit(skb, dev, tnl_params, skb->protocol);
return NETDEV_TX_OK;
@@ -789,7 +790,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT,
t->encap.dport) ||
nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
- t->encap.dport))
+ t->encap.flags))
goto nla_put_failure;
return 0;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index bc6471d4abcd..b50861b22b6b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -662,12 +662,10 @@ slow_path:
if (len < left) {
len &= ~7;
}
- /*
- * Allocate buffer.
- */
- if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
- NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
+ /* Allocate buffer */
+ skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
+ if (!skb2) {
err = -ENOMEM;
goto fail;
}
@@ -754,14 +752,16 @@ EXPORT_SYMBOL(ip_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
- struct iovec *iov = from;
+ struct msghdr *msg = from;
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- if (memcpy_fromiovecend(to, iov, offset, len) < 0)
+ /* XXX: stripping const */
+ if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0)
return -EFAULT;
} else {
__wsum csum = 0;
- if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
+ /* XXX: stripping const */
+ if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0)
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, odd);
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 9daf2177dc00..8a89c738b7a3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -192,7 +192,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
int err, val;
struct cmsghdr *cmsg;
- for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -399,6 +399,22 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf
kfree_skb(skb);
}
+static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk,
+ const struct sk_buff *skb,
+ int ee_origin)
+{
+ struct in_pktinfo *info = PKTINFO_SKB_CB(skb);
+
+ if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) ||
+ (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
+ (!skb->dev))
+ return false;
+
+ info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
+ info->ipi_ifindex = skb->dev->ifindex;
+ return true;
+}
+
/*
* Handle MSG_ERRQUEUE
*/
@@ -414,6 +430,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
int err;
int copied;
+ WARN_ON_ONCE(sk->sk_family == AF_INET6);
+
err = -EAGAIN;
skb = sock_dequeue_err_skb(sk);
if (skb == NULL)
@@ -424,7 +442,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err)
goto out_free_skb;
@@ -444,7 +462,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
sin = &errhdr.offender;
sin->sin_family = AF_UNSPEC;
- if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
+
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) {
struct inet_sock *inet = inet_sk(sk);
sin->sin_family = AF_INET;
@@ -1049,7 +1069,7 @@ e_inval:
}
/**
- * ipv4_pktinfo_prepare - transfert some info from rtable to skb
+ * ipv4_pktinfo_prepare - transfer some info from rtable to skb
* @sk: socket
* @skb: buffer
*
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 0bb8e141eacc..d3e447936720 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -56,7 +56,6 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
-#include <net/gue.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -491,17 +490,56 @@ EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
- switch (e->type) {
- case TUNNEL_ENCAP_NONE:
+ const struct ip_tunnel_encap_ops *ops;
+ int hlen = -EINVAL;
+
+ if (e->type == TUNNEL_ENCAP_NONE)
return 0;
- case TUNNEL_ENCAP_FOU:
- return sizeof(struct udphdr);
- case TUNNEL_ENCAP_GUE:
- return sizeof(struct udphdr) + sizeof(struct guehdr);
- default:
+
+ if (e->type >= MAX_IPTUN_ENCAP_OPS)
return -EINVAL;
- }
+
+ rcu_read_lock();
+ ops = rcu_dereference(iptun_encaps[e->type]);
+ if (likely(ops && ops->encap_hlen))
+ hlen = ops->encap_hlen(e);
+ rcu_read_unlock();
+
+ return hlen;
+}
+
+const struct ip_tunnel_encap_ops __rcu *
+ iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+
+int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ return !cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ NULL, ops) ? 0 : -1;
}
+EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
+
+int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ int ret;
+
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
int ip_tunnel_encap_setup(struct ip_tunnel *t,
struct ip_tunnel_encap *ipencap)
@@ -526,63 +564,25 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t,
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
-static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
- size_t hdr_len, u8 *protocol, struct flowi4 *fl4)
-{
- struct udphdr *uh;
- __be16 sport;
- bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
- int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
-
- skb = iptunnel_handle_offloads(skb, csum, type);
-
- if (IS_ERR(skb))
- return PTR_ERR(skb);
-
- /* Get length and hash before making space in skb */
-
- sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
- skb, 0, 0, false);
-
- skb_push(skb, hdr_len);
-
- skb_reset_transport_header(skb);
- uh = udp_hdr(skb);
-
- if (e->type == TUNNEL_ENCAP_GUE) {
- struct guehdr *guehdr = (struct guehdr *)&uh[1];
-
- guehdr->version = 0;
- guehdr->hlen = 0;
- guehdr->flags = 0;
- guehdr->next_hdr = *protocol;
- }
-
- uh->dest = e->dport;
- uh->source = sport;
- uh->len = htons(skb->len);
- uh->check = 0;
- udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
- fl4->saddr, fl4->daddr, skb->len);
-
- *protocol = IPPROTO_UDP;
-
- return 0;
-}
-
int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
u8 *protocol, struct flowi4 *fl4)
{
- switch (t->encap.type) {
- case TUNNEL_ENCAP_NONE:
+ const struct ip_tunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (t->encap.type == TUNNEL_ENCAP_NONE)
return 0;
- case TUNNEL_ENCAP_FOU:
- case TUNNEL_ENCAP_GUE:
- return fou_build_header(skb, &t->encap, t->encap_hlen,
- protocol, fl4);
- default:
+
+ if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
return -EINVAL;
- }
+
+ rcu_read_lock();
+ ops = rcu_dereference(iptun_encaps[t->encap.type]);
+ if (likely(ops && ops->build_header))
+ ret = ops->build_header(skb, &t->encap, protocol, fl4);
+ rcu_read_unlock();
+
+ return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 648fa1490ea7..7fa18bc7e47f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -115,7 +115,7 @@
*/
int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
-static int ic_enable __initdata = 0; /* IP config enabled? */
+static int ic_enable __initdata; /* IP config enabled? */
/* Protocol choice */
int ic_proto_enabled __initdata = 0
@@ -130,7 +130,7 @@ int ic_proto_enabled __initdata = 0
#endif
;
-static int ic_host_name_set __initdata = 0; /* Host name set by us? */
+static int ic_host_name_set __initdata; /* Host name set by us? */
__be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
@@ -160,17 +160,17 @@ static u8 ic_domain[64]; /* DNS (not NIS) domain name */
static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
/* Protocols supported by available interfaces */
-static int ic_proto_have_if __initdata = 0;
+static int ic_proto_have_if __initdata;
/* MTU for boot device */
-static int ic_dev_mtu __initdata = 0;
+static int ic_dev_mtu __initdata;
#ifdef IPCONFIG_DYNAMIC
static DEFINE_SPINLOCK(ic_recv_lock);
-static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */
+static volatile int ic_got_reply __initdata; /* Proto(s) that replied */
#endif
#ifdef IPCONFIG_DHCP
-static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */
+static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */
#endif
@@ -186,8 +186,8 @@ struct ic_device {
__be32 xid;
};
-static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
-static struct net_device *ic_dev __initdata = NULL; /* Selected device */
+static struct ic_device *ic_first_dev __initdata; /* List of open device */
+static struct net_device *ic_dev __initdata; /* Selected device */
static bool __init ic_is_init_dev(struct net_device *dev)
{
@@ -498,7 +498,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
struct arphdr *rarp;
unsigned char *rarp_ptr;
__be32 sip, tip;
- unsigned char *sha, *tha; /* s for "source", t for "target" */
+ unsigned char *tha; /* t for "target" */
struct ic_device *d;
if (!net_eq(dev_net(dev), &init_net))
@@ -549,7 +549,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
goto drop_unlock; /* should never happen */
/* Extract variable-width fields */
- sha = rarp_ptr;
rarp_ptr += dev->addr_len;
memcpy(&sip, rarp_ptr, 4);
rarp_ptr += 4;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 37096d64730e..40403114f00a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -465,7 +465,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT,
tunnel->encap.dport) ||
nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
- tunnel->encap.dport))
+ tunnel->encap.flags))
goto nla_put_failure;
return 0;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4c019d5c3f57..59f883d9cadf 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -113,6 +113,15 @@ config NFT_MASQ_IPV4
This is the expression that provides IPv4 masquerading support for
nf_tables.
+config NFT_REDIR_IPV4
+ tristate "IPv4 redirect support for nf_tables"
+ depends on NF_TABLES_IPV4
+ depends on NFT_REDIR
+ select NF_NAT_REDIRECT
+ help
+ This is the expression that provides IPv4 redirect support for
+ nf_tables.
+
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
depends on NF_CONNTRACK_SNMP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index f4cef5af0969..7fe6c703528f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
+obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
# generic IP tables
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index a054fe083431..5c61328b7704 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -56,11 +56,11 @@ static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
return true;
}
-static int ipv4_print_tuple(struct seq_file *s,
+static void ipv4_print_tuple(struct seq_file *s,
const struct nf_conntrack_tuple *tuple)
{
- return seq_printf(s, "src=%pI4 dst=%pI4 ",
- &tuple->src.u3.ip, &tuple->dst.u3.ip);
+ seq_printf(s, "src=%pI4 dst=%pI4 ",
+ &tuple->src.u3.ip, &tuple->dst.u3.ip);
}
static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 4c48e434bb1f..a460a87e14f8 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -94,7 +94,7 @@ static void ct_seq_stop(struct seq_file *s, void *v)
}
#ifdef CONFIG_NF_CONNTRACK_SECMARK
-static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
int ret;
u32 len;
@@ -102,17 +102,15 @@ static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
if (ret)
- return 0;
+ return;
- ret = seq_printf(s, "secctx=%s ", secctx);
+ seq_printf(s, "secctx=%s ", secctx);
security_release_secctx(secctx, len);
- return ret;
}
#else
-static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
- return 0;
}
#endif
@@ -141,47 +139,52 @@ static int ct_seq_show(struct seq_file *s, void *v)
NF_CT_ASSERT(l4proto);
ret = -ENOSPC;
- if (seq_printf(s, "%-8s %u %ld ",
- l4proto->name, nf_ct_protonum(ct),
- timer_pending(&ct->timeout)
- ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
- goto release;
+ seq_printf(s, "%-8s %u %ld ",
+ l4proto->name, nf_ct_protonum(ct),
+ timer_pending(&ct->timeout)
+ ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
+
+ if (l4proto->print_conntrack)
+ l4proto->print_conntrack(s, ct);
- if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
+ if (seq_has_overflowed(s))
goto release;
- if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- l3proto, l4proto))
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ l3proto, l4proto);
+
+ if (seq_has_overflowed(s))
goto release;
if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
goto release;
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
- if (seq_printf(s, "[UNREPLIED] "))
- goto release;
+ seq_printf(s, "[UNREPLIED] ");
- if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- l3proto, l4proto))
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ l3proto, l4proto);
+
+ if (seq_has_overflowed(s))
goto release;
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
goto release;
if (test_bit(IPS_ASSURED_BIT, &ct->status))
- if (seq_printf(s, "[ASSURED] "))
- goto release;
+ seq_printf(s, "[ASSURED] ");
#ifdef CONFIG_NF_CONNTRACK_MARK
- if (seq_printf(s, "mark=%u ", ct->mark))
- goto release;
+ seq_printf(s, "mark=%u ", ct->mark);
#endif
- if (ct_show_secctx(s, ct))
- goto release;
+ ct_show_secctx(s, ct);
- if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
+ seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+
+ if (seq_has_overflowed(s))
goto release;
+
ret = 0;
release:
nf_ct_put(ct);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index b91b2641adda..80d5554b9a88 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -72,13 +72,13 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
}
/* Print out the per-protocol part of the tuple. */
-static int icmp_print_tuple(struct seq_file *s,
+static void icmp_print_tuple(struct seq_file *s,
const struct nf_conntrack_tuple *tuple)
{
- return seq_printf(s, "type=%u code=%u id=%u ",
- tuple->dst.u.icmp.type,
- tuple->dst.u.icmp.code,
- ntohs(tuple->src.u.icmp.id));
+ seq_printf(s, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
}
static unsigned int *icmp_get_timeouts(struct net *net)
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index ccfc78db12ee..d059182c1466 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -10,6 +10,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
@@ -74,12 +75,12 @@ static void dump_arp_packet(struct nf_log_buf *m,
ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
}
-void nf_log_arp_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
+static void nf_log_arp_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
{
struct nf_log_buf *m;
@@ -130,8 +131,17 @@ static int __init nf_log_arp_init(void)
if (ret < 0)
return ret;
- nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ if (ret < 0) {
+ pr_err("failed to register logger\n");
+ goto err1;
+ }
+
return 0;
+
+err1:
+ unregister_pernet_subsys(&nf_log_arp_net_ops);
+ return ret;
}
static void __exit nf_log_arp_exit(void)
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 078bdca1b607..75101980eeee 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -5,6 +5,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
@@ -366,8 +367,17 @@ static int __init nf_log_ipv4_init(void)
if (ret < 0)
return ret;
- nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ if (ret < 0) {
+ pr_err("failed to register logger\n");
+ goto err1;
+ }
+
return 0;
+
+err1:
+ unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+ return ret;
}
static void __exit nf_log_ipv4_exit(void)
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 1baaa83dfe5c..536da7bc598a 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -11,6 +11,7 @@
#include <net/tcp.h>
#include <net/route.h>
#include <net/dst.h>
+#include <net/netfilter/ipv4/nf_reject.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/ipv4/nf_reject.h>
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
new file mode 100644
index 000000000000..ff2d23d8c87a
--- /dev/null
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_redirect.h>
+#include <net/netfilter/nft_redir.h>
+
+static void nft_redir_ipv4_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_redir *priv = nft_expr_priv(expr);
+ struct nf_nat_ipv4_multi_range_compat mr;
+ unsigned int verdict;
+
+ memset(&mr, 0, sizeof(mr));
+ if (priv->sreg_proto_min) {
+ mr.range[0].min.all = (__force __be16)
+ data[priv->sreg_proto_min].data[0];
+ mr.range[0].max.all = (__force __be16)
+ data[priv->sreg_proto_max].data[0];
+ mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+
+ mr.range[0].flags |= priv->flags;
+
+ verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum);
+ data[NFT_REG_VERDICT].verdict = verdict;
+}
+
+static struct nft_expr_type nft_redir_ipv4_type;
+static const struct nft_expr_ops nft_redir_ipv4_ops = {
+ .type = &nft_redir_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_ipv4_eval,
+ .init = nft_redir_init,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+};
+
+static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "redir",
+ .ops = &nft_redir_ipv4_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_REDIR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_redir_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_redir_ipv4_type);
+}
+
+static void __exit nft_redir_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_redir_ipv4_type);
+}
+
+module_init(nft_redir_ipv4_module_init);
+module_exit(nft_redir_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index ed33299c56d1..d729542bd1b7 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -19,9 +19,9 @@
#include <net/netfilter/ipv4/nf_reject.h>
#include <net/netfilter/nft_reject.h>
-void nft_reject_ipv4_eval(const struct nft_expr *expr,
- struct nft_data data[NFT_REG_MAX + 1],
- const struct nft_pktinfo *pkt)
+static void nft_reject_ipv4_eval(const struct nft_expr *expr,
+ struct nft_data data[NFT_REG_MAX + 1],
+ const struct nft_pktinfo *pkt)
{
struct nft_reject *priv = nft_expr_priv(expr);
@@ -36,7 +36,6 @@ void nft_reject_ipv4_eval(const struct nft_expr *expr,
data[NFT_REG_VERDICT].verdict = NF_DROP;
}
-EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval);
static struct nft_expr_type nft_reject_ipv4_type;
static const struct nft_expr_ops nft_reject_ipv4_ops = {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 5d740cccf69e..c0d82f78d364 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -662,7 +662,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
* Fetch the ICMP header provided by the userland.
* iovec is modified! The ICMP header is consumed.
*/
- if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len))
+ if (memcpy_from_msg(user_icmph, msg, icmph_len))
return -EFAULT;
if (family == AF_INET) {
@@ -811,7 +811,8 @@ back_from_confirm:
pfh.icmph.checksum = 0;
pfh.icmph.un.echo.id = inet->inet_sport;
pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
- pfh.iov = msg->msg_iov;
+ /* XXX: stripping const */
+ pfh.iov = (struct iovec *)msg->msg_iter.iov;
pfh.wcheck = 0;
pfh.family = AF_INET;
@@ -869,7 +870,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
}
/* Don't bother checking the checksum */
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err)
goto done;
@@ -949,7 +950,7 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
* All we need to do is get the socket.
*/
-void ping_rcv(struct sk_buff *skb)
+bool ping_rcv(struct sk_buff *skb)
{
struct sock *sk;
struct net *net = dev_net(skb->dev);
@@ -968,11 +969,11 @@ void ping_rcv(struct sk_buff *skb)
pr_debug("rcv on socket %p\n", sk);
ping_queue_rcv_skb(sk, skb_get(skb));
sock_put(sk);
- return;
+ return true;
}
pr_debug("no socket, dropping\n");
- /* We're called from icmp_rcv(). kfree_skb() is done there. */
+ return false;
}
EXPORT_SYMBOL_GPL(ping_rcv);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8e3eb39f84e7..8f9cd200ce20 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -181,6 +181,7 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
SNMP_MIB_SENTINEL
};
@@ -287,6 +288,10 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
+ SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT),
+ SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
+ SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
+ SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
SNMP_MIB_SENTINEL
};
@@ -296,12 +301,12 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
int j;
if (count) {
- seq_printf(seq, "\nIcmpMsg:");
+ seq_puts(seq, "\nIcmpMsg:");
for (j = 0; j < count; ++j)
seq_printf(seq, " %sType%u",
type[j] & 0x100 ? "Out" : "In",
type[j] & 0xff);
- seq_printf(seq, "\nIcmpMsg:");
+ seq_puts(seq, "\nIcmpMsg:");
for (j = 0; j < count; ++j)
seq_printf(seq, " %lu", vals[j]);
}
@@ -342,7 +347,7 @@ static void icmp_put(struct seq_file *seq)
seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " In%s", icmpmibmap[i].name);
- seq_printf(seq, " OutMsgs OutErrors");
+ seq_puts(seq, " OutMsgs OutErrors");
for (i = 0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
seq_printf(seq, "\nIcmp: %lu %lu %lu",
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 739db3100c23..0bb68df5055d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -79,6 +79,16 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
+#include <linux/uio.h>
+
+struct raw_frag_vec {
+ struct msghdr *msg;
+ union {
+ struct icmphdr icmph;
+ char c[1];
+ } hdr;
+ int hlen;
+};
static struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -420,53 +430,57 @@ error:
return err;
}
-static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
+static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
{
- struct iovec *iov;
- u8 __user *type = NULL;
- u8 __user *code = NULL;
- int probed = 0;
- unsigned int i;
+ int err;
- if (!msg->msg_iov)
+ if (fl4->flowi4_proto != IPPROTO_ICMP)
return 0;
- for (i = 0; i < msg->msg_iovlen; i++) {
- iov = &msg->msg_iov[i];
- if (!iov)
- continue;
-
- switch (fl4->flowi4_proto) {
- case IPPROTO_ICMP:
- /* check if one-byte field is readable or not. */
- if (iov->iov_base && iov->iov_len < 1)
- break;
-
- if (!type) {
- type = iov->iov_base;
- /* check if code field is readable or not. */
- if (iov->iov_len > 1)
- code = type + 1;
- } else if (!code)
- code = iov->iov_base;
-
- if (type && code) {
- if (get_user(fl4->fl4_icmp_type, type) ||
- get_user(fl4->fl4_icmp_code, code))
- return -EFAULT;
- probed = 1;
- }
- break;
- default:
- probed = 1;
- break;
- }
- if (probed)
- break;
- }
+ /* We only need the first two bytes. */
+ rfv->hlen = 2;
+
+ err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen);
+ if (err)
+ return err;
+
+ fl4->fl4_icmp_type = rfv->hdr.icmph.type;
+ fl4->fl4_icmp_code = rfv->hdr.icmph.code;
+
return 0;
}
+static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct raw_frag_vec *rfv = from;
+
+ if (offset < rfv->hlen) {
+ int copy = min(rfv->hlen - offset, len);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ memcpy(to, rfv->hdr.c + offset, copy);
+ else
+ skb->csum = csum_block_add(
+ skb->csum,
+ csum_partial_copy_nocheck(rfv->hdr.c + offset,
+ to, copy, 0),
+ odd);
+
+ odd = 0;
+ offset += copy;
+ to += copy;
+ len -= copy;
+
+ if (!len)
+ return 0;
+ }
+
+ offset -= rfv->hlen;
+
+ return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
+}
+
static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
@@ -480,6 +494,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
u8 tos;
int err;
struct ip_options_data opt_copy;
+ struct raw_frag_vec rfv;
err = -EMSGSIZE;
if (len > 0xFFFF)
@@ -585,7 +600,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
daddr, saddr, 0, 0);
if (!inet->hdrincl) {
- err = raw_probe_proto_opt(&fl4, msg);
+ rfv.msg = msg;
+ rfv.hlen = 0;
+
+ err = raw_probe_proto_opt(&rfv, &fl4);
if (err)
goto done;
}
@@ -607,7 +625,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
back_from_confirm:
if (inet->hdrincl)
- err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
+ /* XXX: stripping const */
+ err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len,
&rt, msg->msg_flags);
else {
@@ -616,8 +635,8 @@ back_from_confirm:
if (!ipc.addr)
ipc.addr = fl4.daddr;
lock_sock(sk);
- err = ip_append_data(sk, &fl4, ip_generic_getfrag,
- msg->msg_iov, len, 0,
+ err = ip_append_data(sk, &fl4, raw_getfrag,
+ &rfv, len, 0,
&ipc, &rt, msg->msg_flags);
if (err)
ip_flush_pending_frames(sk);
@@ -718,7 +737,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
copied = len;
}
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err)
goto done;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 32b98d0207b4..45fe60c5238e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -19,10 +19,6 @@
#include <net/tcp.h>
#include <net/route.h>
-/* Timestamps: lowest bits store TCP options */
-#define TSBITS 6
-#define TSMASK (((__u32)1 << TSBITS) - 1)
-
extern int sysctl_tcp_syncookies;
static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
@@ -30,6 +26,30 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK
+ * stores TCP options:
+ *
+ * MSB LSB
+ * | 31 ... 6 | 5 | 4 | 3 2 1 0 |
+ * | Timestamp | ECN | SACK | WScale |
+ *
+ * When we receive a valid cookie-ACK, we look at the echoed tsval (if
+ * any) to figure out which TCP options we should use for the rebuilt
+ * connection.
+ *
+ * A WScale setting of '0xf' (which is an invalid scaling value)
+ * means that original syn did not include the TCP window scaling option.
+ */
+#define TS_OPT_WSCALE_MASK 0xf
+#define TS_OPT_SACK BIT(4)
+#define TS_OPT_ECN BIT(5)
+/* There is no TS_OPT_TIMESTAMP:
+ * if ACK contains timestamp option, we already know it was
+ * requested/supported by the syn/synack exchange.
+ */
+#define TSBITS 6
+#define TSMASK (((__u32)1 << TSBITS) - 1)
+
static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
ipv4_cookie_scratch);
@@ -67,9 +87,11 @@ __u32 cookie_init_timestamp(struct request_sock *req)
ireq = inet_rsk(req);
- options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
- options |= ireq->sack_ok << 4;
- options |= ireq->ecn_ok << 5;
+ options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
+ if (ireq->sack_ok)
+ options |= TS_OPT_SACK;
+ if (ireq->ecn_ok)
+ options |= TS_OPT_ECN;
ts = ts_now & ~TSMASK;
ts |= options;
@@ -219,16 +241,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
* additional tcp options in the timestamp.
* This extracts these options from the timestamp echo.
*
- * The lowest 4 bits store snd_wscale.
- * next 2 bits indicate SACK and ECN support.
- *
- * return false if we decode an option that should not be.
+ * return false if we decode a tcp option that is disabled
+ * on the host.
*/
-bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
- struct net *net, bool *ecn_ok)
+bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
{
/* echoed timestamp, lowest bits contain options */
- u32 options = tcp_opt->rcv_tsecr & TSMASK;
+ u32 options = tcp_opt->rcv_tsecr;
if (!tcp_opt->saw_tstamp) {
tcp_clear_options(tcp_opt);
@@ -238,22 +257,35 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
if (!sysctl_tcp_timestamps)
return false;
- tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;
- *ecn_ok = (options >> 5) & 1;
- if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)
- return false;
+ tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
if (tcp_opt->sack_ok && !sysctl_tcp_sack)
return false;
- if ((options & 0xf) == 0xf)
+ if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
return true; /* no window scaling */
tcp_opt->wscale_ok = 1;
- tcp_opt->snd_wscale = options & 0xf;
+ tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
+
return sysctl_tcp_window_scaling != 0;
}
-EXPORT_SYMBOL(cookie_check_timestamp);
+EXPORT_SYMBOL(cookie_timestamp_decode);
+
+bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
+ const struct net *net, const struct dst_entry *dst)
+{
+ bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN;
+
+ if (!ecn_ok)
+ return false;
+
+ if (net->ipv4.sysctl_tcp_ecn)
+ return true;
+
+ return dst_feature(dst, RTAX_FEATURE_ECN);
+}
+EXPORT_SYMBOL(cookie_ecn_ok);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
@@ -269,14 +301,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
int mss;
struct rtable *rt;
__u8 rcv_wscale;
- bool ecn_ok = false;
struct flowi4 fl4;
if (!sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
- if (tcp_synq_no_recent_overflow(sk) ||
- (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) {
+ if (tcp_synq_no_recent_overflow(sk))
+ goto out;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
+ if (mss == 0) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
@@ -287,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
memset(&tcp_opt, 0, sizeof(tcp_opt));
tcp_parse_options(skb, &tcp_opt, 0, NULL);
- if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
+ if (!cookie_timestamp_decode(&tcp_opt))
goto out;
ret = NULL;
@@ -305,7 +339,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->ir_loc_addr = ip_hdr(skb)->daddr;
ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
ireq->ir_mark = inet_request_mark(sk, skb);
- ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
ireq->wscale_ok = tcp_opt.wscale_ok;
@@ -354,6 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
dst_metric(&rt->dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
ret = get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b3c53c8b331e..e0ee384a448f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_max_reordering",
+ .data = &sysctl_tcp_max_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_dsack",
.data = &sysctl_tcp_dsack,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 38c2bcb8dd5d..3075723c729b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
int large_allowed)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 xmit_size_goal, old_size_goal;
-
- xmit_size_goal = mss_now;
-
- if (large_allowed && sk_can_gso(sk)) {
- u32 gso_size, hlen;
-
- /* Maybe we should/could use sk->sk_prot->max_header here ? */
- hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
- inet_csk(sk)->icsk_ext_hdr_len +
- tp->tcp_header_len;
-
- /* Goal is to send at least one packet per ms,
- * not one big TSO packet every 100 ms.
- * This preserves ACK clocking and is consistent
- * with tcp_tso_should_defer() heuristic.
- */
- gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
- gso_size = max_t(u32, gso_size,
- sysctl_tcp_min_tso_segs * mss_now);
-
- xmit_size_goal = min_t(u32, gso_size,
- sk->sk_gso_max_size - 1 - hlen);
-
- xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
-
- /* We try hard to avoid divides here */
- old_size_goal = tp->xmit_size_goal_segs * mss_now;
-
- if (likely(old_size_goal <= xmit_size_goal &&
- old_size_goal + mss_now > xmit_size_goal)) {
- xmit_size_goal = old_size_goal;
- } else {
- tp->xmit_size_goal_segs =
- min_t(u16, xmit_size_goal / mss_now,
- sk->sk_gso_max_segs);
- xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
- }
+ u32 new_size_goal, size_goal, hlen;
+
+ if (!large_allowed || !sk_can_gso(sk))
+ return mss_now;
+
+ /* Maybe we should/could use sk->sk_prot->max_header here ? */
+ hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+ inet_csk(sk)->icsk_ext_hdr_len +
+ tp->tcp_header_len;
+
+ new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+ new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+ /* We try hard to avoid divides here */
+ size_goal = tp->gso_segs * mss_now;
+ if (unlikely(new_size_goal < size_goal ||
+ new_size_goal >= size_goal + mss_now)) {
+ tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+ sk->sk_gso_max_segs);
+ size_goal = tp->gso_segs * mss_now;
}
- return max(xmit_size_goal, mss_now);
+ return max(size_goal, mss_now);
}
static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
@@ -1085,7 +1067,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
- struct iovec *iov;
+ const struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags, err, copied = 0;
@@ -1136,8 +1118,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
- iovlen = msg->msg_iovlen;
- iov = msg->msg_iov;
+ iovlen = msg->msg_iter.nr_segs;
+ iov = msg->msg_iter.iov;
copied = 0;
err = -EPIPE;
@@ -1349,7 +1331,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
if (len > 0) {
if (!(flags & MSG_TRUNC))
- err = memcpy_toiovec(msg->msg_iov, &c, 1);
+ err = memcpy_to_msg(msg, &c, 1);
len = 1;
} else
msg->msg_flags |= MSG_TRUNC;
@@ -1377,7 +1359,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
/* XXX -- need to support SO_PEEK_OFF */
skb_queue_walk(&sk->sk_write_queue, skb) {
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
break;
@@ -1729,7 +1711,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
user_recv = current;
tp->ucopy.task = user_recv;
- tp->ucopy.iov = msg->msg_iov;
+ tp->ucopy.msg = msg;
}
tp->ucopy.len = len;
@@ -1833,8 +1815,7 @@ do_prequeue:
}
if (!(flags & MSG_TRUNC)) {
- err = skb_copy_datagram_iovec(skb, offset,
- msg->msg_iov, used);
+ err = skb_copy_datagram_msg(skb, offset, msg, used);
if (err) {
/* Exception. Bailout! */
if (!copied)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index b1c5970d47a1..27ead0dd16bc 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,5 +1,5 @@
/*
- * Plugable TCP congestion control support and newReno
+ * Pluggable TCP congestion control support and newReno
* congestion control.
* Based on ideas from I/O scheduler support and Web100.
*
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 20de0118c98e..6b6002416a73 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -363,16 +363,28 @@ static void hystart_update(struct sock *sk, u32 delay)
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- if (!(ca->found & hystart_detect)) {
+ if (ca->found & hystart_detect)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
u32 now = bictcp_clock();
/* first detection parameter - ack-train detection */
if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
ca->last_ack = now;
- if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
+ if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
ca->found |= HYSTART_ACK_TRAIN;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
}
+ }
+ if (hystart_detect & HYSTART_DELAY) {
/* obtain the minimum delay of more than sampling packets */
if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
@@ -381,15 +393,16 @@ static void hystart_update(struct sock *sk, u32 delay)
ca->sample_cnt++;
} else {
if (ca->curr_rtt > ca->delay_min +
- HYSTART_DELAY_THRESH(ca->delay_min>>4))
+ HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
ca->found |= HYSTART_DELAY;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
}
- /*
- * Either one of two conditions are met,
- * we exit from slow start immediately.
- */
- if (ca->found & hystart_detect)
- tp->snd_ssthresh = tp->snd_cwnd;
}
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d107ee246a1d..075ab4d5af5e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_max_reordering __read_mostly = 300;
EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
@@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
if (metric > tp->reordering) {
int mib_idx;
- tp->reordering = min(TCP_MAX_REORDERING, metric);
+ tp->reordering = min(sysctl_tcp_max_reordering, metric);
/* This exciting event is worth to be remembered. 8) */
if (ts)
@@ -4367,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto err_free;
- if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
+ if (memcpy_from_msg(skb_put(skb, size), msg, size))
goto err_free;
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
@@ -4420,7 +4421,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
__set_current_state(TASK_RUNNING);
local_bh_enable();
- if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
+ if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
eaten = (chunk == skb->len);
@@ -4940,10 +4941,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
local_bh_enable();
if (skb_csum_unnecessary(skb))
- err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
+ err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
else
- err = skb_copy_and_csum_datagram_iovec(skb, hlen,
- tp->ucopy.iov);
+ err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
if (!err) {
tp->ucopy.len -= chunk;
@@ -5030,7 +5030,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
/* step 3: check security and precedence [ignored] */
/* step 4: Check for a SYN
- * RFC 5691 4.2 : Send a challenge ack
+ * RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
syn_challenge:
@@ -5853,12 +5853,12 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
struct inet_request_sock *ireq = inet_rsk(req);
if (family == AF_INET)
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
- &ireq->ir_rmt_addr, port);
+ net_dbg_ratelimited("drop open request from %pI4/%u\n",
+ &ireq->ir_rmt_addr, port);
#if IS_ENABLED(CONFIG_IPV6)
else if (family == AF_INET6)
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
- &ireq->ir_v6_rmt_addr, port);
+ net_dbg_ratelimited("drop open request from %pI6/%u\n",
+ &ireq->ir_v6_rmt_addr, port);
#endif
}
@@ -5867,7 +5867,7 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
* If we receive a SYN packet with these bits set, it means a
* network is playing bad games with TOS bits. In order to
* avoid possible false congestion notifications, we disable
- * TCP ECN negociation.
+ * TCP ECN negotiation.
*
* Exception: tcp_ca wants ECN. This is required for DCTCP
* congestion control; it requires setting ECT on all packets,
@@ -5877,20 +5877,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
*/
static void tcp_ecn_create_request(struct request_sock *req,
const struct sk_buff *skb,
- const struct sock *listen_sk)
+ const struct sock *listen_sk,
+ const struct dst_entry *dst)
{
const struct tcphdr *th = tcp_hdr(skb);
const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr;
- bool ect, need_ecn;
+ bool ect, need_ecn, ecn_ok;
if (!th_ecn)
return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
need_ecn = tcp_ca_needs_ecn(listen_sk);
+ ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
- if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+ if (!ect && !need_ecn && ecn_ok)
inet_rsk(req)->ecn_ok = 1;
else if (ect && need_ecn)
inet_rsk(req)->ecn_ok = 1;
@@ -5955,13 +5957,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
- if (!want_cookie || tmp_opt.tstamp_ok)
- tcp_ecn_create_request(req, skb, sk);
-
- if (want_cookie) {
- isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
- req->cookie_ts = tmp_opt.tstamp_ok;
- } else if (!isn) {
+ if (!want_cookie && !isn) {
/* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering
* state TIME-WAIT, and check against it before
@@ -6009,6 +6005,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_free;
}
+ tcp_ecn_create_request(req, skb, sk, dst);
+
+ if (want_cookie) {
+ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
+ if (!tmp_opt.tstamp_ok)
+ inet_rsk(req)->ecn_ok = 0;
+ }
+
tcp_rsk(req)->snt_isn = isn;
tcp_openreq_init_rwin(req, sk, dst);
fastopen = !want_cookie &&
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 147be2024290..a3f72d7fc06c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -623,6 +623,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
+ net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
hash_location = tcp_parse_md5sig_option(th);
if (!sk && hash_location) {
@@ -633,7 +634,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
+ sk1 = __inet_lookup_listener(net,
&tcp_hashinfo, ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb));
@@ -681,7 +682,6 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
- net = dev_net(skb_dst(skb)->dev);
arg.tos = ip_hdr(skb)->tos;
ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -1432,6 +1432,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
struct dst_entry *dst = sk->sk_rx_dst;
sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
dst->ops->check(dst, 0) == NULL) {
@@ -1453,6 +1454,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
+ sk_mark_napi_id(sk, skb);
if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk;
goto reset;
@@ -1664,7 +1666,7 @@ process:
if (sk_filter(sk, skb))
goto discard_and_relse;
- sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
skb->dev = NULL;
bh_lock_sock_nested(sk);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 1d191357bf88..272327134a1b 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -9,13 +9,13 @@
int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
/*
- * The root cgroup does not use res_counters, but rather,
+ * The root cgroup does not use page_counters, but rather,
* rely on the data already collected by the network
* subsystem
*/
- struct res_counter *res_parent = NULL;
- struct cg_proto *cg_proto, *parent_cg;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct page_counter *counter_parent = NULL;
+ struct cg_proto *cg_proto, *parent_cg;
cg_proto = tcp_prot.proto_cgroup(memcg);
if (!cg_proto)
@@ -29,9 +29,9 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
parent_cg = tcp_prot.proto_cgroup(parent);
if (parent_cg)
- res_parent = &parent_cg->memory_allocated;
+ counter_parent = &parent_cg->memory_allocated;
- res_counter_init(&cg_proto->memory_allocated, res_parent);
+ page_counter_init(&cg_proto->memory_allocated, counter_parent);
percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
return 0;
@@ -50,7 +50,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
-static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
+static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
{
struct cg_proto *cg_proto;
int i;
@@ -60,20 +60,17 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
if (!cg_proto)
return -EINVAL;
- if (val > RES_COUNTER_MAX)
- val = RES_COUNTER_MAX;
-
- ret = res_counter_set_limit(&cg_proto->memory_allocated, val);
+ ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
if (ret)
return ret;
for (i = 0; i < 3; i++)
- cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT,
+ cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
sysctl_tcp_mem[i]);
- if (val == RES_COUNTER_MAX)
+ if (nr_pages == PAGE_COUNTER_MAX)
clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
- else if (val != RES_COUNTER_MAX) {
+ else {
/*
* The active bit needs to be written after the static_key
* update. This is what guarantees that the socket activation
@@ -102,11 +99,20 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
return 0;
}
+enum {
+ RES_USAGE,
+ RES_LIMIT,
+ RES_MAX_USAGE,
+ RES_FAILCNT,
+};
+
+static DEFINE_MUTEX(tcp_limit_mutex);
+
static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long long val;
+ unsigned long nr_pages;
int ret = 0;
buf = strstrip(buf);
@@ -114,10 +120,12 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
switch (of_cft(of)->private) {
case RES_LIMIT:
/* see memcontrol.c */
- ret = res_counter_memparse_write_strategy(buf, &val);
+ ret = page_counter_memparse(buf, &nr_pages);
if (ret)
break;
- ret = tcp_update_limit(memcg, val);
+ mutex_lock(&tcp_limit_mutex);
+ ret = tcp_update_limit(memcg, nr_pages);
+ mutex_unlock(&tcp_limit_mutex);
break;
default:
ret = -EINVAL;
@@ -126,43 +134,36 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
return ret ?: nbytes;
}
-static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
-{
- struct cg_proto *cg_proto;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return default_val;
-
- return res_counter_read_u64(&cg_proto->memory_allocated, type);
-}
-
-static u64 tcp_read_usage(struct mem_cgroup *memcg)
-{
- struct cg_proto *cg_proto;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
-
- return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE);
-}
-
static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
u64 val;
switch (cft->private) {
case RES_LIMIT:
- val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX);
+ if (!cg_proto)
+ return PAGE_COUNTER_MAX;
+ val = cg_proto->memory_allocated.limit;
+ val *= PAGE_SIZE;
break;
case RES_USAGE:
- val = tcp_read_usage(memcg);
+ if (!cg_proto)
+ val = atomic_long_read(&tcp_memory_allocated);
+ else
+ val = page_counter_read(&cg_proto->memory_allocated);
+ val *= PAGE_SIZE;
break;
case RES_FAILCNT:
+ if (!cg_proto)
+ return 0;
+ val = cg_proto->memory_allocated.failcnt;
+ break;
case RES_MAX_USAGE:
- val = tcp_read_stat(memcg, cft->private, 0);
+ if (!cg_proto)
+ return 0;
+ val = cg_proto->memory_allocated.watermark;
+ val *= PAGE_SIZE;
break;
default:
BUG();
@@ -183,10 +184,10 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
switch (of_cft(of)->private) {
case RES_MAX_USAGE:
- res_counter_reset_max(&cg_proto->memory_allocated);
+ page_counter_reset_watermark(&cg_proto->memory_allocated);
break;
case RES_FAILCNT:
- res_counter_reset_failcnt(&cg_proto->memory_allocated);
+ cg_proto->memory_allocated.failcnt = 0;
break;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 5b90f2f447a5..9d7930ba8e0f 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -94,9 +94,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
SKB_GSO_GRE_CSUM |
SKB_GSO_IPIP |
SKB_GSO_SIT |
- SKB_GSO_MPLS |
SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a3d453b94747..65caf8b95e17 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk);
+
+ if (!use_ecn) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+ use_ecn = true;
+ }
tp->ecn_flags = 0;
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
- tcp_ca_needs_ecn(sk)) {
+
+ if (use_ecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
if (tcp_ca_needs_ecn(sk))
@@ -1515,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
((nonagle & TCP_NAGLE_CORK) ||
(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+ u32 bytes, segs;
+
+ bytes = min(sk->sk_pacing_rate >> 10,
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+ return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
const struct sk_buff *skb,
@@ -1553,7 +1583,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
- u32 in_flight, cwnd;
+ u32 in_flight, cwnd, halfcwnd;
/* Don't be strict about the congestion window for the final FIN. */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
@@ -1562,10 +1592,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
in_flight = tcp_packets_in_flight(tp);
cwnd = tp->snd_cwnd;
- if (in_flight < cwnd)
- return (cwnd - in_flight);
+ if (in_flight >= cwnd)
+ return 0;
- return 0;
+ /* For better scheduling, ensure we have at least
+ * 2 GSO packets in flight.
+ */
+ halfcwnd = max(cwnd >> 1, 1U);
+ return min(halfcwnd, cwnd - in_flight);
}
/* Initialize TSO state of a skb.
@@ -1718,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
* This algorithm is from John Heffner.
*/
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
- bool *is_cwnd_limited)
+ bool *is_cwnd_limited, u32 max_segs)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1748,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
limit = min(send_win, cong_win);
/* If a full-sized TSO skb can be sent, do it. */
- if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
- tp->xmit_size_goal_segs * tp->mss_cache))
+ if (limit >= max_segs * tp->mss_cache)
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
@@ -1946,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int cwnd_quota;
int result;
bool is_cwnd_limited = false;
+ u32 max_segs;
sent_pkts = 0;
@@ -1959,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
}
}
+ max_segs = tcp_tso_autosize(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
@@ -1984,17 +2019,30 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
break;
- if (tso_segs == 1) {
+ if (tso_segs == 1 || !max_segs) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
if (!push_one &&
- tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+ max_segs))
break;
}
+ limit = mss_now;
+ if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+ min_t(unsigned int,
+ cwnd_quota,
+ max_segs),
+ nonagle);
+
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+ break;
+
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* This allows for :
@@ -2005,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
* of queued bytes to ensure line rate.
* One example is wifi aggregation (802.11 AMPDU)
*/
- limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
- sk->sk_pacing_rate >> 10);
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
if (atomic_read(&sk->sk_wmem_alloc) > limit) {
set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -2019,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
break;
}
- limit = mss_now;
- if (tso_segs > 1 && !tcp_urg_mode(tp))
- limit = tcp_mss_split_point(sk, skb, mss_now,
- min_t(unsigned int,
- cwnd_quota,
- sk->sk_gso_max_segs),
- nonagle);
-
- if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
- break;
-
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2998,9 +3034,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
- struct sk_buff *syn_data = NULL, *data;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
+ struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
@@ -3031,48 +3067,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
/* limit to order-0 allocations */
space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
- syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
- sk->sk_allocation);
- if (syn_data == NULL)
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ if (!syn_data)
+ goto fallback;
+ syn_data->ip_summed = CHECKSUM_PARTIAL;
+ memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+ if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
+ fo->data->msg_iter.iov, 0, space))) {
+ kfree_skb(syn_data);
goto fallback;
+ }
- for (i = 0; i < iovlen && syn_data->len < space; ++i) {
- struct iovec *iov = &fo->data->msg_iov[i];
- unsigned char __user *from = iov->iov_base;
- int len = iov->iov_len;
+ /* No more data pending in inet_wait_for_connect() */
+ if (space == fo->size)
+ fo->data = NULL;
+ fo->copied = space;
- if (syn_data->len + len > space)
- len = space - syn_data->len;
- else if (i + 1 == iovlen)
- /* No more data pending in inet_wait_for_connect() */
- fo->data = NULL;
+ tcp_connect_queue_skb(sk, syn_data);
- if (skb_add_data(syn_data, from, len))
- goto fallback;
- }
+ err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- /* Queue a data-only packet after the regular SYN for retransmission */
- data = pskb_copy(syn_data, sk->sk_allocation);
- if (data == NULL)
- goto fallback;
- TCP_SKB_CB(data)->seq++;
- TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
- TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
- tcp_connect_queue_skb(sk, data);
- fo->copied = data->len;
-
- /* syn_data is about to be sent, we need to take current time stamps
- * for the packets that are in write queue : SYN packet and DATA
- */
- skb_mstamp_get(&syn->skb_mstamp);
- data->skb_mstamp = syn->skb_mstamp;
+ syn->skb_mstamp = syn_data->skb_mstamp;
- if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+ /* Now full SYN+DATA was cloned and sent (or not),
+ * remove the SYN from the original skb (syn_data)
+ * we keep in write queue in case of a retransmit, as we
+ * also have the SYN packet (with no data) in the same queue.
+ */
+ TCP_SKB_CB(syn_data)->seq++;
+ TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+ if (!err) {
tp->syn_data = (fo->copied > 0);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
goto done;
}
- syn_data = NULL;
fallback:
/* Send a regular SYN with Fast Open cookie request option */
@@ -3081,7 +3109,6 @@ fallback:
err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
if (err)
tp->syn_fastopen = 0;
- kfree_skb(syn_data);
done:
fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
return err;
@@ -3101,13 +3128,10 @@ int tcp_connect(struct sock *sk)
return 0;
}
- buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
- if (unlikely(buff == NULL))
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ if (unlikely(!buff))
return -ENOBUFS;
- /* Reserve space for headers. */
- skb_reserve(buff, MAX_TCP_HEADER);
-
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tp->retrans_stamp = tcp_time_stamp;
tcp_connect_queue_skb(sk, buff);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 9b21ae8b2e31..1829c7fbc77e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -374,17 +374,19 @@ void tcp_retransmit_timer(struct sock *sk)
*/
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
- &inet->inet_daddr,
- ntohs(inet->inet_dport), inet->inet_num,
- tp->snd_una, tp->snd_nxt);
+ net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+ &inet->inet_daddr,
+ ntohs(inet->inet_dport),
+ inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
- &sk->sk_v6_daddr,
- ntohs(inet->inet_dport), inet->inet_num,
- tp->snd_una, tp->snd_nxt);
+ net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+ &sk->sk_v6_daddr,
+ ntohs(inet->inet_dport),
+ inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
}
#endif
if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cd0db5471bb5..13b4dcf86ef6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -144,7 +144,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
- sk_nulls_for_each(sk2, node, &hslot->head)
+ sk_nulls_for_each(sk2, node, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(bitmap || udp_sk(sk2)->udp_port_hash == num) &&
@@ -152,14 +152,13 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- (*saddr_comp)(sk, sk2)) {
- if (bitmap)
- __set_bit(udp_sk(sk2)->udp_port_hash >> log,
- bitmap);
- else
+ !uid_eq(uid, sock_i_uid(sk2))) &&
+ saddr_comp(sk, sk2)) {
+ if (!bitmap)
return 1;
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
}
+ }
return 0;
}
@@ -168,10 +167,10 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
* can insert/delete a socket with local_port == num
*/
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
- struct udp_hslot *hslot2,
- struct sock *sk,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2))
+ struct udp_hslot *hslot2,
+ struct sock *sk,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2))
{
struct sock *sk2;
struct hlist_nulls_node *node;
@@ -179,7 +178,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
int res = 0;
spin_lock(&hslot2->lock);
- udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
+ udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(udp_sk(sk2)->udp_port_hash == num) &&
@@ -187,11 +186,12 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
- !uid_eq(uid, sock_i_uid(sk2))) &&
- (*saddr_comp)(sk, sk2)) {
+ !uid_eq(uid, sock_i_uid(sk2))) &&
+ saddr_comp(sk, sk2)) {
res = 1;
break;
}
+ }
spin_unlock(&hslot2->lock);
return res;
}
@@ -206,8 +206,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
* with NULL address
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
- int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2),
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2),
unsigned int hash2_nulladdr)
{
struct udp_hslot *hslot, *hslot2;
@@ -336,38 +336,45 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
}
-static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
- unsigned short hnum,
- __be16 sport, __be32 daddr, __be16 dport, int dif)
+static inline int compute_score(struct sock *sk, struct net *net,
+ __be32 saddr, unsigned short hnum, __be16 sport,
+ __be32 daddr, __be16 dport, int dif)
{
- int score = -1;
+ int score;
+ struct inet_sock *inet;
- if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
- !ipv6_only_sock(sk)) {
- struct inet_sock *inet = inet_sk(sk);
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ ipv6_only_sock(sk))
+ return -1;
- score = (sk->sk_family == PF_INET ? 2 : 1);
- if (inet->inet_rcv_saddr) {
- if (inet->inet_rcv_saddr != daddr)
- return -1;
- score += 4;
- }
- if (inet->inet_daddr) {
- if (inet->inet_daddr != saddr)
- return -1;
- score += 4;
- }
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score += 4;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score += 4;
- }
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+ inet = inet_sk(sk);
+
+ if (inet->inet_rcv_saddr) {
+ if (inet->inet_rcv_saddr != daddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score += 4;
+ }
+
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
}
+
return score;
}
@@ -378,33 +385,39 @@ static inline int compute_score2(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif)
{
- int score = -1;
+ int score;
+ struct inet_sock *inet;
+
+ if (!net_eq(sock_net(sk), net) ||
+ ipv6_only_sock(sk))
+ return -1;
- if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
- struct inet_sock *inet = inet_sk(sk);
+ inet = inet_sk(sk);
- if (inet->inet_rcv_saddr != daddr)
+ if (inet->inet_rcv_saddr != daddr ||
+ inet->inet_num != hnum)
+ return -1;
+
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
return -1;
- if (inet->inet_num != hnum)
+ score += 4;
+ }
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
return -1;
+ score += 4;
+ }
- score = (sk->sk_family == PF_INET ? 2 : 1);
- if (inet->inet_daddr) {
- if (inet->inet_daddr != saddr)
- return -1;
- score += 4;
- }
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score += 4;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score += 4;
- }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
}
+
return score;
}
@@ -1036,7 +1049,7 @@ back_from_confirm:
/* Lockless fast path for the non-corking case. */
if (!corkreq) {
- skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
+ skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
msg->msg_flags);
err = PTR_ERR(skb);
@@ -1051,7 +1064,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n"));
+ net_dbg_ratelimited("cork app bug 2\n");
err = -EINVAL;
goto out;
}
@@ -1067,7 +1080,7 @@ back_from_confirm:
do_append_data:
up->len += ulen;
- err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,
+ err = ip_append_data(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
@@ -1133,7 +1146,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
if (unlikely(!up->pending)) {
release_sock(sk);
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n"));
+ net_dbg_ratelimited("udp cork app bug 3\n");
return -EINVAL;
}
@@ -1281,12 +1294,11 @@ try_again:
}
if (skb_csum_unnecessary(skb))
- err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
- msg->msg_iov, copied);
+ err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+ msg, copied);
else {
- err = skb_copy_and_csum_datagram_iovec(skb,
- sizeof(struct udphdr),
- msg->msg_iov);
+ err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
+ msg);
if (err == -EINVAL)
goto csum_copy_err;
@@ -1445,6 +1457,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (inet_sk(sk)->inet_daddr) {
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
}
rc = sock_queue_rcv_skb(sk, skb);
@@ -1546,8 +1559,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* provided by the application."
*/
if (up->pcrlen == 0) { /* full coverage was set */
- LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n",
- UDP_SKB_CB(skb)->cscov, skb->len);
+ net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
/* The next case involves violating the min. coverage requested
@@ -1557,8 +1570,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* Therefore the above ...()->partial_cov statement is essential.
*/
if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
- LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, up->pcrlen);
goto drop;
}
}
@@ -1647,7 +1660,8 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udphdr *uh,
__be32 saddr, __be32 daddr,
- struct udp_table *udptable)
+ struct udp_table *udptable,
+ int proto)
{
struct sock *sk, *stack[256 / sizeof(struct sock *)];
struct hlist_nulls_node *node;
@@ -1656,6 +1670,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
int dif = skb->dev->ifindex;
unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+ bool inner_flushed = false;
if (use_hash2) {
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
@@ -1674,6 +1689,7 @@ start_lookup:
dif, hnum)) {
if (unlikely(count == ARRAY_SIZE(stack))) {
flush_stack(stack, count, skb, ~0);
+ inner_flushed = true;
count = 0;
}
stack[count++] = sk;
@@ -1695,7 +1711,10 @@ start_lookup:
if (count) {
flush_stack(stack, count, skb, count - 1);
} else {
- kfree_skb(skb);
+ if (!inner_flushed)
+ UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
+ consume_skb(skb);
}
return 0;
}
@@ -1777,14 +1796,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (ret > 0)
return -ret;
return 0;
- } else {
- if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
- return __udp4_lib_mcast_deliver(net, skb, uh,
- saddr, daddr, udptable);
-
- sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
}
+ if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return __udp4_lib_mcast_deliver(net, skb, uh,
+ saddr, daddr, udptable, proto);
+
+ sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk != NULL) {
int ret;
@@ -1822,11 +1840,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
return 0;
short_packet:
- LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
- proto == IPPROTO_UDPLITE ? "Lite" : "",
- &saddr, ntohs(uh->source),
- ulen, skb->len,
- &daddr, ntohs(uh->dest));
+ net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source),
+ ulen, skb->len,
+ &daddr, ntohs(uh->dest));
goto drop;
csum_error:
@@ -1834,10 +1852,10 @@ csum_error:
* RFC1122: OK. Discards the bad packet silently (as far as
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
- LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
- proto == IPPROTO_UDPLITE ? "Lite" : "",
- &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
- ulen);
+ net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
+ ulen);
UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
@@ -2027,7 +2045,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
} else {
up->corkflag = 0;
lock_sock(sk);
- (*push_pending_frames)(sk);
+ push_pending_frames(sk);
release_sock(sk);
}
break;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6480cea7aa53..d3e537ef6b7f 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -29,7 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
netdev_features_t features),
- __be16 new_protocol)
+ __be16 new_protocol, bool is_ipv6)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
@@ -39,7 +39,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t enc_features;
int udp_offset, outer_hlen;
unsigned int oldlen;
- bool need_csum;
+ bool need_csum = !!(skb_shinfo(skb)->gso_type &
+ SKB_GSO_UDP_TUNNEL_CSUM);
+ bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ bool offload_csum = false, dont_encap = (need_csum || remcsum);
oldlen = (u16)~skb->len;
@@ -52,10 +55,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
skb_set_network_header(skb, skb_inner_network_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
skb->protocol = new_protocol;
+ skb->encap_hdr_csum = need_csum;
+ skb->remcsum_offload = remcsum;
- need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
- if (need_csum)
- skb->encap_hdr_csum = 1;
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ (skb->dev->features &
+ (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM)));
/* segment inner packet. */
enc_features = skb->dev->hw_enc_features & features;
@@ -72,11 +78,21 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
do {
struct udphdr *uh;
int len;
-
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
+ __be32 delta;
+
+ if (dont_encap) {
+ skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* Only set up inner headers if we might be offloading
+ * inner checksum.
+ */
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
skb->mac_len = mac_len;
+ skb->protocol = protocol;
skb_push(skb, outer_hlen);
skb_reset_mac_header(skb);
@@ -86,19 +102,36 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
uh = udp_hdr(skb);
uh->len = htons(len);
- if (need_csum) {
- __be32 delta = htonl(oldlen + len);
+ if (!need_csum)
+ continue;
- uh->check = ~csum_fold((__force __wsum)
- ((__force u32)uh->check +
- (__force u32)delta));
+ delta = htonl(oldlen + len);
+
+ uh->check = ~csum_fold((__force __wsum)
+ ((__force u32)uh->check +
+ (__force u32)delta));
+ if (offload_csum) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ } else if (remcsum) {
+ /* Need to calculate checksum from scratch,
+ * inner checksums are never when doing
+ * remote_checksum_offload.
+ */
+
+ skb->csum = skb_checksum(skb, udp_offset,
+ skb->len - udp_offset,
+ 0);
+ uh->check = csum_fold(skb->csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
uh->check = gso_make_checksum(skb, ~uh->check);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
}
-
- skb->protocol = protocol;
} while ((skb = skb->next));
out:
return segs;
@@ -134,7 +167,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
}
segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
- protocol);
+ protocol, is_ipv6);
out_unlock:
rcu_read_unlock();
@@ -172,9 +205,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
SKB_GSO_IPIP |
- SKB_GSO_GRE | SKB_GSO_GRE_CSUM |
- SKB_GSO_MPLS) ||
+ SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
!(type & (SKB_GSO_UDP))))
goto out;