aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/ip_fragment.c31
-rw-r--r--net/ipv4/ip_output.c29
2 files changed, 47 insertions, 13 deletions
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 47fa64ee82b1..a50dc6d408d1 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -75,6 +75,7 @@ struct ipq {
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */
+ u16 max_df_size; /* largest frag with DF set seen */
int iif;
unsigned int rid;
struct inet_peer *peer;
@@ -326,6 +327,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
+ unsigned int fragsize;
int flags, offset;
int ihl, end;
int err = -ENOENT;
@@ -481,9 +483,14 @@ found:
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
+ fragsize = skb->len + ihl;
+
+ if (fragsize > qp->q.max_size)
+ qp->q.max_size = fragsize;
+
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len + ihl > qp->q.max_size)
- qp->q.max_size = skb->len + ihl;
+ fragsize > qp->max_df_size)
+ qp->max_df_size = fragsize;
if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
@@ -613,13 +620,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = qp->q.max_size;
+ IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
iph = ip_hdr(head);
- /* max_size != 0 implies at least one fragment had IP_DF set */
- iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
iph->tos |= ecn;
+
+ /* When we set IP_DF on a refragmented skb we must also force a
+ * call to ip_fragment to avoid forwarding a DF-skb of size s while
+ * original sender only sent fragments of size f (where f < s).
+ *
+ * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+ * frag seen to avoid sending tiny DF-fragments in case skb was built
+ * from one very small df-fragment and one large non-df frag.
+ */
+ if (qp->max_df_size == qp->q.max_size) {
+ IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+ iph->frag_off = htons(IP_DF);
+ } else {
+ iph->frag_off = 0;
+ }
+
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 451b009dae75..f5f5ef1cebd5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -84,6 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
int (*output)(struct sock *, struct sk_buff *));
/* Generate a checksum for an outgoing IP datagram. */
@@ -219,7 +220,8 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -227,7 +229,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
- skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+ skb_gso_network_seglen(skb) <= mtu)
return ip_finish_output2(sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
@@ -251,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
int err;
segs->next = NULL;
- err = ip_fragment(sk, segs, ip_finish_output2);
+ err = ip_fragment(sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -263,6 +265,8 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
@@ -270,11 +274,12 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
return dst_output_sk(sk, skb);
}
#endif
+ mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(sk, skb);
+ return ip_finish_output_gso(sk, skb, mtu);
- if (skb->len > ip_skb_dst_mtu(skb))
- return ip_fragment(sk, skb, ip_finish_output2);
+ if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ return ip_fragment(sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(sk, skb);
}
@@ -482,12 +487,15 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
}
static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
int (*output)(struct sock *, struct sk_buff *))
{
struct iphdr *iph = ip_hdr(skb);
- unsigned int mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ if ((iph->frag_off & htons(IP_DF)) == 0)
+ return ip_do_fragment(sk, skb, output);
+
+ if (unlikely(!skb->ignore_df ||
(IPCB(skb)->frag_max_size &&
IPCB(skb)->frag_max_size > mtu))) {
struct rtable *rt = skb_rtable(skb);
@@ -532,6 +540,8 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
+ if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+ mtu = IPCB(skb)->frag_max_size;
/*
* Setup starting values.
@@ -727,6 +737,9 @@ slow_path:
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
+ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+ iph->frag_off |= htons(IP_DF);
+
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE