From 75e8d06d4308436055d1a78a2c02bf6328ba724d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 14 Jan 2015 15:33:57 +0100
Subject: netfilter: nf_tables: validate hooks in NAT expressions

The user can crash the kernel if it uses any of the existing NAT
expressions from the wrong hook, so add some code to validate this
when loading the rule.

This patch introduces nft_chain_validate_hooks() which is based on
an existing function in the bridge version of the reject expression.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nft_reject_bridge.c | 29 +++++------------------
 net/netfilter/nf_tables_api.c            | 18 ++++++++++++++
 net/netfilter/nft_masq.c                 | 26 ++++++++++++++-------
 net/netfilter/nft_nat.c                  | 40 ++++++++++++++++++++++++--------
 net/netfilter/nft_redir.c                | 25 +++++++++++++-------
 5 files changed, 88 insertions(+), 50 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index b0330aecbf97..3244aead0926 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -265,22 +265,12 @@ out:
 	data[NFT_REG_VERDICT].verdict = NF_DROP;
 }
 
-static int nft_reject_bridge_validate_hooks(const struct nft_chain *chain)
+static int nft_reject_bridge_validate(const struct nft_ctx *ctx,
+				      const struct nft_expr *expr,
+				      const struct nft_data **data)
 {
-	struct nft_base_chain *basechain;
-
-	if (chain->flags & NFT_BASE_CHAIN) {
-		basechain = nft_base_chain(chain);
-
-		switch (basechain->ops[0].hooknum) {
-		case NF_BR_PRE_ROUTING:
-		case NF_BR_LOCAL_IN:
-			break;
-		default:
-			return -EOPNOTSUPP;
-		}
-	}
-	return 0;
+	return nft_chain_validate_hooks(ctx->chain, (1 << NF_BR_PRE_ROUTING) |
+						    (1 << NF_BR_LOCAL_IN));
 }
 
 static int nft_reject_bridge_init(const struct nft_ctx *ctx,
@@ -290,7 +280,7 @@ static int nft_reject_bridge_init(const struct nft_ctx *ctx,
 	struct nft_reject *priv = nft_expr_priv(expr);
 	int icmp_code, err;
 
-	err = nft_reject_bridge_validate_hooks(ctx->chain);
+	err = nft_reject_bridge_validate(ctx, expr, NULL);
 	if (err < 0)
 		return err;
 
@@ -341,13 +331,6 @@ nla_put_failure:
 	return -1;
 }
 
-static int nft_reject_bridge_validate(const struct nft_ctx *ctx,
-				      const struct nft_expr *expr,
-				      const struct nft_data **data)
-{
-	return nft_reject_bridge_validate_hooks(ctx->chain);
-}
-
 static struct nft_expr_type nft_reject_bridge_type;
 static const struct nft_expr_ops nft_reject_bridge_ops = {
 	.type		= &nft_reject_bridge_type,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3b3ddb4fb9ee..7e686948ddca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3753,6 +3753,24 @@ int nft_chain_validate_dependency(const struct nft_chain *chain,
 }
 EXPORT_SYMBOL_GPL(nft_chain_validate_dependency);
 
+int nft_chain_validate_hooks(const struct nft_chain *chain,
+			     unsigned int hook_flags)
+{
+	struct nft_base_chain *basechain;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		basechain = nft_base_chain(chain);
+
+		if ((1 << basechain->ops[0].hooknum) & hook_flags)
+			return 0;
+
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
+
 /*
  * Loop detection - walk through the ruleset beginning at the destination chain
  * of a new jump until either the source chain is reached (loop) or all
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index d1ffd5eb3a9b..9aea747b43ea 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -21,6 +21,21 @@ const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
 };
 EXPORT_SYMBOL_GPL(nft_masq_policy);
 
+int nft_masq_validate(const struct nft_ctx *ctx,
+		      const struct nft_expr *expr,
+		      const struct nft_data **data)
+{
+	int err;
+
+	err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+	if (err < 0)
+		return err;
+
+	return nft_chain_validate_hooks(ctx->chain,
+				        (1 << NF_INET_POST_ROUTING));
+}
+EXPORT_SYMBOL_GPL(nft_masq_validate);
+
 int nft_masq_init(const struct nft_ctx *ctx,
 		  const struct nft_expr *expr,
 		  const struct nlattr * const tb[])
@@ -28,8 +43,8 @@ int nft_masq_init(const struct nft_ctx *ctx,
 	struct nft_masq *priv = nft_expr_priv(expr);
 	int err;
 
-	err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
-	if (err < 0)
+	err = nft_masq_validate(ctx, expr, NULL);
+	if (err)
 		return err;
 
 	if (tb[NFTA_MASQ_FLAGS] == NULL)
@@ -60,12 +75,5 @@ nla_put_failure:
 }
 EXPORT_SYMBOL_GPL(nft_masq_dump);
 
-int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
-		      const struct nft_data **data)
-{
-	return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
-}
-EXPORT_SYMBOL_GPL(nft_masq_validate);
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index aff54fb1c8a0..a0837c6c9283 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -88,17 +88,40 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
 	[NFTA_NAT_FLAGS]	 = { .type = NLA_U32 },
 };
 
-static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-			const struct nlattr * const tb[])
+static int nft_nat_validate(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nft_data **data)
 {
 	struct nft_nat *priv = nft_expr_priv(expr);
-	u32 family;
 	int err;
 
 	err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
 	if (err < 0)
 		return err;
 
+	switch (priv->type) {
+	case NFT_NAT_SNAT:
+		err = nft_chain_validate_hooks(ctx->chain,
+					       (1 << NF_INET_POST_ROUTING) |
+					       (1 << NF_INET_LOCAL_IN));
+		break;
+	case NFT_NAT_DNAT:
+		err = nft_chain_validate_hooks(ctx->chain,
+					       (1 << NF_INET_PRE_ROUTING) |
+					       (1 << NF_INET_LOCAL_OUT));
+		break;
+	}
+
+	return err;
+}
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	u32 family;
+	int err;
+
 	if (tb[NFTA_NAT_TYPE] == NULL ||
 	    (tb[NFTA_NAT_REG_ADDR_MIN] == NULL &&
 	     tb[NFTA_NAT_REG_PROTO_MIN] == NULL))
@@ -115,6 +138,10 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		return -EINVAL;
 	}
 
+	err = nft_nat_validate(ctx, expr, NULL);
+	if (err < 0)
+		return err;
+
 	if (tb[NFTA_NAT_FAMILY] == NULL)
 		return -EINVAL;
 
@@ -219,13 +246,6 @@ nla_put_failure:
 	return -1;
 }
 
-static int nft_nat_validate(const struct nft_ctx *ctx,
-			    const struct nft_expr *expr,
-			    const struct nft_data **data)
-{
-	return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
-}
-
 static struct nft_expr_type nft_nat_type;
 static const struct nft_expr_ops nft_nat_ops = {
 	.type           = &nft_nat_type,
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 9e8093f28311..d7e9e93a4e90 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -23,6 +23,22 @@ const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = {
 };
 EXPORT_SYMBOL_GPL(nft_redir_policy);
 
+int nft_redir_validate(const struct nft_ctx *ctx,
+		       const struct nft_expr *expr,
+		       const struct nft_data **data)
+{
+	int err;
+
+	err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+	if (err < 0)
+		return err;
+
+	return nft_chain_validate_hooks(ctx->chain,
+					(1 << NF_INET_PRE_ROUTING) |
+					(1 << NF_INET_LOCAL_OUT));
+}
+EXPORT_SYMBOL_GPL(nft_redir_validate);
+
 int nft_redir_init(const struct nft_ctx *ctx,
 		   const struct nft_expr *expr,
 		   const struct nlattr * const tb[])
@@ -30,7 +46,7 @@ int nft_redir_init(const struct nft_ctx *ctx,
 	struct nft_redir *priv = nft_expr_priv(expr);
 	int err;
 
-	err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+	err = nft_redir_validate(ctx, expr, NULL);
 	if (err < 0)
 		return err;
 
@@ -88,12 +104,5 @@ nla_put_failure:
 }
 EXPORT_SYMBOL_GPL(nft_redir_dump);
 
-int nft_redir_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
-		       const struct nft_data **data)
-{
-	return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
-}
-EXPORT_SYMBOL_GPL(nft_redir_validate);
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
-- 
cgit v1.2.3-59-g8ed1b


From e8781f70a5b210a1b08cff8ce05895ebcec18d83 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 21 Jan 2015 18:04:18 +0100
Subject: netfilter: nf_tables: disable preemption when restoring chain
 counters

With CONFIG_DEBUG_PREEMPT=y

[22144.496057] BUG: using smp_processor_id() in preemptible [00000000] code: iptables-compat/10406
[22144.496061] caller is debug_smp_processor_id+0x17/0x1b
[22144.496065] CPU: 2 PID: 10406 Comm: iptables-compat Not tainted 3.19.0-rc4+ #
[...]
[22144.496092] Call Trace:
[22144.496098]  [<ffffffff8145b9fa>] dump_stack+0x4f/0x7b
[22144.496104]  [<ffffffff81244f52>] check_preemption_disabled+0xd6/0xe8
[22144.496110]  [<ffffffff81244f90>] debug_smp_processor_id+0x17/0x1b
[22144.496120]  [<ffffffffa07c557e>] nft_stats_alloc+0x94/0xc7 [nf_tables]
[22144.496130]  [<ffffffffa07c73d2>] nf_tables_newchain+0x471/0x6d8 [nf_tables]
[22144.496140]  [<ffffffffa07c5ef6>] ? nft_trans_alloc+0x18/0x34 [nf_tables]
[22144.496154]  [<ffffffffa063c8da>] nfnetlink_rcv_batch+0x2b4/0x457 [nfnetlink]

Reported-by: Andreas Schultz <aschultz@tpip.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7e686948ddca..b54360634e95 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1134,9 +1134,11 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
 	/* Restore old counters on this cpu, no problem. Per-cpu statistics
 	 * are not exposed to userspace.
 	 */
+	preempt_disable();
 	stats = this_cpu_ptr(newstats);
 	stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
 	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+	preempt_enable();
 
 	return newstats;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e73ebb0881ea5534ce606c1d71b4ac44db5c6930 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 28 Jan 2015 20:01:35 -0500
Subject: tcp: stretch ACK fixes prep

LRO, GRO, delayed ACKs, and middleboxes can cause "stretch ACKs" that
cover more than the RFC-specified maximum of 2 packets. These stretch
ACKs can cause serious performance shortfalls in common congestion
control algorithms that were designed and tuned years ago with
receiver hosts that were not using LRO or GRO, and were instead
politely ACKing every other packet.

This patch series fixes Reno and CUBIC to handle stretch ACKs.

This patch prepares for the upcoming stretch ACK bug fix patches. It
adds an "acked" parameter to tcp_cong_avoid_ai() to allow for future
fixes to tcp_cong_avoid_ai() to correctly handle stretch ACKs, and
changes all congestion control algorithms to pass in 1 for the ACKed
count. It also changes tcp_slow_start() to return the number of packet
ACK "credits" that were not processed in slow start mode, and can be
processed by the congestion control module in additive increase mode.

In future patches we will fix tcp_cong_avoid_ai() to handle stretch
ACKs, and fix Reno and CUBIC handling of stretch ACKs in slow start
and additive increase mode.

Reported-by: Eyal Perry <eyalpe@mellanox.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  4 ++--
 net/ipv4/tcp_bic.c      |  2 +-
 net/ipv4/tcp_cong.c     | 11 +++++++----
 net/ipv4/tcp_cubic.c    |  2 +-
 net/ipv4/tcp_scalable.c |  3 ++-
 net/ipv4/tcp_veno.c     |  2 +-
 net/ipv4/tcp_yeah.c     |  2 +-
 7 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf76f..9d9111ef43ae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -834,8 +834,8 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
 int tcp_set_congestion_control(struct sock *sk, const char *name);
-void tcp_slow_start(struct tcp_sock *tp, u32 acked);
-void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w);
+u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
 u32 tcp_reno_ssthresh(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index bb395d46a389..c037644eafb7 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 		tcp_slow_start(tp, acked);
 	else {
 		bictcp_update(ca, tp->snd_cwnd);
-		tcp_cong_avoid_ai(tp, ca->cnt);
+		tcp_cong_avoid_ai(tp, ca->cnt, 1);
 	}
 }
 
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0dd16bc..6826017c12d1 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -291,25 +291,28 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
  * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
  * returns the leftover acks to adjust cwnd in congestion avoidance mode.
  */
-void tcp_slow_start(struct tcp_sock *tp, u32 acked)
+u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
 {
 	u32 cwnd = tp->snd_cwnd + acked;
 
 	if (cwnd > tp->snd_ssthresh)
 		cwnd = tp->snd_ssthresh + 1;
+	acked -= cwnd - tp->snd_cwnd;
 	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+	return acked;
 }
 EXPORT_SYMBOL_GPL(tcp_slow_start);
 
 /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
-void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
 {
 	if (tp->snd_cwnd_cnt >= w) {
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
 			tp->snd_cwnd++;
 		tp->snd_cwnd_cnt = 0;
 	} else {
-		tp->snd_cwnd_cnt++;
+		tp->snd_cwnd_cnt += acked;
 	}
 }
 EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
@@ -333,7 +336,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 		tcp_slow_start(tp, acked);
 	/* In dangerous area, increase slowly. */
 	else
-		tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+		tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
 }
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 6b6002416a73..df4bc4d87e58 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 		tcp_slow_start(tp, acked);
 	} else {
 		bictcp_update(ca, tp->snd_cwnd);
-		tcp_cong_avoid_ai(tp, ca->cnt);
+		tcp_cong_avoid_ai(tp, ca->cnt, 1);
 	}
 }
 
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 6824afb65d93..333bcb2415ff 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	if (tp->snd_cwnd <= tp->snd_ssthresh)
 		tcp_slow_start(tp, acked);
 	else
-		tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
+		tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+				  1);
 }
 
 static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index a4d2d2d88dca..112151eeee45 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 				/* In the "non-congestive state", increase cwnd
 				 *  every rtt.
 				 */
-				tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+				tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
 			} else {
 				/* In the "congestive state", increase cwnd
 				 * every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index cd7273218598..17d35662930d 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 
 	} else {
 		/* Reno */
-		tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+		tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
 	}
 
 	/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
-- 
cgit v1.2.3-59-g8ed1b


From 814d488c61260521b1b3cc97063700a5a6667c8f Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 28 Jan 2015 20:01:36 -0500
Subject: tcp: fix the timid additive increase on stretch ACKs

tcp_cong_avoid_ai() was too timid (snd_cwnd increased too slowly) on
"stretch ACKs" -- cases where the receiver ACKed more than 1 packet in
a single ACK. For example, suppose w is 10 and we get a stretch ACK
for 20 packets, so acked is 20. We ought to increase snd_cwnd by 2
(since acked/w = 20/10 = 2), but instead we were only increasing cwnd
by 1. This patch fixes that behavior.

Reported-by: Eyal Perry <eyalpe@mellanox.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cong.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6826017c12d1..faaee5338bea 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -304,16 +304,19 @@ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
 }
 EXPORT_SYMBOL_GPL(tcp_slow_start);
 
-/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
+ * for every packet that was ACKed.
+ */
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
 {
+	tp->snd_cwnd_cnt += acked;
 	if (tp->snd_cwnd_cnt >= w) {
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-		tp->snd_cwnd_cnt = 0;
-	} else {
-		tp->snd_cwnd_cnt += acked;
+		u32 delta = tp->snd_cwnd_cnt / w;
+
+		tp->snd_cwnd_cnt -= delta * w;
+		tp->snd_cwnd += delta;
 	}
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
 }
 EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
 
-- 
cgit v1.2.3-59-g8ed1b


From c22bdca94782f05b9337d8548bde51b2f38ef17f Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 28 Jan 2015 20:01:37 -0500
Subject: tcp: fix stretch ACK bugs in Reno

Change Reno to properly handle stretch ACKs in additive increase mode
by passing in the count of ACKed packets to tcp_cong_avoid_ai().

In addition, if snd_cwnd crosses snd_ssthresh during slow start
processing, and we then exit slow start mode, we need to carry over
any remaining "credit" for packets ACKed and apply that to additive
increase by passing this remaining "acked" count to
tcp_cong_avoid_ai().

Reported-by: Eyal Perry <eyalpe@mellanox.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cong.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index faaee5338bea..8670e68e2ce6 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -335,11 +335,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 		return;
 
 	/* In "safe" area, increase. */
-	if (tp->snd_cwnd <= tp->snd_ssthresh)
-		tcp_slow_start(tp, acked);
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
+	}
 	/* In dangerous area, increase slowly. */
-	else
-		tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+	tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
 }
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9cd981dcf174d26805a032aefa791436da709bee Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 28 Jan 2015 20:01:38 -0500
Subject: tcp: fix stretch ACK bugs in CUBIC

Change CUBIC to properly handle stretch ACKs in additive increase mode
by passing in the count of ACKed packets to tcp_cong_avoid_ai().

In addition, because we are now precisely accounting for stretch ACKs,
including delayed ACKs, we can now remove the delayed ACK tracking and
estimation code that tracked recent delayed ACK behavior in
ca->delayed_ack.

Reported-by: Eyal Perry <eyalpe@mellanox.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cubic.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index df4bc4d87e58..ffc045da2fd5 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -93,9 +93,7 @@ struct bictcp {
 	u32	epoch_start;	/* beginning of an epoch */
 	u32	ack_cnt;	/* number of acks */
 	u32	tcp_cwnd;	/* estimated tcp cwnd */
-#define ACK_RATIO_SHIFT	4
-#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
-	u16	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
+	u16	unused;
 	u8	sample_cnt;	/* number of samples to decide curr_rtt */
 	u8	found;		/* the exit point is found? */
 	u32	round_start;	/* beginning of each round */
@@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca)
 	ca->bic_K = 0;
 	ca->delay_min = 0;
 	ca->epoch_start = 0;
-	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
 	ca->ack_cnt = 0;
 	ca->tcp_cwnd = 0;
 	ca->found = 0;
@@ -205,12 +202,12 @@ static u32 cubic_root(u64 a)
 /*
  * Compute congestion window to use.
  */
-static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
 {
 	u32 delta, bic_target, max_cnt;
 	u64 offs, t;
 
-	ca->ack_cnt++;	/* count the number of ACKs */
+	ca->ack_cnt += acked;	/* count the number of ACKed packets */
 
 	if (ca->last_cwnd == cwnd &&
 	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
@@ -221,7 +218,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 
 	if (ca->epoch_start == 0) {
 		ca->epoch_start = tcp_time_stamp;	/* record beginning */
-		ca->ack_cnt = 1;			/* start counting */
+		ca->ack_cnt = acked;			/* start counting */
 		ca->tcp_cwnd = cwnd;			/* syn with cubic */
 
 		if (ca->last_max_cwnd <= cwnd) {
@@ -301,7 +298,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 		}
 	}
 
-	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
 	if (ca->cnt == 0)			/* cannot be zero */
 		ca->cnt = 1;
 }
@@ -317,11 +313,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		if (hystart && after(ack, ca->end_seq))
 			bictcp_hystart_reset(sk);
-		tcp_slow_start(tp, acked);
-	} else {
-		bictcp_update(ca, tp->snd_cwnd);
-		tcp_cong_avoid_ai(tp, ca->cnt, 1);
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
 	}
+	bictcp_update(ca, tp->snd_cwnd, acked);
+	tcp_cong_avoid_ai(tp, ca->cnt, acked);
 }
 
 static u32 bictcp_recalc_ssthresh(struct sock *sk)
@@ -411,20 +408,10 @@ static void hystart_update(struct sock *sk, u32 delay)
  */
 static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 	u32 delay;
 
-	if (icsk->icsk_ca_state == TCP_CA_Open) {
-		u32 ratio = ca->delayed_ack;
-
-		ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
-		ratio += cnt;
-
-		ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
-	}
-
 	/* Some calls are for duplicates without timetamps */
 	if (rtt_us < 0)
 		return;
-- 
cgit v1.2.3-59-g8ed1b


From d6b1a8a92a1417f8859a6937d2e6ffe2dfab4e6d Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 28 Jan 2015 20:01:39 -0500
Subject: tcp: fix timing issue in CUBIC slope calculation

This patch fixes a bug in CUBIC that causes cwnd to increase slightly
too slowly when multiple ACKs arrive in the same jiffy.

If cwnd is supposed to increase at a rate of more than once per jiffy,
then CUBIC was sometimes too slow. Because the bic_target is
calculated for a future point in time, calculated with time in
jiffies, the cwnd can increase over the course of the jiffy while the
bic_target calculated as the proper CUBIC cwnd at time
t=tcp_time_stamp+rtt does not increase, because tcp_time_stamp only
increases on jiffy tick boundaries.

So since the cnt is set to:
	ca->cnt = cwnd / (bic_target - cwnd);
as cwnd increases but bic_target does not increase due to jiffy
granularity, the cnt becomes too large, causing cwnd to increase
too slowly.

For example:
- suppose at the beginning of a jiffy, cwnd=40, bic_target=44
- so CUBIC sets:
   ca->cnt =  cwnd / (bic_target - cwnd) = 40 / (44 - 40) = 40/4 = 10
- suppose we get 10 acks, each for 1 segment, so tcp_cong_avoid_ai()
   increases cwnd to 41
- so CUBIC sets:
   ca->cnt =  cwnd / (bic_target - cwnd) = 41 / (44 - 41) = 41 / 3 = 13

So now CUBIC will wait for 13 packets to be ACKed before increasing
cwnd to 42, insted of 10 as it should.

The fix is to avoid adjusting the slope (determined by ca->cnt)
multiple times within a jiffy, and instead skip to compute the Reno
cwnd, the "TCP friendliness" code path.

Reported-by: Eyal Perry <eyalpe@mellanox.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cubic.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ffc045da2fd5..4b276d1ed980 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -213,6 +213,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
 	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
 		return;
 
+	/* The CUBIC function can update ca->cnt at most once per jiffy.
+	 * On all cwnd reduction events, ca->epoch_start is set to 0,
+	 * which will force a recalculation of ca->cnt.
+	 */
+	if (ca->epoch_start && tcp_time_stamp == ca->last_time)
+		goto tcp_friendliness;
+
 	ca->last_cwnd = cwnd;
 	ca->last_time = tcp_time_stamp;
 
@@ -280,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
 	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
 		ca->cnt = 20;	/* increase cwnd 5% per RTT */
 
+tcp_friendliness:
 	/* TCP Friendly */
 	if (tcp_friendliness) {
 		u32 scale = beta_scale;
-- 
cgit v1.2.3-59-g8ed1b


From 59ccaaaa49b5b096cdc1f16706a9f931416b2332 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 28 Jan 2015 16:23:11 -0800
Subject: bridge: dont send notification when skb->len == 0 in
 rtnl_bridge_notify

Reported in: https://bugzilla.kernel.org/show_bug.cgi?id=92081

This patch avoids calling rtnl_notify if the device ndo_bridge_getlink
handler does not return any bytes in the skb.

Alternately, the skb->len check can be moved inside rtnl_notify.

For the bridge vlan case described in 92081, there is also a fix needed
in bridge driver to generate a proper notification. Will fix that in
subsequent patch.

v2: rebase patch on net tree

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9cf6fe9ddc0c..446cbaf81185 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2895,12 +2895,16 @@ static int rtnl_bridge_notify(struct net_device *dev, u16 flags)
 			goto errout;
 	}
 
+	if (!skb->len)
+		goto errout;
+
 	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
 	return 0;
 errout:
 	WARN_ON(err == -EMSGSIZE);
 	kfree_skb(skb);
-	rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+	if (err)
+		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 811230cd853d62f09ed0addd0ce9a1b9b0e13fb5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Jan 2015 05:47:11 -0800
Subject: tcp: ipv4: initialize unicast_sock sk_pacing_rate

When I added sk_pacing_rate field, I forgot to initialize its value
in the per cpu unicast_sock used in ip_send_unicast_reply()

This means that for sch_fq users, RST packets, or ACK packets sent
on behalf of TIME_WAIT sockets might be sent to slowly or even dropped
once we reach the per flow limit.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: 95bd09eb2750 ("tcp: TSO packets automatic sizing")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b50861b22b6b..38a20a9cca1a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1517,6 +1517,7 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
 		.sk_wmem_alloc	= ATOMIC_INIT(1),
 		.sk_allocation	= GFP_ATOMIC,
 		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
+		.sk_pacing_rate = ~0U,
 	},
 	.pmtudisc	= IP_PMTUDISC_WANT,
 	.uc_ttl		= -1,
-- 
cgit v1.2.3-59-g8ed1b


From 8997c27ec41127bf57421cc0205413d525421ddc Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Mon, 26 Jan 2015 22:28:13 +0100
Subject: caif: remove wrong dev_net_set() call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

src_net points to the netns where the netlink message has been received. This
netns may be different from the netns where the interface is created (because
the user may add IFLA_NET_NS_[PID|FD]). In this case, src_net is the link netns.

It seems wrong to override the netns in the newlink() handler because if it
was not already src_net, it means that the user explicitly asks to create the
netdevice in another netns.

CC: Sjur Brændeland <sjur.brandeland@stericsson.com>
CC: Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
Fixes: 8391c4aab1aa ("caif: Bugfixes in CAIF netdevice for close and flow control")
Fixes: c41254006377 ("caif-hsi: Add rtnl support")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/caif/caif_hsi.c | 1 -
 net/caif/chnl_net.c         | 1 -
 2 files changed, 2 deletions(-)

(limited to 'net')

diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index 5e40a8b68cbe..b3b922adc0e4 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -1415,7 +1415,6 @@ static int caif_hsi_newlink(struct net *src_net, struct net_device *dev,
 
 	cfhsi = netdev_priv(dev);
 	cfhsi_netlink_parms(data, cfhsi);
-	dev_net_set(cfhsi->ndev, src_net);
 
 	get_ops = symbol_get(cfhsi_get_ops);
 	if (!get_ops) {
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 4589ff67bfa9..67a4a36febd1 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -470,7 +470,6 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
 	ASSERT_RTNL();
 	caifdev = netdev_priv(dev);
 	caif_netlink_parms(data, &caifdev->conn_req);
-	dev_net_set(caifdev->netdev, src_net);
 
 	ret = register_netdevice(dev);
 	if (ret)
-- 
cgit v1.2.3-59-g8ed1b


From 3cdaa5be9e81a914e633a6be7b7d2ef75b528562 Mon Sep 17 00:00:00 2001
From: Li Wei <lw@cn.fujitsu.com>
Date: Thu, 29 Jan 2015 16:09:03 +0800
Subject: ipv4: Don't increase PMTU with Datagram Too Big message.

RFC 1191 said, "a host MUST not increase its estimate of the Path
MTU in response to the contents of a Datagram Too Big message."

Signed-off-by: Li Wei <lw@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d58dd0ec3e53..52e1f2bf0ca2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -966,6 +966,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (dst->dev->mtu < mtu)
 		return;
 
+	if (rt->rt_pmtu && rt->rt_pmtu < mtu)
+		return;
+
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
-- 
cgit v1.2.3-59-g8ed1b


From 579eb62ac35845686a7c4286c0a820b4eb1f96aa Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 18 Dec 2014 22:41:23 +0200
Subject: ipvs: rerouting to local clients is not needed anymore

commit f5a41847acc5 ("ipvs: move ip_route_me_harder for ICMP")
from 2.6.37 introduced ip_route_me_harder() call for responses to
local clients, so that we can provide valid rt_src after SNAT.
It was used by TCP to provide valid daddr for ip_send_reply().
After commit 0a5ebb8000c5 ("ipv4: Pass explicit daddr arg to
ip_send_reply()." from 3.0 this rerouting is not needed anymore
and should be avoided, especially in LOCAL_IN.

Fixes 3.12.33 crash in xfrm reported by Florian Wiessner:
"3.12.33 - BUG xfrm_selector_match+0x25/0x2f6"

Reported-by: Smart Weblications GmbH - Florian Wiessner <f.wiessner@smart-weblications.de>
Tested-by: Smart Weblications GmbH - Florian Wiessner <f.wiessner@smart-weblications.de>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 990decba1fe4..b87ca32efa0b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -659,16 +659,24 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 	return err;
 }
 
-static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
+static int ip_vs_route_me_harder(int af, struct sk_buff *skb,
+				 unsigned int hooknum)
 {
+	if (!sysctl_snat_reroute(skb))
+		return 0;
+	/* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
+	if (NF_INET_LOCAL_IN == hooknum)
+		return 0;
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0)
+		struct dst_entry *dst = skb_dst(skb);
+
+		if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
+		    ip6_route_me_harder(skb) != 0)
 			return 1;
 	} else
 #endif
-		if ((sysctl_snat_reroute(skb) ||
-		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
 		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
 			return 1;
 
@@ -791,7 +799,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 				union nf_inet_addr *snet,
 				__u8 protocol, struct ip_vs_conn *cp,
 				struct ip_vs_protocol *pp,
-				unsigned int offset, unsigned int ihl)
+				unsigned int offset, unsigned int ihl,
+				unsigned int hooknum)
 {
 	unsigned int verdict = NF_DROP;
 
@@ -821,7 +830,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 #endif
 		ip_vs_nat_icmp(skb, pp, cp, 1);
 
-	if (ip_vs_route_me_harder(af, skb))
+	if (ip_vs_route_me_harder(af, skb, hooknum))
 		goto out;
 
 	/* do the statistics and put it back */
@@ -916,7 +925,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
 
 	snet.ip = iph->saddr;
 	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
-				    pp, ciph.len, ihl);
+				    pp, ciph.len, ihl, hooknum);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -981,7 +990,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
 	snet.in6 = ciph.saddr.in6;
 	writable = ciph.len;
 	return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
-				    pp, writable, sizeof(struct ipv6hdr));
+				    pp, writable, sizeof(struct ipv6hdr),
+				    hooknum);
 }
 #endif
 
@@ -1040,7 +1050,8 @@ static inline bool is_new_conn(const struct sk_buff *skb,
  */
 static unsigned int
 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
-		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
+		unsigned int hooknum)
 {
 	struct ip_vs_protocol *pp = pd->pp;
 
@@ -1078,7 +1089,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 	 * if it came from this machine itself.  So re-compute
 	 * the routing information.
 	 */
-	if (ip_vs_route_me_harder(af, skb))
+	if (ip_vs_route_me_harder(af, skb, hooknum))
 		goto drop;
 
 	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
@@ -1181,7 +1192,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	cp = pp->conn_out_get(af, skb, &iph, 0);
 
 	if (likely(cp))
-		return handle_response(af, skb, pd, cp, &iph);
+		return handle_response(af, skb, pd, cp, &iph, hooknum);
 	if (sysctl_nat_icmp_send(net) &&
 	    (pp->protocol == IPPROTO_TCP ||
 	     pp->protocol == IPPROTO_UDP ||
-- 
cgit v1.2.3-59-g8ed1b


From f5553c19ff9058136e7082c0b1f4268e705ea538 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 29 Jan 2015 19:08:09 +0100
Subject: netfilter: nf_tables: fix leaks in error path of nf_tables_newchain()

Release statistics and module refcount on memory allocation problems.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b54360634e95..1ff04bcd4871 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1264,8 +1264,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
 		trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
 					sizeof(struct nft_trans_chain));
-		if (trans == NULL)
+		if (trans == NULL) {
+			free_percpu(stats);
 			return -ENOMEM;
+		}
 
 		nft_trans_chain_stats(trans) = stats;
 		nft_trans_chain_update(trans) = true;
@@ -1321,8 +1323,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		hookfn = type->hooks[hooknum];
 
 		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
-		if (basechain == NULL)
+		if (basechain == NULL) {
+			module_put(type->owner);
 			return -ENOMEM;
+		}
 
 		if (nla[NFTA_CHAIN_COUNTERS]) {
 			stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
-- 
cgit v1.2.3-59-g8ed1b


From 8b7c36d810c61ab16997f4387fc16291410700f8 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Thu, 29 Jan 2015 10:51:53 +0100
Subject: netlink: fix wrong subscription bitmask to group mapping in

The subscription bitmask passed via struct sockaddr_nl is converted to
the group number when calling the netlink_bind() and netlink_unbind()
callbacks.

The conversion is however incorrect since bitmask (1 << 0) needs to be
mapped to group number 1. Note that you cannot specify the group number 0
(usually known as _NONE) from setsockopt() using NETLINK_ADD_MEMBERSHIP
since this is rejected through -EINVAL.

This problem became noticeable since 97840cb ("netfilter: nfnetlink:
fix insufficient validation in nfnetlink_bind") when binding to bitmask
(1 << 0) in ctnetlink.

Reported-by: Andre Tomt <andre@tomt.net>
Reported-by: Ivan Delalande <colona@arista.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 02fdde28dada..75532efa51cd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1438,7 +1438,7 @@ static void netlink_undo_bind(int group, long unsigned int groups,
 
 	for (undo = 0; undo < group; undo++)
 		if (test_bit(undo, &groups))
-			nlk->netlink_unbind(sock_net(sk), undo);
+			nlk->netlink_unbind(sock_net(sk), undo + 1);
 }
 
 static int netlink_bind(struct socket *sock, struct sockaddr *addr,
@@ -1476,7 +1476,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
 		for (group = 0; group < nlk->ngroups; group++) {
 			if (!test_bit(group, &groups))
 				continue;
-			err = nlk->netlink_bind(net, group);
+			err = nlk->netlink_bind(net, group + 1);
 			if (!err)
 				continue;
 			netlink_undo_bind(group, groups, sk);
-- 
cgit v1.2.3-59-g8ed1b


From cfbf654efc6d78dc9812e030673b86f235bf677d Mon Sep 17 00:00:00 2001
From: Saran Maruti Ramanara <saran.neti@telus.com>
Date: Thu, 29 Jan 2015 11:05:58 +0100
Subject: net: sctp: fix passing wrong parameter header to param_type2af in
 sctp_process_param

When making use of RFC5061, section 4.2.4. for setting the primary IP
address, we're passing a wrong parameter header to param_type2af(),
resulting always in NULL being returned.

At this point, param.p points to a sctp_addip_param struct, containing
a sctp_paramhdr (type = 0xc004, length = var), and crr_id as a correlation
id. Followed by that, as also presented in RFC5061 section 4.2.4., comes
the actual sctp_addr_param, which also contains a sctp_paramhdr, but
this time with the correct type SCTP_PARAM_IPV{4,6}_ADDRESS that
param_type2af() can make use of. Since we already hold a pointer to
addr_param from previous line, just reuse it for param_type2af().

Fixes: d6de3097592b ("[SCTP]: Add the handling of "Set Primary IP Address" parameter to INIT")
Signed-off-by: Saran Maruti Ramanara <saran.neti@telus.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index e49e231cef52..06320c8c1c86 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2608,7 +2608,7 @@ do_addr_param:
 
 		addr_param = param.v + sizeof(sctp_addip_param_t);
 
-		af = sctp_get_af_specific(param_type2af(param.p->type));
+		af = sctp_get_af_specific(param_type2af(addr_param->p.type));
 		if (af == NULL)
 			break;
 
-- 
cgit v1.2.3-59-g8ed1b


From d4bcef3fbe887ff93b58da4fcf6df1eee416e8fa Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Thu, 29 Jan 2015 20:37:07 +0900
Subject: net: Fix vlan_get_protocol for stacked vlan

vlan_get_protocol() could not get network protocol if a skb has a 802.1ad
vlan tag or multiple vlans, which caused incorrect checksum calculation
in several drivers.

Fix vlan_get_protocol() to retrieve network protocol instead of incorrect
vlan protocol.

As the logic is the same as skb_network_protocol(), create a common helper
function __vlan_get_protocol() and call it from existing functions.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 60 +++++++++++++++++++++++++++++++++++++------------
 net/core/dev.c          | 31 +------------------------
 2 files changed, 47 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 515a35e2a48a..960e666c51e4 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -472,27 +472,59 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 /**
  * vlan_get_protocol - get protocol EtherType.
  * @skb: skbuff to query
+ * @type: first vlan protocol
+ * @depth: buffer to store length of eth and vlan tags in bytes
  *
  * Returns the EtherType of the packet, regardless of whether it is
  * vlan encapsulated (normal or hardware accelerated) or not.
  */
-static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
+static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type,
+					 int *depth)
 {
-	__be16 protocol = 0;
-
-	if (vlan_tx_tag_present(skb) ||
-	     skb->protocol != cpu_to_be16(ETH_P_8021Q))
-		protocol = skb->protocol;
-	else {
-		__be16 proto, *protop;
-		protop = skb_header_pointer(skb, offsetof(struct vlan_ethhdr,
-						h_vlan_encapsulated_proto),
-						sizeof(proto), &proto);
-		if (likely(protop))
-			protocol = *protop;
+	unsigned int vlan_depth = skb->mac_len;
+
+	/* if type is 802.1Q/AD then the header should already be
+	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+	 * ETH_HLEN otherwise
+	 */
+	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+		if (vlan_depth) {
+			if (WARN_ON(vlan_depth < VLAN_HLEN))
+				return 0;
+			vlan_depth -= VLAN_HLEN;
+		} else {
+			vlan_depth = ETH_HLEN;
+		}
+		do {
+			struct vlan_hdr *vh;
+
+			if (unlikely(!pskb_may_pull(skb,
+						    vlan_depth + VLAN_HLEN)))
+				return 0;
+
+			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+			type = vh->h_vlan_encapsulated_proto;
+			vlan_depth += VLAN_HLEN;
+		} while (type == htons(ETH_P_8021Q) ||
+			 type == htons(ETH_P_8021AD));
 	}
 
-	return protocol;
+	if (depth)
+		*depth = vlan_depth;
+
+	return type;
+}
+
+/**
+ * vlan_get_protocol - get protocol EtherType.
+ * @skb: skbuff to query
+ *
+ * Returns the EtherType of the packet, regardless of whether it is
+ * vlan encapsulated (normal or hardware accelerated) or not.
+ */
+static inline __be16 vlan_get_protocol(struct sk_buff *skb)
+{
+	return __vlan_get_protocol(skb, skb->protocol, NULL);
 }
 
 static inline void vlan_set_encap_proto(struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 171420e75b03..c87a2264a02b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2352,7 +2352,6 @@ EXPORT_SYMBOL(skb_checksum_help);
 
 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 {
-	unsigned int vlan_depth = skb->mac_len;
 	__be16 type = skb->protocol;
 
 	/* Tunnel gso handlers can set protocol to ethernet. */
@@ -2366,35 +2365,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 		type = eth->h_proto;
 	}
 
-	/* if skb->protocol is 802.1Q/AD then the header should already be
-	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
-	 * ETH_HLEN otherwise
-	 */
-	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
-		if (vlan_depth) {
-			if (WARN_ON(vlan_depth < VLAN_HLEN))
-				return 0;
-			vlan_depth -= VLAN_HLEN;
-		} else {
-			vlan_depth = ETH_HLEN;
-		}
-		do {
-			struct vlan_hdr *vh;
-
-			if (unlikely(!pskb_may_pull(skb,
-						    vlan_depth + VLAN_HLEN)))
-				return 0;
-
-			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
-			type = vh->h_vlan_encapsulated_proto;
-			vlan_depth += VLAN_HLEN;
-		} while (type == htons(ETH_P_8021Q) ||
-			 type == htons(ETH_P_8021AD));
-	}
-
-	*depth = vlan_depth;
-
-	return type;
+	return __vlan_get_protocol(skb, type, depth);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From bdbbb8527b6f6a358dbcb70dac247034d665b8e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 29 Jan 2015 21:35:05 -0800
Subject: ipv4: tcp: get rid of ugly unicast_sock

In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock")
I tried to address contention on a socket lock, but the solution
I chose was horrible :

commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside
of TCP stack") addressed a selinux regression.

commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1")
took care of another regression.

commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression.

commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate")
was another shot in the dark.

Really, just use a proper socket per cpu, and remove the skb_orphan()
call, to re-enable flow control.

This solves a serious problem with FQ packet scheduler when used in
hostile environments, as we do not want to allocate a flow structure
for every RST packet sent in response to a spoofed packet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h         |  2 +-
 include/net/netns/ipv4.h |  1 +
 net/ipv4/ip_output.c     | 30 +++---------------------------
 net/ipv4/tcp_ipv4.c      | 37 ++++++++++++++++++++++++++++++++-----
 4 files changed, 37 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/include/net/ip.h b/include/net/ip.h
index f7cbd703d15d..09cf5aebb283 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
 	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 24945cefc4fd..0ffef1a38efc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -52,6 +52,7 @@ struct netns_ipv4 {
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
 	unsigned int		tcp_metrics_hash_log;
+	struct sock  * __percpu	*tcp_sk;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 38a20a9cca1a..c373c0708d97 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 /*
  *	Generic function to send a packet as reply to another packet.
  *	Used to send some TCP resets/acks so far.
- *
- *	Use a fake percpu inet socket to avoid false sharing and contention.
  */
-static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
-	.sk = {
-		.__sk_common = {
-			.skc_refcnt = ATOMIC_INIT(1),
-		},
-		.sk_wmem_alloc	= ATOMIC_INIT(1),
-		.sk_allocation	= GFP_ATOMIC,
-		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
-		.sk_pacing_rate = ~0U,
-	},
-	.pmtudisc	= IP_PMTUDISC_WANT,
-	.uc_ttl		= -1,
-};
-
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
@@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
 	struct rtable *rt = skb_rtable(skb);
+	struct net *net = sock_net(sk);
 	struct sk_buff *nskb;
-	struct sock *sk;
-	struct inet_sock *inet;
 	int err;
 
 	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
@@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 	if (IS_ERR(rt))
 		return;
 
-	inet = &get_cpu_var(unicast_sock);
+	inet_sk(sk)->tos = arg->tos;
 
-	inet->tos = arg->tos;
-	sk = &inet->sk;
 	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
-	sock_net_set(sk, net);
-	__skb_queue_head_init(&sk->sk_write_queue);
 	sk->sk_sndbuf = sysctl_wmem_default;
 	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
 			     len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
 								arg->csum));
 		nskb->ip_summed = CHECKSUM_NONE;
-		skb_orphan(nskb);
 		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
 		ip_push_pending_frames(sk, &fl4);
 	}
 out:
-	put_cpu_var(unicast_sock);
-
 	ip_rt_put(rt);
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..d22f54482bab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 		arg.bound_dev_if = sk->sk_bound_dev_if;
 
 	arg.tos = ip_hdr(skb)->tos;
-	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 	if (oif)
 		arg.bound_dev_if = oif;
 	arg.tos = tos;
-	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
@@ -2428,14 +2430,39 @@ struct proto tcp_prot = {
 };
 EXPORT_SYMBOL(tcp_prot);
 
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+	free_percpu(net->ipv4.tcp_sk);
+}
+
 static int __net_init tcp_sk_init(struct net *net)
 {
+	int res, cpu;
+
+	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+	if (!net->ipv4.tcp_sk)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct sock *sk;
+
+		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+					   IPPROTO_TCP, net);
+		if (res)
+			goto fail;
+		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+	}
 	net->ipv4.sysctl_tcp_ecn = 2;
 	return 0;
-}
 
-static void __net_exit tcp_sk_exit(struct net *net)
-{
+fail:
+	tcp_sk_exit(net);
+
+	return res;
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
-- 
cgit v1.2.3-59-g8ed1b


From 0508c07f5e0c94f38afd5434e8b2a55b84553077 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Tue, 3 Feb 2015 16:36:15 -0500
Subject: ipv6: Select fragment id during UFO segmentation if not set.

If the IPv6 fragment id has not been set and we perform
fragmentation due to UFO, select a new fragment id.
We now consider a fragment id of 0 as unset and if id selection
process returns 0 (after all the pertrubations), we set it to
0x80000000, thus giving us ample space not to create collisions
with the next packet we may have to fragment.

When doing UFO integrity checking, we also select the
fragment id if it has not be set yet.   This is stored into
the skb_shinfo() thus allowing UFO to function correclty.

This patch also removes duplicate fragment id generation code
and moves ipv6_select_ident() into the header as it may be
used during GSO.

Signed-off-by: Vladislav Yasevich <vyasevic@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h     |  3 +++
 net/ipv6/ip6_output.c  | 14 --------------
 net/ipv6/output_core.c | 41 +++++++++++++++++++++++++++++++++++------
 net/ipv6/udp_offload.c | 10 +++++++++-
 4 files changed, 47 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929392b0..9bf85d34c024 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -671,6 +671,9 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
 	return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
 }
 
+u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst,
+			struct in6_addr *src);
+void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt);
 void ipv6_proxy_select_ident(struct sk_buff *skb);
 
 int ip6_dst_hoplimit(struct dst_entry *dst);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12ae48c..d28f2a2efb32 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -537,20 +537,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	skb_copy_secmark(to, from);
 }
 
-static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
-{
-	static u32 ip6_idents_hashrnd __read_mostly;
-	u32 hash, id;
-
-	net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
-
-	hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
-	hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);
-
-	id = ip_idents_reserve(hash, 1);
-	fhdr->identification = htonl(id);
-}
-
 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
 	struct sk_buff *frag;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 97f41a3e68d9..54520a0bd5e3 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -9,6 +9,24 @@
 #include <net/addrconf.h>
 #include <net/secure_seq.h>
 
+u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst, struct in6_addr *src)
+{
+	u32 hash, id;
+
+	hash = __ipv6_addr_jhash(dst, hashrnd);
+	hash = __ipv6_addr_jhash(src, hash);
+
+	/* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve,
+	 * set the hight order instead thus minimizing possible future
+	 * collisions.
+	 */
+	id = ip_idents_reserve(hash, 1);
+	if (unlikely(!id))
+		id = 1 << 31;
+
+	return id;
+}
+
 /* This function exists only for tap drivers that must support broken
  * clients requesting UFO without specifying an IPv6 fragment ID.
  *
@@ -22,7 +40,7 @@ void ipv6_proxy_select_ident(struct sk_buff *skb)
 	static u32 ip6_proxy_idents_hashrnd __read_mostly;
 	struct in6_addr buf[2];
 	struct in6_addr *addrs;
-	u32 hash, id;
+	u32 id;
 
 	addrs = skb_header_pointer(skb,
 				   skb_network_offset(skb) +
@@ -34,14 +52,25 @@ void ipv6_proxy_select_ident(struct sk_buff *skb)
 	net_get_random_once(&ip6_proxy_idents_hashrnd,
 			    sizeof(ip6_proxy_idents_hashrnd));
 
-	hash = __ipv6_addr_jhash(&addrs[1], ip6_proxy_idents_hashrnd);
-	hash = __ipv6_addr_jhash(&addrs[0], hash);
-
-	id = ip_idents_reserve(hash, 1);
-	skb_shinfo(skb)->ip6_frag_id = htonl(id);
+	id = __ipv6_select_ident(ip6_proxy_idents_hashrnd,
+				 &addrs[1], &addrs[0]);
+	skb_shinfo(skb)->ip6_frag_id = id;
 }
 EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
 
+void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
+{
+	static u32 ip6_idents_hashrnd __read_mostly;
+	u32 id;
+
+	net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
+
+	id = __ipv6_select_ident(ip6_idents_hashrnd, &rt->rt6i_dst.addr,
+				 &rt->rt6i_src.addr);
+	fhdr->identification = htonl(id);
+}
+EXPORT_SYMBOL(ipv6_select_ident);
+
 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 {
 	u16 offset = sizeof(struct ipv6hdr);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index b6aa8ed18257..a56276996b72 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -52,6 +52,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 
 		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
 
+		/* Set the IPv6 fragment id if not set yet */
+		if (!skb_shinfo(skb)->ip6_frag_id)
+			ipv6_proxy_select_ident(skb);
+
 		segs = NULL;
 		goto out;
 	}
@@ -108,7 +112,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 		fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
 		fptr->nexthdr = nexthdr;
 		fptr->reserved = 0;
-		fptr->identification = skb_shinfo(skb)->ip6_frag_id;
+		if (skb_shinfo(skb)->ip6_frag_id)
+			fptr->identification = skb_shinfo(skb)->ip6_frag_id;
+		else
+			ipv6_select_ident(fptr,
+					  (struct rt6_info *)skb_dst(skb));
 
 		/* Fragment the skb. ipv6 header and the remaining fields of the
 		 * fragment header are updated in ipv6_gso_segment()
-- 
cgit v1.2.3-59-g8ed1b


From db27ebb111e9f69efece08e4cb6a34ff980f8896 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Tue, 3 Feb 2015 08:55:58 -0500
Subject: net: rds: use correct size for max unacked packets and bytes

Max unacked packets/bytes is an int while sizeof(long) was used in the
sysctl table.

This means that when they were getting read we'd also leak kernel memory
to userspace along with the timeout values.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index c3b0cd43eb56..c173f69e1479 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -71,14 +71,14 @@ static struct ctl_table rds_sysctl_rds_table[] = {
 	{
 		.procname	= "max_unacked_packets",
 		.data		= &rds_sysctl_max_unacked_packets,
-		.maxlen         = sizeof(unsigned long),
+		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec,
 	},
 	{
 		.procname	= "max_unacked_bytes",
 		.data		= &rds_sysctl_max_unacked_bytes,
-		.maxlen         = sizeof(unsigned long),
+		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = proc_dointvec,
 	},
-- 
cgit v1.2.3-59-g8ed1b


From 3725a269815ba6dbb415feddc47da5af7d1fac58 Mon Sep 17 00:00:00 2001
From: Kenneth Klette Jonassen <kennetkl@ifi.uio.no>
Date: Tue, 3 Feb 2015 17:49:18 +0100
Subject: pkt_sched: fq: avoid hang when quantum 0

Configuring fq with quantum 0 hangs the system, presumably because of a
non-interruptible infinite loop. Either way quantum 0 does not make sense.

Reproduce with:
sudo tc qdisc add dev lo root fq quantum 0 initial_quantum 0
ping 127.0.0.1

Signed-off-by: Kenneth Klette Jonassen <kennetkl@ifi.uio.no>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 9b05924cc386..333cd94ba381 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -670,8 +670,14 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 	if (tb[TCA_FQ_FLOW_PLIMIT])
 		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
 
-	if (tb[TCA_FQ_QUANTUM])
-		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+	if (tb[TCA_FQ_QUANTUM]) {
+		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+		if (quantum > 0)
+			q->quantum = quantum;
+		else
+			err = -EINVAL;
+	}
 
 	if (tb[TCA_FQ_INITIAL_QUANTUM])
 		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
-- 
cgit v1.2.3-59-g8ed1b


From b057df24a7536cce6c372efe9d0e3d1558afedf4 Mon Sep 17 00:00:00 2001
From: Ignacy Gawędzki <ignacy.gawedzki@green-communications.fr>
Date: Tue, 3 Feb 2015 19:05:18 +0100
Subject: cls_api.c: Fix dumping of non-existing actions' stats.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In tcf_exts_dump_stats(), ensure that exts->actions is not empty before
accessing the first element of that list and calling tcf_action_copy_stats()
on it.  This fixes some random segvs when adding filters of type "basic" with
no particular action.

This also fixes the dumping of those "no-action" filters, which more often
than not made calls to tcf_action_copy_stats() fail and consequently netlink
attributes added by the caller to be removed by a call to nla_nest_cancel().

Fixes: 33be62715991 ("net_sched: act: use standard struct list_head")
Signed-off-by: Ignacy Gawędzki <ignacy.gawedzki@green-communications.fr>
Acked-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index aad6a679fb13..baef987fe2c0 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -556,8 +556,9 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 }
 EXPORT_SYMBOL(tcf_exts_change);
 
-#define tcf_exts_first_act(ext) \
-		list_first_entry(&(exts)->actions, struct tc_action, list)
+#define tcf_exts_first_act(ext)					\
+	list_first_entry_or_null(&(exts)->actions,		\
+				 struct tc_action, list)
 
 int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
 {
@@ -603,7 +604,7 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	struct tc_action *a = tcf_exts_first_act(exts);
-	if (tcf_action_copy_stats(skb, a, 1) < 0)
+	if (a != NULL && tcf_action_copy_stats(skb, a, 1) < 0)
 		return -1;
 #endif
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From d1e158e2d7a0a91110b206653f0e02376e809150 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 4 Feb 2015 15:25:09 +0100
Subject: ip6_gre: fix endianness errors in ip6gre_err

info is in network byte order, change it back to host byte order
before use. In particular, the current code sets the MTU of the tunnel
to a wrong (too big) value.

Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 13cda4c6313b..01ccc28a686f 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -417,7 +417,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (code == ICMPV6_HDR_FIELD)
 			teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
 
-		if (teli && teli == info - 2) {
+		if (teli && teli == be32_to_cpu(info) - 2) {
 			tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
 			if (tel->encap_limit == 0) {
 				net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
@@ -429,7 +429,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		}
 		break;
 	case ICMPV6_PKT_TOOBIG:
-		mtu = info - offset;
+		mtu = be32_to_cpu(info) - offset;
 		if (mtu < IPV6_MIN_MTU)
 			mtu = IPV6_MIN_MTU;
 		t->dev->mtu = mtu;
-- 
cgit v1.2.3-59-g8ed1b


From 2ce1ee1780564ba06ab4c1434aa03e347dc9169f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 4 Feb 2015 13:37:44 -0800
Subject: net: remove some sparse warnings

netdev_adjacent_add_links() and netdev_adjacent_del_links()
are static.

queue->qdisc has __rcu annotation, need to use RCU_INIT_POINTER()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index c87a2264a02b..7fe82929f509 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5294,7 +5294,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-void netdev_adjacent_add_links(struct net_device *dev)
+static void netdev_adjacent_add_links(struct net_device *dev)
 {
 	struct netdev_adjacent *iter;
 
@@ -5319,7 +5319,7 @@ void netdev_adjacent_add_links(struct net_device *dev)
 	}
 }
 
-void netdev_adjacent_del_links(struct net_device *dev)
+static void netdev_adjacent_del_links(struct net_device *dev)
 {
 	struct netdev_adjacent *iter;
 
@@ -6627,7 +6627,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 	if (!queue)
 		return NULL;
 	netdev_init_one_queue(dev, queue, NULL);
-	queue->qdisc = &noop_qdisc;
+	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
 	queue->qdisc_sleeping = &noop_qdisc;
 	rcu_assign_pointer(dev->ingress_queue, queue);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From a409caecb2e17fc475533738dd1c69b32e13fe09 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 4 Feb 2015 15:12:04 -0800
Subject: sit: fix some __be16/u16 mismatches

Fixes following sparse warnings :

net/ipv6/sit.c:1509:32: warning: incorrect type in assignment (different base types)
net/ipv6/sit.c:1509:32:    expected restricted __be16 [usertype] sport
net/ipv6/sit.c:1509:32:    got unsigned short
net/ipv6/sit.c:1514:32: warning: incorrect type in assignment (different base types)
net/ipv6/sit.c:1514:32:    expected restricted __be16 [usertype] dport
net/ipv6/sit.c:1514:32:    got unsigned short
net/ipv6/sit.c:1711:38: warning: incorrect type in argument 3 (different base types)
net/ipv6/sit.c:1711:38:    expected unsigned short [unsigned] [usertype] value
net/ipv6/sit.c:1711:38:    got restricted __be16 [usertype] sport
net/ipv6/sit.c:1713:38: warning: incorrect type in argument 3 (different base types)
net/ipv6/sit.c:1713:38:    expected unsigned short [unsigned] [usertype] value
net/ipv6/sit.c:1713:38:    got restricted __be16 [usertype] dport

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 213546bd6d5d..cdbfe5af6187 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1506,12 +1506,12 @@ static bool ipip6_netlink_encap_parms(struct nlattr *data[],
 
 	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
 		ret = true;
-		ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]);
+		ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
 	}
 
 	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
 		ret = true;
-		ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]);
+		ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
 	}
 
 	return ret;
@@ -1707,9 +1707,9 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
 			tunnel->encap.type) ||
-	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT,
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
 			tunnel->encap.sport) ||
-	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT,
+	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
 			tunnel->encap.dport) ||
 	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
 			tunnel->encap.flags))
-- 
cgit v1.2.3-59-g8ed1b