From 5e33a23ba4b56c109b732d57a0a76558a37d9ec5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 5 Oct 2018 14:05:34 +0100
Subject: rxrpc: Fix some missed refs to init_net

Fix some refs to init_net that should've been changed to the appropriate
network namespace.

Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing")
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
---
 net/rxrpc/ar-internal.h | 10 ++++++----
 net/rxrpc/call_accept.c |  2 +-
 net/rxrpc/call_object.c |  4 ++--
 net/rxrpc/conn_client.c | 10 ++++++----
 net/rxrpc/input.c       |  4 ++--
 net/rxrpc/peer_object.c | 28 +++++++++++++++++-----------
 6 files changed, 34 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ef9554131434..63c43b3a2096 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -891,8 +891,9 @@ extern unsigned long rxrpc_conn_idle_client_fast_expiry;
 extern struct idr rxrpc_client_conn_ids;
 
 void rxrpc_destroy_client_conn_ids(void);
-int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *,
-		       struct sockaddr_rxrpc *, gfp_t);
+int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *,
+		       struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *,
+		       gfp_t);
 void rxrpc_expose_client_call(struct rxrpc_call *);
 void rxrpc_disconnect_client_call(struct rxrpc_call *);
 void rxrpc_put_client_conn(struct rxrpc_connection *);
@@ -1045,10 +1046,11 @@ void rxrpc_peer_keepalive_worker(struct work_struct *);
  */
 struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
 					 const struct sockaddr_rxrpc *);
-struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *,
 				     struct sockaddr_rxrpc *, gfp_t);
 struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
-void rxrpc_new_incoming_peer(struct rxrpc_local *, struct rxrpc_peer *);
+void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *,
+			     struct rxrpc_peer *);
 void rxrpc_destroy_all_peers(struct rxrpc_net *);
 struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *);
 struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *);
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 9c7f26d06a52..f55f67894465 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -287,7 +287,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
 					  (peer_tail + 1) &
 					  (RXRPC_BACKLOG_MAX - 1));
 
-			rxrpc_new_incoming_peer(local, peer);
+			rxrpc_new_incoming_peer(rx, local, peer);
 		}
 
 		/* Now allocate and set up the connection */
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 799f75b6900d..0ca2c2dfd196 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -287,7 +287,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 	/* Set up or get a connection record and set the protocol parameters,
 	 * including channel number and call ID.
 	 */
-	ret = rxrpc_connect_call(call, cp, srx, gfp);
+	ret = rxrpc_connect_call(rx, call, cp, srx, gfp);
 	if (ret < 0)
 		goto error;
 
@@ -339,7 +339,7 @@ int rxrpc_retry_client_call(struct rxrpc_sock *rx,
 	/* Set up or get a connection record and set the protocol parameters,
 	 * including channel number and call ID.
 	 */
-	ret = rxrpc_connect_call(call, cp, srx, gfp);
+	ret = rxrpc_connect_call(rx, call, cp, srx, gfp);
 	if (ret < 0)
 		goto error;
 
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 8acf74fe24c0..521189f4b666 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -276,7 +276,8 @@ dont_reuse:
  * If we return with a connection, the call will be on its waiting list.  It's
  * left to the caller to assign a channel and wake up the call.
  */
-static int rxrpc_get_client_conn(struct rxrpc_call *call,
+static int rxrpc_get_client_conn(struct rxrpc_sock *rx,
+				 struct rxrpc_call *call,
 				 struct rxrpc_conn_parameters *cp,
 				 struct sockaddr_rxrpc *srx,
 				 gfp_t gfp)
@@ -289,7 +290,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call,
 
 	_enter("{%d,%lx},", call->debug_id, call->user_call_ID);
 
-	cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp);
+	cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp);
 	if (!cp->peer)
 		goto error;
 
@@ -683,7 +684,8 @@ out:
  * find a connection for a call
  * - called in process context with IRQs enabled
  */
-int rxrpc_connect_call(struct rxrpc_call *call,
+int rxrpc_connect_call(struct rxrpc_sock *rx,
+		       struct rxrpc_call *call,
 		       struct rxrpc_conn_parameters *cp,
 		       struct sockaddr_rxrpc *srx,
 		       gfp_t gfp)
@@ -696,7 +698,7 @@ int rxrpc_connect_call(struct rxrpc_call *call,
 	rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper);
 	rxrpc_cull_active_client_conns(rxnet);
 
-	ret = rxrpc_get_client_conn(call, cp, srx, gfp);
+	ret = rxrpc_get_client_conn(rx, call, cp, srx, gfp);
 	if (ret < 0)
 		goto out;
 
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 800f5b8a1baa..c5af9955665b 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1156,12 +1156,12 @@ void rxrpc_data_ready(struct sock *udp_sk)
 	/* we'll probably need to checksum it (didn't call sock_recvmsg) */
 	if (skb_checksum_complete(skb)) {
 		rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
-		__UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
+		__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INERRORS, 0);
 		_leave(" [CSUM failed]");
 		return;
 	}
 
-	__UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
+	__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INDATAGRAMS, 0);
 
 	/* The UDP protocol already released all skb resources;
 	 * we are free to add our own data there.
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 01a9febfa367..2d39eaf19620 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -153,8 +153,10 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
  * assess the MTU size for the network interface through which this peer is
  * reached
  */
-static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
+static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx,
+				  struct rxrpc_peer *peer)
 {
+	struct net *net = sock_net(&rx->sk);
 	struct dst_entry *dst;
 	struct rtable *rt;
 	struct flowi fl;
@@ -169,7 +171,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
 	switch (peer->srx.transport.family) {
 	case AF_INET:
 		rt = ip_route_output_ports(
-			&init_net, fl4, NULL,
+			net, fl4, NULL,
 			peer->srx.transport.sin.sin_addr.s_addr, 0,
 			htons(7000), htons(7001), IPPROTO_UDP, 0, 0);
 		if (IS_ERR(rt)) {
@@ -188,7 +190,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
 		       sizeof(struct in6_addr));
 		fl6->fl6_dport = htons(7001);
 		fl6->fl6_sport = htons(7000);
-		dst = ip6_route_output(&init_net, NULL, fl6);
+		dst = ip6_route_output(net, NULL, fl6);
 		if (dst->error) {
 			_leave(" [route err %d]", dst->error);
 			return;
@@ -240,10 +242,11 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
 /*
  * Initialise peer record.
  */
-static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
+static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer,
+			    unsigned long hash_key)
 {
 	peer->hash_key = hash_key;
-	rxrpc_assess_MTU_size(peer);
+	rxrpc_assess_MTU_size(rx, peer);
 	peer->mtu = peer->if_mtu;
 	peer->rtt_last_req = ktime_get_real();
 
@@ -275,7 +278,8 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
 /*
  * Set up a new peer.
  */
-static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
+static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx,
+					    struct rxrpc_local *local,
 					    struct sockaddr_rxrpc *srx,
 					    unsigned long hash_key,
 					    gfp_t gfp)
@@ -287,7 +291,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
 	peer = rxrpc_alloc_peer(local, gfp);
 	if (peer) {
 		memcpy(&peer->srx, srx, sizeof(*srx));
-		rxrpc_init_peer(peer, hash_key);
+		rxrpc_init_peer(rx, peer, hash_key);
 	}
 
 	_leave(" = %p", peer);
@@ -299,14 +303,15 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
  * since we've already done a search in the list from the non-reentrant context
  * (the data_ready handler) that is the only place we can add new peers.
  */
-void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer)
+void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local,
+			     struct rxrpc_peer *peer)
 {
 	struct rxrpc_net *rxnet = local->rxnet;
 	unsigned long hash_key;
 
 	hash_key = rxrpc_peer_hash_key(local, &peer->srx);
 	peer->local = local;
-	rxrpc_init_peer(peer, hash_key);
+	rxrpc_init_peer(rx, peer, hash_key);
 
 	spin_lock(&rxnet->peer_hash_lock);
 	hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key);
@@ -317,7 +322,8 @@ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer)
 /*
  * obtain a remote transport endpoint for the specified address
  */
-struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx,
+				     struct rxrpc_local *local,
 				     struct sockaddr_rxrpc *srx, gfp_t gfp)
 {
 	struct rxrpc_peer *peer, *candidate;
@@ -337,7 +343,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
 		/* The peer is not yet present in hash - create a candidate
 		 * for a new record and then redo the search.
 		 */
-		candidate = rxrpc_create_peer(local, srx, hash_key, gfp);
+		candidate = rxrpc_create_peer(rx, local, srx, hash_key, gfp);
 		if (!candidate) {
 			_leave(" = NULL [nomem]");
 			return NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 2cfa2271604bb26e75b828d38f357ed084464795 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 5 Oct 2018 14:05:35 +0100
Subject: rxrpc: Fix the data_ready handler

Fix the rxrpc_data_ready() function to pick up all packets and to not miss
any.  There are two problems:

 (1) The sk_data_ready pointer on the UDP socket is set *after* it is
     bound.  This means that it's open for business before we're ready to
     dequeue packets and there's a tiny window exists in which a packet can
     sneak onto the receive queue, but we never know about it.

     Fix this by setting the pointers on the socket prior to binding it.

 (2) skb_recv_udp() will return an error (such as ENETUNREACH) if there was
     an error on the transmission side, even though we set the
     sk_error_report hook.  Because rxrpc_data_ready() returns immediately
     in such a case, it never actually removes its packet from the receive
     queue.

     Fix this by abstracting out the UDP dequeuing and checksumming into a
     separate function that keeps hammering on skb_recv_udp() until it
     returns -EAGAIN, passing the packets extracted to the remainder of the
     function.

and two potential problems:

 (3) It might be possible in some circumstances or in the future for
     packets to be being added to the UDP receive queue whilst rxrpc is
     running consuming them, so the data_ready() handler might get called
     less often than once per packet.

     Allow for this by fully draining the queue on each call as (2).

 (4) If a packet fails the checksum check, the code currently returns after
     discarding the packet without checking for more.

     Allow for this by fully draining the queue on each call as (2).

Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both")
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
---
 net/rxrpc/input.c        | 68 +++++++++++++++++++++++++++---------------------
 net/rxrpc/local_object.c | 11 ++++----
 2 files changed, 44 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index c5af9955665b..c3114fa66c92 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1121,7 +1121,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
  * shut down and the local endpoint from going away, thus sk_user_data will not
  * be cleared until this function returns.
  */
-void rxrpc_data_ready(struct sock *udp_sk)
+void rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 {
 	struct rxrpc_connection *conn;
 	struct rxrpc_channel *chan;
@@ -1130,39 +1130,11 @@ void rxrpc_data_ready(struct sock *udp_sk)
 	struct rxrpc_local *local = udp_sk->sk_user_data;
 	struct rxrpc_peer *peer = NULL;
 	struct rxrpc_sock *rx = NULL;
-	struct sk_buff *skb;
 	unsigned int channel;
-	int ret, skew = 0;
+	int skew = 0;
 
 	_enter("%p", udp_sk);
 
-	ASSERT(!irqs_disabled());
-
-	skb = skb_recv_udp(udp_sk, 0, 1, &ret);
-	if (!skb) {
-		if (ret == -EAGAIN)
-			return;
-		_debug("UDP socket error %d", ret);
-		return;
-	}
-
-	if (skb->tstamp == 0)
-		skb->tstamp = ktime_get_real();
-
-	rxrpc_new_skb(skb, rxrpc_skb_rx_received);
-
-	_net("recv skb %p", skb);
-
-	/* we'll probably need to checksum it (didn't call sock_recvmsg) */
-	if (skb_checksum_complete(skb)) {
-		rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
-		__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INERRORS, 0);
-		_leave(" [CSUM failed]");
-		return;
-	}
-
-	__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INDATAGRAMS, 0);
-
 	/* The UDP protocol already released all skb resources;
 	 * we are free to add our own data there.
 	 */
@@ -1181,6 +1153,8 @@ void rxrpc_data_ready(struct sock *udp_sk)
 		}
 	}
 
+	if (skb->tstamp == 0)
+		skb->tstamp = ktime_get_real();
 	trace_rxrpc_rx_packet(sp);
 
 	switch (sp->hdr.type) {
@@ -1398,3 +1372,37 @@ reject_packet:
 	rxrpc_reject_packet(local, skb);
 	_leave(" [badmsg]");
 }
+
+void rxrpc_data_ready(struct sock *udp_sk)
+{
+	struct sk_buff *skb;
+	int ret;
+
+	for (;;) {
+		skb = skb_recv_udp(udp_sk, 0, 1, &ret);
+		if (!skb) {
+			if (ret == -EAGAIN)
+				return;
+
+			/* If there was a transmission failure, we get an error
+			 * here that we need to ignore.
+			 */
+			_debug("UDP socket error %d", ret);
+			continue;
+		}
+
+		rxrpc_new_skb(skb, rxrpc_skb_rx_received);
+
+		/* we'll probably need to checksum it (didn't call sock_recvmsg) */
+		if (skb_checksum_complete(skb)) {
+			rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+			__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INERRORS, 0);
+			_debug("csum failed");
+			continue;
+		}
+
+		__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INDATAGRAMS, 0);
+
+		rxrpc_input_packet(udp_sk, skb);
+	}
+}
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 94d234e9c685..30862f44c9f1 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -122,6 +122,12 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 		return ret;
 	}
 
+	/* set the socket up */
+	sock = local->socket->sk;
+	sock->sk_user_data	= local;
+	sock->sk_data_ready	= rxrpc_data_ready;
+	sock->sk_error_report	= rxrpc_error_report;
+
 	/* if a local address was supplied then bind it */
 	if (local->srx.transport_len > sizeof(sa_family_t)) {
 		_debug("bind");
@@ -191,11 +197,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 		BUG();
 	}
 
-	/* set the socket up */
-	sock = local->socket->sk;
-	sock->sk_user_data	= local;
-	sock->sk_data_ready	= rxrpc_data_ready;
-	sock->sk_error_report	= rxrpc_error_report;
 	_leave(" = 0");
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From 329e09893909d409039f6a79757d9b80b67efe39 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 5 Oct 2018 16:21:46 -0700
Subject: treewide: Replace more open-coded allocation size multiplications

As done treewide earlier, this catches several more open-coded
allocation size calculations that were added to the kernel during the
merge window. This performs the following mechanical transformations
using Coccinelle:

	kvmalloc(a * b, ...) -> kvmalloc_array(a, b, ...)
	kvzalloc(a * b, ...) -> kvcalloc(a, b, ...)
	devm_kzalloc(..., a * b, ...) -> devm_kcalloc(..., a, b, ...)

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/bluetooth/hci_qca.c                 |  2 +-
 drivers/crypto/inside-secure/safexcel.c     |  8 +++++---
 drivers/gpu/drm/mediatek/mtk_drm_crtc.c     |  2 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_io_util.c |  4 ++--
 drivers/hwmon/npcm750-pwm-fan.c             |  2 +-
 drivers/md/dm-integrity.c                   |  3 ++-
 drivers/net/wireless/mediatek/mt76/usb.c    | 10 +++++-----
 drivers/pci/controller/pcie-cadence.c       |  4 ++--
 drivers/tty/serial/qcom_geni_serial.c       |  4 ++--
 net/sched/sch_cake.c                        |  2 +-
 10 files changed, 22 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index e182f6019f68..2fee65886d50 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -1322,7 +1322,7 @@ static int qca_init_regulators(struct qca_power *qca,
 {
 	int i;
 
-	qca->vreg_bulk = devm_kzalloc(qca->dev, num_vregs *
+	qca->vreg_bulk = devm_kcalloc(qca->dev, num_vregs,
 				      sizeof(struct regulator_bulk_data),
 				      GFP_KERNEL);
 	if (!qca->vreg_bulk)
diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c
index 7e71043457a6..86c699c14f84 100644
--- a/drivers/crypto/inside-secure/safexcel.c
+++ b/drivers/crypto/inside-secure/safexcel.c
@@ -1044,7 +1044,8 @@ static int safexcel_probe(struct platform_device *pdev)
 
 	safexcel_configure(priv);
 
-	priv->ring = devm_kzalloc(dev, priv->config.rings * sizeof(*priv->ring),
+	priv->ring = devm_kcalloc(dev, priv->config.rings,
+				  sizeof(*priv->ring),
 				  GFP_KERNEL);
 	if (!priv->ring) {
 		ret = -ENOMEM;
@@ -1063,8 +1064,9 @@ static int safexcel_probe(struct platform_device *pdev)
 		if (ret)
 			goto err_reg_clk;
 
-		priv->ring[i].rdr_req = devm_kzalloc(dev,
-			sizeof(priv->ring[i].rdr_req) * EIP197_DEFAULT_RING_SIZE,
+		priv->ring[i].rdr_req = devm_kcalloc(dev,
+			EIP197_DEFAULT_RING_SIZE,
+			sizeof(priv->ring[i].rdr_req),
 			GFP_KERNEL);
 		if (!priv->ring[i].rdr_req) {
 			ret = -ENOMEM;
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
index 0b976dfd04df..92ecb9bf982c 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
@@ -600,7 +600,7 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev,
 	}
 
 	mtk_crtc->layer_nr = mtk_ddp_comp_layer_nr(mtk_crtc->ddp_comp[0]);
-	mtk_crtc->planes = devm_kzalloc(dev, mtk_crtc->layer_nr *
+	mtk_crtc->planes = devm_kcalloc(dev, mtk_crtc->layer_nr,
 					sizeof(struct drm_plane),
 					GFP_KERNEL);
 
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_io_util.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_io_util.c
index 790d39f816dc..b557687b1964 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_io_util.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_io_util.c
@@ -153,8 +153,8 @@ int msm_dss_parse_clock(struct platform_device *pdev,
 		return 0;
 	}
 
-	mp->clk_config = devm_kzalloc(&pdev->dev,
-				      sizeof(struct dss_clk) * num_clk,
+	mp->clk_config = devm_kcalloc(&pdev->dev,
+				      num_clk, sizeof(struct dss_clk),
 				      GFP_KERNEL);
 	if (!mp->clk_config)
 		return -ENOMEM;
diff --git a/drivers/hwmon/npcm750-pwm-fan.c b/drivers/hwmon/npcm750-pwm-fan.c
index 8474d601aa63..b998f9fbed41 100644
--- a/drivers/hwmon/npcm750-pwm-fan.c
+++ b/drivers/hwmon/npcm750-pwm-fan.c
@@ -908,7 +908,7 @@ static int npcm7xx_en_pwm_fan(struct device *dev,
 	if (fan_cnt < 1)
 		return -EINVAL;
 
-	fan_ch = devm_kzalloc(dev, sizeof(*fan_ch) * fan_cnt, GFP_KERNEL);
+	fan_ch = devm_kcalloc(dev, fan_cnt, sizeof(*fan_ch), GFP_KERNEL);
 	if (!fan_ch)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 378878599466..244ff5580f82 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -3462,7 +3462,8 @@ try_smaller_buffer:
 			r = -ENOMEM;
 			goto bad;
 		}
-		ic->recalc_tags = kvmalloc((RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size, GFP_KERNEL);
+		ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block,
+						 ic->tag_size, GFP_KERNEL);
 		if (!ic->recalc_tags) {
 			ti->error = "Cannot allocate tags for recalculating";
 			r = -ENOMEM;
diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c
index 7780b07543bb..79e59f2379a2 100644
--- a/drivers/net/wireless/mediatek/mt76/usb.c
+++ b/drivers/net/wireless/mediatek/mt76/usb.c
@@ -258,7 +258,7 @@ int mt76u_buf_alloc(struct mt76_dev *dev, struct mt76u_buf *buf,
 	if (!buf->urb)
 		return -ENOMEM;
 
-	buf->urb->sg = devm_kzalloc(dev->dev, nsgs * sizeof(*buf->urb->sg),
+	buf->urb->sg = devm_kcalloc(dev->dev, nsgs, sizeof(*buf->urb->sg),
 				    gfp);
 	if (!buf->urb->sg)
 		return -ENOMEM;
@@ -464,8 +464,8 @@ static int mt76u_alloc_rx(struct mt76_dev *dev)
 	int i, err, nsgs;
 
 	spin_lock_init(&q->lock);
-	q->entry = devm_kzalloc(dev->dev,
-				MT_NUM_RX_ENTRIES * sizeof(*q->entry),
+	q->entry = devm_kcalloc(dev->dev,
+				MT_NUM_RX_ENTRIES, sizeof(*q->entry),
 				GFP_KERNEL);
 	if (!q->entry)
 		return -ENOMEM;
@@ -717,8 +717,8 @@ static int mt76u_alloc_tx(struct mt76_dev *dev)
 		INIT_LIST_HEAD(&q->swq);
 		q->hw_idx = q2hwq(i);
 
-		q->entry = devm_kzalloc(dev->dev,
-					MT_NUM_TX_ENTRIES * sizeof(*q->entry),
+		q->entry = devm_kcalloc(dev->dev,
+					MT_NUM_TX_ENTRIES, sizeof(*q->entry),
 					GFP_KERNEL);
 		if (!q->entry)
 			return -ENOMEM;
diff --git a/drivers/pci/controller/pcie-cadence.c b/drivers/pci/controller/pcie-cadence.c
index 86f1b002c846..975bcdd6b5c0 100644
--- a/drivers/pci/controller/pcie-cadence.c
+++ b/drivers/pci/controller/pcie-cadence.c
@@ -180,11 +180,11 @@ int cdns_pcie_init_phy(struct device *dev, struct cdns_pcie *pcie)
 		return 0;
 	}
 
-	phy = devm_kzalloc(dev, sizeof(*phy) * phy_count, GFP_KERNEL);
+	phy = devm_kcalloc(dev, phy_count, sizeof(*phy), GFP_KERNEL);
 	if (!phy)
 		return -ENOMEM;
 
-	link = devm_kzalloc(dev, sizeof(*link) * phy_count, GFP_KERNEL);
+	link = devm_kcalloc(dev, phy_count, sizeof(*link), GFP_KERNEL);
 	if (!link)
 		return -ENOMEM;
 
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 29ec34387246..1515074e18fb 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -868,8 +868,8 @@ static int qcom_geni_serial_port_setup(struct uart_port *uport)
 	geni_se_init(&port->se, port->rx_wm, port->rx_rfr);
 	geni_se_select_mode(&port->se, port->xfer_mode);
 	if (!uart_console(uport)) {
-		port->rx_fifo = devm_kzalloc(uport->dev,
-			port->rx_fifo_depth * sizeof(u32), GFP_KERNEL);
+		port->rx_fifo = devm_kcalloc(uport->dev,
+			port->rx_fifo_depth, sizeof(u32), GFP_KERNEL);
 		if (!port->rx_fifo)
 			return -ENOMEM;
 	}
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index c07c30b916d5..793016d722ec 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -2644,7 +2644,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
 	for (i = 1; i <= CAKE_QUEUES; i++)
 		quantum_div[i] = 65535 / i;
 
-	q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
+	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
 			   GFP_KERNEL);
 	if (!q->tins)
 		goto nomem;
-- 
cgit v1.2.3-59-g8ed1b


From 7e823644b60555f70f241274b8d0120dd919269a Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 4 Oct 2018 13:37:32 +0200
Subject: udp: Unbreak modules that rely on external __skb_recv_udp()
 availability

Commit 2276f58ac589 ("udp: use a separate rx queue for packet reception")
turned static inline __skb_recv_udp() from being a trivial helper around
__skb_recv_datagram() into a UDP specific implementaion, making it
EXPORT_SYMBOL_GPL() at the same time.

There are external modules that got broken by __skb_recv_udp() not being
visible to them. Let's unbreak them by making __skb_recv_udp EXPORT_SYMBOL().

Rationale (one of those) why this is actually "technically correct" thing
to do: __skb_recv_udp() used to be an inline wrapper around
__skb_recv_datagram(), which itself (still, and correctly so, I believe)
is EXPORT_SYMBOL().

Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7d69dd6fa7e8..c32a4c16b7ff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1627,7 +1627,7 @@ busy_check:
 	*err = error;
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__skb_recv_udp);
+EXPORT_SYMBOL(__skb_recv_udp);
 
 /*
  * 	This should be easy, if there is something there we
-- 
cgit v1.2.3-59-g8ed1b


From 6d4c407744dd0338da5d5d76f40dce5adabfb30a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Oct 2018 07:40:17 -0400
Subject: net: sched: cls_u32: fix hnode refcounting

cls_u32.c misuses refcounts for struct tc_u_hnode - it counts references
via ->hlist and via ->tp_root together.  u32_destroy() drops the former
and, in case when there had been links, leaves the sucker on the list.
As the result, there's nothing to protect it from getting freed once links
are dropped.
That also makes the "is it busy" check incapable of catching the root
hnode - it *is* busy (there's a reference from tp), but we don't see it as
something separate.  "Is it our root?" check partially covers that, but
the problem exists for others' roots as well.

AFAICS, the minimal fix preserving the existing behaviour (where it doesn't
include oopsen, that is) would be this:
        * count tp->root and tp_c->hlist as separate references.  I.e.
have u32_init() set refcount to 2, not 1.
	* in u32_destroy() we always drop the former;
in u32_destroy_hnode() - the latter.

	That way we have *all* references contributing to refcount.  List
removal happens in u32_destroy_hnode() (called only when ->refcnt is 1)
an in u32_destroy() in case of tc_u_common going away, along with
everything reachable from it.  IOW, that way we know that
u32_destroy_key() won't free something still on the list (or pointed to by
someone's ->root).

Reproducer:

tc qdisc add dev eth0 ingress
tc filter add dev eth0 parent ffff: protocol ip prio 100 handle 1: \
u32 divisor 1
tc filter add dev eth0 parent ffff: protocol ip prio 200 handle 2: \
u32 divisor 1
tc filter add dev eth0 parent ffff: protocol ip prio 100 \
handle 1:0:11 u32 ht 1: link 801: offset at 0 mask 0f00 shift 6 \
plus 0 eat match ip protocol 6 ff
tc filter delete dev eth0 parent ffff: protocol ip prio 200
tc filter change dev eth0 parent ffff: protocol ip prio 100 \
handle 1:0:11 u32 ht 1: link 0: offset at 0 mask 0f00 shift 6 plus 0 \
eat match ip protocol 6 ff
tc filter delete dev eth0 parent ffff: protocol ip prio 100

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index f218ccf1e2d9..b2c3406a2cf2 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -398,6 +398,7 @@ static int u32_init(struct tcf_proto *tp)
 	rcu_assign_pointer(tp_c->hlist, root_ht);
 	root_ht->tp_c = tp_c;
 
+	root_ht->refcnt++;
 	rcu_assign_pointer(tp->root, root_ht);
 	tp->data = tp_c;
 	return 0;
@@ -610,7 +611,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 	struct tc_u_hnode __rcu **hn;
 	struct tc_u_hnode *phn;
 
-	WARN_ON(ht->refcnt);
+	WARN_ON(--ht->refcnt);
 
 	u32_clear_hnode(tp, ht, extack);
 
@@ -649,7 +650,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 
 	WARN_ON(root_ht == NULL);
 
-	if (root_ht && --root_ht->refcnt == 0)
+	if (root_ht && --root_ht->refcnt == 1)
 		u32_destroy_hnode(tp, root_ht, extack);
 
 	if (--tp_c->refcnt == 0) {
@@ -698,7 +699,6 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 	}
 
 	if (ht->refcnt == 1) {
-		ht->refcnt--;
 		u32_destroy_hnode(tp, ht, extack);
 	} else {
 		NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
@@ -708,11 +708,11 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 out:
 	*last = true;
 	if (root_ht) {
-		if (root_ht->refcnt > 1) {
+		if (root_ht->refcnt > 2) {
 			*last = false;
 			goto ret;
 		}
-		if (root_ht->refcnt == 1) {
+		if (root_ht->refcnt == 2) {
 			if (!ht_empty(root_ht)) {
 				*last = false;
 				goto ret;
-- 
cgit v1.2.3-59-g8ed1b


From 5271953cad31b97dea80f848c16e96ad66401199 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 4 Oct 2018 11:10:51 +0100
Subject: rxrpc: Use the UDP encap_rcv hook

Use the UDP encap_rcv hook to cut the bit out of the rxrpc packet reception
in which a packet is placed onto the UDP receive queue and then immediately
removed again by rxrpc.  Going via the queue in this manner seems like it
should be unnecessary.

This does, however, require the invention of a value to place in encap_type
as that's one of the conditions to switch packets out to the encap_rcv
hook.  Possibly the value doesn't actually matter for anything other than
sockopts on the UDP socket, which aren't accessible outside of rxrpc
anyway.

This seems to cut a bit of time out of the time elapsed between each
sk_buff being timestamped and turning up in rxrpc (the final number in the
following trace excerpts).  I measured this by making the rxrpc_rx_packet
trace point print the time elapsed between the skb being timestamped and
the current time (in ns), e.g.:

	... 424.278721: rxrpc_rx_packet: ...  ACK 25026

So doing a 512MiB DIO read from my test server, with an unmodified kernel:

	N       min     max     sum		mean    stddev
	27605   2626    7581    7.83992e+07     2840.04 181.029

and with the patch applied:

	N       min     max     sum		mean    stddev
	27547   1895    12165   6.77461e+07     2459.29 255.02

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/uapi/linux/udp.h |  1 +
 net/rxrpc/ar-internal.h  |  2 +-
 net/rxrpc/input.c        | 50 ++++++++++++------------------------------------
 net/rxrpc/local_object.c | 27 +++++++++++++++++++++-----
 4 files changed, 36 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 09d00f8c442b..09502de447f5 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -40,5 +40,6 @@ struct udphdr {
 #define UDP_ENCAP_L2TPINUDP	3 /* rfc2661 */
 #define UDP_ENCAP_GTP0		4 /* GSM TS 09.60 */
 #define UDP_ENCAP_GTP1U		5 /* 3GPP TS 29.060 */
+#define UDP_ENCAP_RXRPC		6
 
 #endif /* _UAPI_LINUX_UDP_H */
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 63c43b3a2096..ab60c0313fd4 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -966,7 +966,7 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
 /*
  * input.c
  */
-void rxrpc_data_ready(struct sock *);
+int rxrpc_input_packet(struct sock *, struct sk_buff *);
 
 /*
  * insecure.c
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index c3114fa66c92..1866aeef2284 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1121,7 +1121,7 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
  * shut down and the local endpoint from going away, thus sk_user_data will not
  * be cleared until this function returns.
  */
-void rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
+int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 {
 	struct rxrpc_connection *conn;
 	struct rxrpc_channel *chan;
@@ -1135,6 +1135,13 @@ void rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 
 	_enter("%p", udp_sk);
 
+	if (skb->tstamp == 0)
+		skb->tstamp = ktime_get_real();
+
+	rxrpc_new_skb(skb, rxrpc_skb_rx_received);
+
+	skb_pull(skb, sizeof(struct udphdr));
+
 	/* The UDP protocol already released all skb resources;
 	 * we are free to add our own data there.
 	 */
@@ -1148,8 +1155,8 @@ void rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 		static int lose;
 		if ((lose++ & 7) == 7) {
 			trace_rxrpc_rx_lose(sp);
-			rxrpc_lose_skb(skb, rxrpc_skb_rx_lost);
-			return;
+			rxrpc_free_skb(skb, rxrpc_skb_rx_lost);
+			return 0;
 		}
 	}
 
@@ -1332,7 +1339,7 @@ discard:
 	rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
 out:
 	trace_rxrpc_rx_done(0, 0);
-	return;
+	return 0;
 
 out_unlock:
 	rcu_read_unlock();
@@ -1371,38 +1378,5 @@ reject_packet:
 	trace_rxrpc_rx_done(skb->mark, skb->priority);
 	rxrpc_reject_packet(local, skb);
 	_leave(" [badmsg]");
-}
-
-void rxrpc_data_ready(struct sock *udp_sk)
-{
-	struct sk_buff *skb;
-	int ret;
-
-	for (;;) {
-		skb = skb_recv_udp(udp_sk, 0, 1, &ret);
-		if (!skb) {
-			if (ret == -EAGAIN)
-				return;
-
-			/* If there was a transmission failure, we get an error
-			 * here that we need to ignore.
-			 */
-			_debug("UDP socket error %d", ret);
-			continue;
-		}
-
-		rxrpc_new_skb(skb, rxrpc_skb_rx_received);
-
-		/* we'll probably need to checksum it (didn't call sock_recvmsg) */
-		if (skb_checksum_complete(skb)) {
-			rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
-			__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INERRORS, 0);
-			_debug("csum failed");
-			continue;
-		}
-
-		__UDP_INC_STATS(sock_net(udp_sk), UDP_MIB_INDATAGRAMS, 0);
-
-		rxrpc_input_packet(udp_sk, skb);
-	}
+	return 0;
 }
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 30862f44c9f1..cad0691c2bb4 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -19,6 +19,7 @@
 #include <linux/ip.h>
 #include <linux/hashtable.h>
 #include <net/sock.h>
+#include <net/udp.h>
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
 
@@ -108,7 +109,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
  */
 static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 {
-	struct sock *sock;
+	struct sock *usk;
 	int ret, opt;
 
 	_enter("%p{%d,%d}",
@@ -123,10 +124,26 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
 	}
 
 	/* set the socket up */
-	sock = local->socket->sk;
-	sock->sk_user_data	= local;
-	sock->sk_data_ready	= rxrpc_data_ready;
-	sock->sk_error_report	= rxrpc_error_report;
+	usk = local->socket->sk;
+	inet_sk(usk)->mc_loop = 0;
+
+	/* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
+	inet_inc_convert_csum(usk);
+
+	rcu_assign_sk_user_data(usk, local);
+
+	udp_sk(usk)->encap_type = UDP_ENCAP_RXRPC;
+	udp_sk(usk)->encap_rcv = rxrpc_input_packet;
+	udp_sk(usk)->encap_destroy = NULL;
+	udp_sk(usk)->gro_receive = NULL;
+	udp_sk(usk)->gro_complete = NULL;
+
+	udp_encap_enable();
+#if IS_ENABLED(CONFIG_IPV6)
+	if (local->srx.transport.family == AF_INET6)
+		udpv6_encap_enable();
+#endif
+	usk->sk_error_report = rxrpc_error_report;
 
 	/* if a local address was supplied then bind it */
 	if (local->srx.transport_len > sizeof(sa_family_t)) {
-- 
cgit v1.2.3-59-g8ed1b


From bfd2821117a751763718f1b5e57216c0d9b19a49 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 8 Oct 2018 15:45:56 +0100
Subject: rxrpc: Don't need to take the RCU read lock in the packet receiver

We don't need to take the RCU read lock in the rxrpc packet receive
function because it's held further up the stack in the IP input routine
around the UDP receive routines.

Fix this by dropping the RCU read lock calls from rxrpc_input_packet().
This simplifies the code.

Fixes: 70790dbe3f66 ("rxrpc: Pass the last Tx packet marker in the annotation buffer")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 41 +++++++++++++----------------------------
 1 file changed, 13 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 1866aeef2284..2dcef482acf2 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1120,6 +1120,8 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
  * The socket is locked by the caller and this prevents the socket from being
  * shut down and the local endpoint from going away, thus sk_user_data will not
  * be cleared until this function returns.
+ *
+ * Called with the RCU read lock held from the IP layer via UDP.
  */
 int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 {
@@ -1215,8 +1217,6 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 	if (sp->hdr.serviceId == 0)
 		goto bad_message;
 
-	rcu_read_lock();
-
 	if (rxrpc_to_server(sp)) {
 		/* Weed out packets to services we're not offering.  Packets
 		 * that would begin a call are explicitly rejected and the rest
@@ -1228,7 +1228,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 			if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
 			    sp->hdr.seq == 1)
 				goto unsupported_service;
-			goto discard_unlock;
+			goto discard;
 		}
 	}
 
@@ -1248,7 +1248,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 			/* Connection-level packet */
 			_debug("CONN %p {%d}", conn, conn->debug_id);
 			rxrpc_post_packet_to_conn(conn, skb);
-			goto out_unlock;
+			goto out;
 		}
 
 		/* Note the serial number skew here */
@@ -1267,19 +1267,19 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 
 		/* Ignore really old calls */
 		if (sp->hdr.callNumber < chan->last_call)
-			goto discard_unlock;
+			goto discard;
 
 		if (sp->hdr.callNumber == chan->last_call) {
 			if (chan->call ||
 			    sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)
-				goto discard_unlock;
+				goto discard;
 
 			/* For the previous service call, if completed
 			 * successfully, we discard all further packets.
 			 */
 			if (rxrpc_conn_is_service(conn) &&
 			    chan->last_type == RXRPC_PACKET_TYPE_ACK)
-				goto discard_unlock;
+				goto discard;
 
 			/* But otherwise we need to retransmit the final packet
 			 * from data cached in the connection record.
@@ -1290,16 +1290,14 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 						    sp->hdr.serial,
 						    sp->hdr.flags, 0);
 			rxrpc_post_packet_to_conn(conn, skb);
-			goto out_unlock;
+			goto out;
 		}
 
 		call = rcu_dereference(chan->call);
 
 		if (sp->hdr.callNumber > chan->call_id) {
-			if (rxrpc_to_client(sp)) {
-				rcu_read_unlock();
+			if (rxrpc_to_client(sp))
 				goto reject_packet;
-			}
 			if (call)
 				rxrpc_input_implicit_end_call(conn, call);
 			call = NULL;
@@ -1318,55 +1316,42 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 	if (!call || atomic_read(&call->usage) == 0) {
 		if (rxrpc_to_client(sp) ||
 		    sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
-			goto bad_message_unlock;
+			goto bad_message;
 		if (sp->hdr.seq != 1)
-			goto discard_unlock;
+			goto discard;
 		call = rxrpc_new_incoming_call(local, rx, peer, conn, skb);
-		if (!call) {
-			rcu_read_unlock();
+		if (!call)
 			goto reject_packet;
-		}
 		rxrpc_send_ping(call, skb, skew);
 		mutex_unlock(&call->user_mutex);
 	}
 
 	rxrpc_input_call_packet(call, skb, skew);
-	goto discard_unlock;
+	goto discard;
 
-discard_unlock:
-	rcu_read_unlock();
 discard:
 	rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
 out:
 	trace_rxrpc_rx_done(0, 0);
 	return 0;
 
-out_unlock:
-	rcu_read_unlock();
-	goto out;
-
 wrong_security:
-	rcu_read_unlock();
 	trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RXKADINCONSISTENCY, EBADMSG);
 	skb->priority = RXKADINCONSISTENCY;
 	goto post_abort;
 
 unsupported_service:
-	rcu_read_unlock();
 	trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RX_INVALID_OPERATION, EOPNOTSUPP);
 	skb->priority = RX_INVALID_OPERATION;
 	goto post_abort;
 
 reupgrade:
-	rcu_read_unlock();
 	trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RX_PROTOCOL_ERROR, EBADMSG);
 	goto protocol_error;
 
-bad_message_unlock:
-	rcu_read_unlock();
 bad_message:
 	trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RX_PROTOCOL_ERROR, EBADMSG);
-- 
cgit v1.2.3-59-g8ed1b


From c479d5f2c2e1ce609da08c075054440d97ddff52 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 8 Oct 2018 15:46:01 +0100
Subject: rxrpc: Don't check RXRPC_CALL_TX_LAST after calling
 rxrpc_rotate_tx_window()

We should only call the function to end a call's Tx phase if we rotated the
marked-last packet out of the transmission buffer.

Make rxrpc_rotate_tx_window() return an indication of whether it just
rotated the packet marked as the last out of the transmit buffer, carrying
the information out of the locked section in that function.

We can then check the return value instead of examining RXRPC_CALL_TX_LAST.

Fixes: 70790dbe3f66 ("rxrpc: Pass the last Tx packet marker in the annotation buffer")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 2dcef482acf2..8834aa627371 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -216,10 +216,11 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb,
 /*
  * Apply a hard ACK by advancing the Tx window.
  */
-static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
+static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 				   struct rxrpc_ack_summary *summary)
 {
 	struct sk_buff *skb, *list = NULL;
+	bool rot_last = false;
 	int ix;
 	u8 annotation;
 
@@ -243,15 +244,17 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 		skb->next = list;
 		list = skb;
 
-		if (annotation & RXRPC_TX_ANNO_LAST)
+		if (annotation & RXRPC_TX_ANNO_LAST) {
 			set_bit(RXRPC_CALL_TX_LAST, &call->flags);
+			rot_last = true;
+		}
 		if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK)
 			summary->nr_rot_new_acks++;
 	}
 
 	spin_unlock(&call->lock);
 
-	trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ?
+	trace_rxrpc_transmit(call, (rot_last ?
 				    rxrpc_transmit_rotate_last :
 				    rxrpc_transmit_rotate));
 	wake_up(&call->waitq);
@@ -262,6 +265,8 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 		skb->next = NULL;
 		rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
 	}
+
+	return rot_last;
 }
 
 /*
@@ -332,11 +337,11 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call)
 		trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now);
 	}
 
-	if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
-		rxrpc_rotate_tx_window(call, top, &summary);
 	if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
-		rxrpc_proto_abort("TXL", call, top);
-		return false;
+		if (!rxrpc_rotate_tx_window(call, top, &summary)) {
+			rxrpc_proto_abort("TXL", call, top);
+			return false;
+		}
 	}
 	if (!rxrpc_end_tx_phase(call, true, "ETD"))
 		return false;
@@ -897,8 +902,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 	if (nr_acks > call->tx_top - hard_ack)
 		return rxrpc_proto_abort("AKN", call, 0);
 
-	if (after(hard_ack, call->tx_hard_ack))
-		rxrpc_rotate_tx_window(call, hard_ack, &summary);
+	if (after(hard_ack, call->tx_hard_ack)) {
+		if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
+			rxrpc_end_tx_phase(call, false, "ETA");
+			return;
+		}
+	}
 
 	if (nr_acks > 0) {
 		if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0)
@@ -907,11 +916,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 				      &summary);
 	}
 
-	if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
-		rxrpc_end_tx_phase(call, false, "ETA");
-		return;
-	}
-
 	if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
 	    RXRPC_TX_ANNO_LAST &&
 	    summary.nr_acks == call->tx_top - hard_ack &&
@@ -933,8 +937,7 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
 
 	_proto("Rx ACKALL %%%u", sp->hdr.serial);
 
-	rxrpc_rotate_tx_window(call, call->tx_top, &summary);
-	if (test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+	if (rxrpc_rotate_tx_window(call, call->tx_top, &summary))
 		rxrpc_end_tx_phase(call, false, "ETL");
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From dfe995224693798e554ab4770f6d8a096afc60cd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 8 Oct 2018 15:46:05 +0100
Subject: rxrpc: Carry call state out of locked section in
 rxrpc_rotate_tx_window()

Carry the call state out of the locked section in rxrpc_rotate_tx_window()
rather than sampling it afterwards.  This is only used to select tracepoint
data, but could have changed by the time we do the tracepoint.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 8834aa627371..af8ce64f4162 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -278,23 +278,26 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
 			       const char *abort_why)
 {
+	unsigned int state;
 
 	ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
 
 	write_lock(&call->state_lock);
 
-	switch (call->state) {
+	state = call->state;
+	switch (state) {
 	case RXRPC_CALL_CLIENT_SEND_REQUEST:
 	case RXRPC_CALL_CLIENT_AWAIT_REPLY:
 		if (reply_begun)
-			call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+			call->state = state = RXRPC_CALL_CLIENT_RECV_REPLY;
 		else
-			call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+			call->state = state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
 		break;
 
 	case RXRPC_CALL_SERVER_AWAIT_ACK:
 		__rxrpc_call_completed(call);
 		rxrpc_notify_socket(call);
+		state = call->state;
 		break;
 
 	default:
@@ -302,11 +305,10 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
 	}
 
 	write_unlock(&call->state_lock);
-	if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) {
+	if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
 		trace_rxrpc_transmit(call, rxrpc_transmit_await_reply);
-	} else {
+	else
 		trace_rxrpc_transmit(call, rxrpc_transmit_end);
-	}
 	_leave(" = ok");
 	return true;
 
-- 
cgit v1.2.3-59-g8ed1b


From 298bc15b2079c324e82d0a6fda39c3d762af7282 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 8 Oct 2018 15:46:11 +0100
Subject: rxrpc: Only take the rwind and mtu values from latest ACK

Move the out-of-order and duplicate ACK packet check to before the call to
rxrpc_input_ackinfo() so that the receive window size and MTU size are only
checked in the latest ACK packet and don't regress.

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index af8ce64f4162..04213a65c1ac 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -868,6 +868,16 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 				  rxrpc_propose_ack_respond_to_ack);
 	}
 
+	/* Discard any out-of-order or duplicate ACKs. */
+	if (before_eq(sp->hdr.serial, call->acks_latest)) {
+		_debug("discard ACK %d <= %d",
+		       sp->hdr.serial, call->acks_latest);
+		return;
+	}
+	call->acks_latest_ts = skb->tstamp;
+	call->acks_latest = sp->hdr.serial;
+
+	/* Parse rwind and mtu sizes if provided. */
 	ioffset = offset + nr_acks + 3;
 	if (skb->len >= ioffset + sizeof(buf.info)) {
 		if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
@@ -889,15 +899,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 		return;
 	}
 
-	/* Discard any out-of-order or duplicate ACKs. */
-	if (before_eq(sp->hdr.serial, call->acks_latest)) {
-		_debug("discard ACK %d <= %d",
-		       sp->hdr.serial, call->acks_latest);
-		return;
-	}
-	call->acks_latest_ts = skb->tstamp;
-	call->acks_latest = sp->hdr.serial;
-
 	if (before(hard_ack, call->tx_hard_ack) ||
 	    after(hard_ack, call->tx_top))
 		return rxrpc_proto_abort("AKW", call, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 647530924f47c93db472ee3cf43b7ef1425581b6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 8 Oct 2018 15:46:17 +0100
Subject: rxrpc: Fix connection-level abort handling

Fix connection-level abort handling to cache the abort and error codes
properly so that a new incoming call can be properly aborted if it races
with the parent connection being aborted by another CPU.

The abort_code and error parameters can then be dropped from
rxrpc_abort_calls().

Fixes: f5c17aaeb2ae ("rxrpc: Calls should only have one terminal state")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h |  4 ++--
 net/rxrpc/call_accept.c |  4 ++--
 net/rxrpc/conn_event.c  | 26 +++++++++++++++-----------
 3 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ab60c0313fd4..45307463b7dd 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -442,8 +442,7 @@ struct rxrpc_connection {
 	spinlock_t		state_lock;	/* state-change lock */
 	enum rxrpc_conn_cache_state cache_state;
 	enum rxrpc_conn_proto_state state;	/* current state of connection */
-	u32			local_abort;	/* local abort code */
-	u32			remote_abort;	/* remote abort code */
+	u32			abort_code;	/* Abort code of connection abort */
 	int			debug_id;	/* debug ID for printks */
 	atomic_t		serial;		/* packet serial number counter */
 	unsigned int		hi_serial;	/* highest serial number received */
@@ -453,6 +452,7 @@ struct rxrpc_connection {
 	u8			security_size;	/* security header size */
 	u8			security_ix;	/* security type */
 	u8			out_clientflag;	/* RXRPC_CLIENT_INITIATED if we are client */
+	short			error;		/* Local error code */
 };
 
 static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp)
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index f55f67894465..1c4ebc0cb25b 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -405,11 +405,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 
 	case RXRPC_CONN_REMOTELY_ABORTED:
 		rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
-					  conn->remote_abort, -ECONNABORTED);
+					  conn->abort_code, conn->error);
 		break;
 	case RXRPC_CONN_LOCALLY_ABORTED:
 		rxrpc_abort_call("CON", call, sp->hdr.seq,
-				 conn->local_abort, -ECONNABORTED);
+				 conn->abort_code, conn->error);
 		break;
 	default:
 		BUG();
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 6df56ce68861..b6fca8ebb117 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -126,7 +126,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 
 	switch (chan->last_type) {
 	case RXRPC_PACKET_TYPE_ABORT:
-		_proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
+		_proto("Tx ABORT %%%u { %d } [re]", serial, conn->abort_code);
 		break;
 	case RXRPC_PACKET_TYPE_ACK:
 		trace_rxrpc_tx_ack(chan->call_debug_id, serial,
@@ -153,13 +153,12 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
  * pass a connection-level abort onto all calls on that connection
  */
 static void rxrpc_abort_calls(struct rxrpc_connection *conn,
-			      enum rxrpc_call_completion compl,
-			      u32 abort_code, int error)
+			      enum rxrpc_call_completion compl)
 {
 	struct rxrpc_call *call;
 	int i;
 
-	_enter("{%d},%x", conn->debug_id, abort_code);
+	_enter("{%d},%x", conn->debug_id, conn->abort_code);
 
 	spin_lock(&conn->channel_lock);
 
@@ -172,9 +171,11 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
 				trace_rxrpc_abort(call->debug_id,
 						  "CON", call->cid,
 						  call->call_id, 0,
-						  abort_code, error);
+						  conn->abort_code,
+						  conn->error);
 			if (rxrpc_set_call_completion(call, compl,
-						      abort_code, error))
+						      conn->abort_code,
+						      conn->error))
 				rxrpc_notify_socket(call);
 		}
 	}
@@ -207,10 +208,12 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 		return 0;
 	}
 
+	conn->error = error;
+	conn->abort_code = abort_code;
 	conn->state = RXRPC_CONN_LOCALLY_ABORTED;
 	spin_unlock_bh(&conn->state_lock);
 
-	rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error);
+	rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED);
 
 	msg.msg_name	= &conn->params.peer->srx.transport;
 	msg.msg_namelen	= conn->params.peer->srx.transport_len;
@@ -229,7 +232,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 	whdr._rsvd	= 0;
 	whdr.serviceId	= htons(conn->service_id);
 
-	word		= htonl(conn->local_abort);
+	word		= htonl(conn->abort_code);
 
 	iov[0].iov_base	= &whdr;
 	iov[0].iov_len	= sizeof(whdr);
@@ -240,7 +243,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 
 	serial = atomic_inc_return(&conn->serial);
 	whdr.serial = htonl(serial);
-	_proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort);
+	_proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code);
 
 	ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
 	if (ret < 0) {
@@ -315,9 +318,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
 		abort_code = ntohl(wtmp);
 		_proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
 
+		conn->error = -ECONNABORTED;
+		conn->abort_code = abort_code;
 		conn->state = RXRPC_CONN_REMOTELY_ABORTED;
-		rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
-				  abort_code, -ECONNABORTED);
+		rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED);
 		return -ECONNABORTED;
 
 	case RXRPC_PACKET_TYPE_CHALLENGE:
-- 
cgit v1.2.3-59-g8ed1b


From c1e15b4944c9fa7fbbb74f7a5920a1e31b4b965a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 8 Oct 2018 15:46:25 +0100
Subject: rxrpc: Fix the packet reception routine

The rxrpc_input_packet() function and its call tree was built around the
assumption that data_ready() handler called from UDP to inform a kernel
service that there is data to be had was non-reentrant.  This means that
certain locking could be dispensed with.

This, however, turns out not to be the case with a multi-queue network card
that can deliver packets to multiple cpus simultaneously.  Each of those
cpus can be in the rxrpc_input_packet() function at the same time.

Fix by adding or changing some structure members:

 (1) Add peer->rtt_input_lock to serialise access to the RTT buffer.

 (2) Make conn->service_id into a 32-bit variable so that it can be
     cmpxchg'd on all arches.

 (3) Add call->input_lock to serialise access to the Rx/Tx state.  Note
     that although the Rx and Tx states are (almost) entirely separate,
     there's no point completing the separation and having separate locks
     since it's a bi-phasal RPC protocol rather than a bi-direction
     streaming protocol.  Data transmission and data reception do not take
     place simultaneously on any particular call.

and making the following functional changes:

 (1) In rxrpc_input_data(), hold call->input_lock around the core to
     prevent simultaneous producing of packets into the Rx ring and
     updating of tracking state for a particular call.

 (2) In rxrpc_input_ping_response(), only read call->ping_serial once, and
     check it before checking RXRPC_CALL_PINGING as that's a cheaper test.
     The bit test and bit clear can then be combined.  No further locking
     is needed here.

 (3) In rxrpc_input_ack(), take call->input_lock after we've parsed much of
     the ACK packet.  The superseded ACK check is then done both before and
     after the lock is taken.

     The handing of ackinfo data is split, parsing before the lock is taken
     and processing with it held.  This is keyed on rxMTU being non-zero.

     Congestion management is also done within the locked section.

 (4) In rxrpc_input_ackall(), take call->input_lock around the Tx window
     rotation.  The ACKALL packet carries no information and is only really
     useful after all packets have been transmitted since it's imprecise.

 (5) In rxrpc_input_implicit_end_call(), we use rx->incoming_lock to
     prevent calls being simultaneously implicitly ended on two cpus and
     also to prevent any races with incoming call setup.

 (6) In rxrpc_input_packet(), use cmpxchg() to effect the service upgrade
     on a connection.  It is only permitted to happen once for a
     connection.

 (7) In rxrpc_new_incoming_call(), we have to recheck the routing inside
     rx->incoming_lock to see if someone else set up the call, connection
     or peer whilst we were getting there.  We can't trust the values from
     the earlier routing check unless we pin refs on them - which we want
     to avoid.

     Further, we need to allow for an incoming call to have its state
     changed on another CPU between us making it live and us adjusting it
     because the conn is now in the RXRPC_CONN_SERVICE state.

 (8) In rxrpc_peer_add_rtt(), take peer->rtt_input_lock around the access
     to the RTT buffer.  Don't need to lock around setting peer->rtt.

For reference, the inventory of state-accessing or state-altering functions
used by the packet input procedure is:

> rxrpc_input_packet()
  * PACKET CHECKING

  * ROUTING
    > rxrpc_post_packet_to_local()
    > rxrpc_find_connection_rcu() - uses RCU
      > rxrpc_lookup_peer_rcu() - uses RCU
      > rxrpc_find_service_conn_rcu() - uses RCU
      > idr_find() - uses RCU

  * CONNECTION-LEVEL PROCESSING
    - Service upgrade
      - Can only happen once per conn
      ! Changed to use cmpxchg
    > rxrpc_post_packet_to_conn()
    - Setting conn->hi_serial
      - Probably safe not using locks
      - Maybe use cmpxchg

  * CALL-LEVEL PROCESSING
    > Old-call checking
      > rxrpc_input_implicit_end_call()
        > rxrpc_call_completed()
	> rxrpc_queue_call()
	! Need to take rx->incoming_lock
	> __rxrpc_disconnect_call()
	> rxrpc_notify_socket()
    > rxrpc_new_incoming_call()
      - Uses rx->incoming_lock for the entire process
        - Might be able to drop this earlier in favour of the call lock
      > rxrpc_incoming_call()
      	! Conflicts with rxrpc_input_implicit_end_call()
    > rxrpc_send_ping()
      - Don't need locks to check rtt state
      > rxrpc_propose_ACK

  * PACKET DISTRIBUTION
    > rxrpc_input_call_packet()
      > rxrpc_input_data()
	* QUEUE DATA PACKET ON CALL
	> rxrpc_reduce_call_timer()
	  - Uses timer_reduce()
	! Needs call->input_lock()
	> rxrpc_receiving_reply()
	  ! Needs locking around ack state
	  > rxrpc_rotate_tx_window()
	  > rxrpc_end_tx_phase()
	> rxrpc_proto_abort()
	> rxrpc_input_dup_data()
	- Fills the Rx buffer
	- rxrpc_propose_ACK()
	- rxrpc_notify_socket()

      > rxrpc_input_ack()
	* APPLY ACK PACKET TO CALL AND DISCARD PACKET
	> rxrpc_input_ping_response()
	  - Probably doesn't need any extra locking
	  ! Need READ_ONCE() on call->ping_serial
	  > rxrpc_input_check_for_lost_ack()
	    - Takes call->lock to consult Tx buffer
	  > rxrpc_peer_add_rtt()
	    ! Needs to take a lock (peer->rtt_input_lock)
	    ! Could perhaps manage with cmpxchg() and xadd() instead
	> rxrpc_input_requested_ack
	  - Consults Tx buffer
	    ! Probably needs a lock
	  > rxrpc_peer_add_rtt()
	> rxrpc_propose_ack()
	> rxrpc_input_ackinfo()
	  - Changes call->tx_winsize
	    ! Use cmpxchg to handle change
	    ! Should perhaps track serial number
	  - Uses peer->lock to record MTU specification changes
	> rxrpc_proto_abort()
	! Need to take call->input_lock
	> rxrpc_rotate_tx_window()
	> rxrpc_end_tx_phase()
	> rxrpc_input_soft_acks()
	- Consults the Tx buffer
	> rxrpc_congestion_management()
	  - Modifies the Tx annotations
	  ! Needs call->input_lock()
	  > rxrpc_queue_call()

      > rxrpc_input_abort()
	* APPLY ABORT PACKET TO CALL AND DISCARD PACKET
	> rxrpc_set_call_completion()
	> rxrpc_notify_socket()

      > rxrpc_input_ackall()
	* APPLY ACKALL PACKET TO CALL AND DISCARD PACKET
	! Need to take call->input_lock
	> rxrpc_rotate_tx_window()
	> rxrpc_end_tx_phase()

    > rxrpc_reject_packet()

There are some functions used by the above that queue the packet, after
which the procedure is terminated:

 - rxrpc_post_packet_to_local()
   - local->event_queue is an sk_buff_head
   - local->processor is a work_struct
 - rxrpc_post_packet_to_conn()
   - conn->rx_queue is an sk_buff_head
   - conn->processor is a work_struct
 - rxrpc_reject_packet()
   - local->reject_queue is an sk_buff_head
   - local->processor is a work_struct

And some that offload processing to process context:

 - rxrpc_notify_socket()
   - Uses RCU lock
   - Uses call->notify_lock to call call->notify_rx
   - Uses call->recvmsg_lock to queue recvmsg side
 - rxrpc_queue_call()
   - call->processor is a work_struct
 - rxrpc_propose_ACK()
   - Uses call->lock to wrap __rxrpc_propose_ACK()

And a bunch that complete a call, all of which use call->state_lock to
protect the call state:

 - rxrpc_call_completed()
 - rxrpc_set_call_completion()
 - rxrpc_abort_call()
 - rxrpc_proto_abort()
   - Also uses rxrpc_queue_call()

Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h |   7 +--
 net/rxrpc/call_accept.c |  21 ++++++---
 net/rxrpc/call_object.c |   1 +
 net/rxrpc/input.c       | 120 +++++++++++++++++++++++++++++++-----------------
 net/rxrpc/peer_event.c  |   5 ++
 net/rxrpc/peer_object.c |   1 +
 6 files changed, 105 insertions(+), 50 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 45307463b7dd..a6e6cae82c30 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -302,6 +302,7 @@ struct rxrpc_peer {
 
 	/* calculated RTT cache */
 #define RXRPC_RTT_CACHE_SIZE 32
+	spinlock_t		rtt_input_lock;	/* RTT lock for input routine */
 	ktime_t			rtt_last_req;	/* Time of last RTT request */
 	u64			rtt;		/* Current RTT estimate (in nS) */
 	u64			rtt_sum;	/* Sum of cache contents */
@@ -447,7 +448,7 @@ struct rxrpc_connection {
 	atomic_t		serial;		/* packet serial number counter */
 	unsigned int		hi_serial;	/* highest serial number received */
 	u32			security_nonce;	/* response re-use preventer */
-	u16			service_id;	/* Service ID, possibly upgraded */
+	u32			service_id;	/* Service ID, possibly upgraded */
 	u8			size_align;	/* data size alignment (for security) */
 	u8			security_size;	/* security header size */
 	u8			security_ix;	/* security type */
@@ -635,6 +636,8 @@ struct rxrpc_call {
 	bool			tx_phase;	/* T if transmission phase, F if receive phase */
 	u8			nr_jumbo_bad;	/* Number of jumbo dups/exceeds-windows */
 
+	spinlock_t		input_lock;	/* Lock for packet input to this call */
+
 	/* receive-phase ACK management */
 	u8			ackr_reason;	/* reason to ACK */
 	u16			ackr_skew;	/* skew on packet being ACK'd */
@@ -720,8 +723,6 @@ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t);
 void rxrpc_discard_prealloc(struct rxrpc_sock *);
 struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *,
 					   struct rxrpc_sock *,
-					   struct rxrpc_peer *,
-					   struct rxrpc_connection *,
 					   struct sk_buff *);
 void rxrpc_accept_incoming_calls(struct rxrpc_local *);
 struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long,
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 1c4ebc0cb25b..652e314de38e 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -333,11 +333,11 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
  */
 struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 					   struct rxrpc_sock *rx,
-					   struct rxrpc_peer *peer,
-					   struct rxrpc_connection *conn,
 					   struct sk_buff *skb)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxrpc_connection *conn;
+	struct rxrpc_peer *peer;
 	struct rxrpc_call *call;
 
 	_enter("");
@@ -354,6 +354,13 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 		goto out;
 	}
 
+	/* The peer, connection and call may all have sprung into existence due
+	 * to a duplicate packet being handled on another CPU in parallel, so
+	 * we have to recheck the routing.  However, we're now holding
+	 * rx->incoming_lock, so the values should remain stable.
+	 */
+	conn = rxrpc_find_connection_rcu(local, skb, &peer);
+
 	call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb);
 	if (!call) {
 		skb->mark = RXRPC_SKB_MARK_REJECT_BUSY;
@@ -396,10 +403,12 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 
 	case RXRPC_CONN_SERVICE:
 		write_lock(&call->state_lock);
-		if (rx->discard_new_call)
-			call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
-		else
-			call->state = RXRPC_CALL_SERVER_ACCEPTING;
+		if (call->state < RXRPC_CALL_COMPLETE) {
+			if (rx->discard_new_call)
+				call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+			else
+				call->state = RXRPC_CALL_SERVER_ACCEPTING;
+		}
 		write_unlock(&call->state_lock);
 		break;
 
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 0ca2c2dfd196..8f1a8f85b1f9 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -138,6 +138,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
 	init_waitqueue_head(&call->waitq);
 	spin_lock_init(&call->lock);
 	spin_lock_init(&call->notify_lock);
+	spin_lock_init(&call->input_lock);
 	rwlock_init(&call->state_lock);
 	atomic_set(&call->usage, 1);
 	call->debug_id = debug_id;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 04213a65c1ac..570b49d2da42 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -459,13 +459,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
 		}
 	}
 
+	spin_lock(&call->input_lock);
+
 	/* Received data implicitly ACKs all of the request packets we sent
 	 * when we're acting as a client.
 	 */
 	if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
 	     state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
 	    !rxrpc_receiving_reply(call))
-		return;
+		goto unlock;
 
 	call->ackr_prev_seq = seq;
 
@@ -495,12 +497,16 @@ next_subpacket:
 
 	if (flags & RXRPC_LAST_PACKET) {
 		if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
-		    seq != call->rx_top)
-			return rxrpc_proto_abort("LSN", call, seq);
+		    seq != call->rx_top) {
+			rxrpc_proto_abort("LSN", call, seq);
+			goto unlock;
+		}
 	} else {
 		if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
-		    after_eq(seq, call->rx_top))
-			return rxrpc_proto_abort("LSA", call, seq);
+		    after_eq(seq, call->rx_top)) {
+			rxrpc_proto_abort("LSA", call, seq);
+			goto unlock;
+		}
 	}
 
 	trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation);
@@ -567,8 +573,10 @@ next_subpacket:
 skip:
 	offset += len;
 	if (flags & RXRPC_JUMBO_PACKET) {
-		if (skb_copy_bits(skb, offset, &flags, 1) < 0)
-			return rxrpc_proto_abort("XJF", call, seq);
+		if (skb_copy_bits(skb, offset, &flags, 1) < 0) {
+			rxrpc_proto_abort("XJF", call, seq);
+			goto unlock;
+		}
 		offset += sizeof(struct rxrpc_jumbo_header);
 		seq++;
 		serial++;
@@ -608,6 +616,9 @@ ack:
 		trace_rxrpc_notify_socket(call->debug_id, serial);
 		rxrpc_notify_socket(call);
 	}
+
+unlock:
+	spin_unlock(&call->input_lock);
 	_leave(" [queued]");
 }
 
@@ -694,15 +705,14 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call,
 
 	ping_time = call->ping_time;
 	smp_rmb();
-	ping_serial = call->ping_serial;
+	ping_serial = READ_ONCE(call->ping_serial);
 
 	if (orig_serial == call->acks_lost_ping)
 		rxrpc_input_check_for_lost_ack(call);
 
-	if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
-	    before(orig_serial, ping_serial))
+	if (before(orig_serial, ping_serial) ||
+	    !test_and_clear_bit(RXRPC_CALL_PINGING, &call->flags))
 		return;
-	clear_bit(RXRPC_CALL_PINGING, &call->flags);
 	if (after(orig_serial, ping_serial))
 		return;
 
@@ -869,24 +879,31 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 	}
 
 	/* Discard any out-of-order or duplicate ACKs. */
-	if (before_eq(sp->hdr.serial, call->acks_latest)) {
-		_debug("discard ACK %d <= %d",
-		       sp->hdr.serial, call->acks_latest);
+	if (before_eq(sp->hdr.serial, call->acks_latest))
 		return;
-	}
+
+	buf.info.rxMTU = 0;
+	ioffset = offset + nr_acks + 3;
+	if (skb->len >= ioffset + sizeof(buf.info) &&
+	    skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
+		return rxrpc_proto_abort("XAI", call, 0);
+
+	spin_lock(&call->input_lock);
+
+	/* Discard any out-of-order or duplicate ACKs. */
+	if (before_eq(sp->hdr.serial, call->acks_latest))
+		goto out;
 	call->acks_latest_ts = skb->tstamp;
 	call->acks_latest = sp->hdr.serial;
 
 	/* Parse rwind and mtu sizes if provided. */
-	ioffset = offset + nr_acks + 3;
-	if (skb->len >= ioffset + sizeof(buf.info)) {
-		if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
-			return rxrpc_proto_abort("XAI", call, 0);
+	if (buf.info.rxMTU)
 		rxrpc_input_ackinfo(call, skb, &buf.info);
-	}
 
-	if (first_soft_ack == 0)
-		return rxrpc_proto_abort("AK0", call, 0);
+	if (first_soft_ack == 0) {
+		rxrpc_proto_abort("AK0", call, 0);
+		goto out;
+	}
 
 	/* Ignore ACKs unless we are or have just been transmitting. */
 	switch (READ_ONCE(call->state)) {
@@ -896,25 +913,31 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 	case RXRPC_CALL_SERVER_AWAIT_ACK:
 		break;
 	default:
-		return;
+		goto out;
 	}
 
 	if (before(hard_ack, call->tx_hard_ack) ||
-	    after(hard_ack, call->tx_top))
-		return rxrpc_proto_abort("AKW", call, 0);
-	if (nr_acks > call->tx_top - hard_ack)
-		return rxrpc_proto_abort("AKN", call, 0);
+	    after(hard_ack, call->tx_top)) {
+		rxrpc_proto_abort("AKW", call, 0);
+		goto out;
+	}
+	if (nr_acks > call->tx_top - hard_ack) {
+		rxrpc_proto_abort("AKN", call, 0);
+		goto out;
+	}
 
 	if (after(hard_ack, call->tx_hard_ack)) {
 		if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
 			rxrpc_end_tx_phase(call, false, "ETA");
-			return;
+			goto out;
 		}
 	}
 
 	if (nr_acks > 0) {
-		if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0)
-			return rxrpc_proto_abort("XSA", call, 0);
+		if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) {
+			rxrpc_proto_abort("XSA", call, 0);
+			goto out;
+		}
 		rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks,
 				      &summary);
 	}
@@ -927,7 +950,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
 				  false, true,
 				  rxrpc_propose_ack_ping_for_lost_reply);
 
-	return rxrpc_congestion_management(call, skb, &summary, acked_serial);
+	rxrpc_congestion_management(call, skb, &summary, acked_serial);
+out:
+	spin_unlock(&call->input_lock);
 }
 
 /*
@@ -940,8 +965,12 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
 
 	_proto("Rx ACKALL %%%u", sp->hdr.serial);
 
+	spin_lock(&call->input_lock);
+
 	if (rxrpc_rotate_tx_window(call, call->tx_top, &summary))
 		rxrpc_end_tx_phase(call, false, "ETL");
+
+	spin_unlock(&call->input_lock);
 }
 
 /*
@@ -1024,18 +1053,19 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
 }
 
 /*
- * Handle a new call on a channel implicitly completing the preceding call on
- * that channel.
+ * Handle a new service call on a channel implicitly completing the preceding
+ * call on that channel.  This does not apply to client conns.
  *
  * TODO: If callNumber > call_id + 1, renegotiate security.
  */
-static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
+static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx,
+					  struct rxrpc_connection *conn,
 					  struct rxrpc_call *call)
 {
 	switch (READ_ONCE(call->state)) {
 	case RXRPC_CALL_SERVER_AWAIT_ACK:
 		rxrpc_call_completed(call);
-		break;
+		/* Fall through */
 	case RXRPC_CALL_COMPLETE:
 		break;
 	default:
@@ -1043,11 +1073,13 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
 			set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 			rxrpc_queue_call(call);
 		}
+		trace_rxrpc_improper_term(call);
 		break;
 	}
 
-	trace_rxrpc_improper_term(call);
+	spin_lock(&rx->incoming_lock);
 	__rxrpc_disconnect_call(conn, call);
+	spin_unlock(&rx->incoming_lock);
 	rxrpc_notify_socket(call);
 }
 
@@ -1244,10 +1276,16 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 			goto wrong_security;
 
 		if (sp->hdr.serviceId != conn->service_id) {
-			if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) ||
-			    conn->service_id != conn->params.service_id)
+			int old_id;
+
+			if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags))
+				goto reupgrade;
+			old_id = cmpxchg(&conn->service_id, conn->params.service_id,
+					 sp->hdr.serviceId);
+
+			if (old_id != conn->params.service_id &&
+			    old_id != sp->hdr.serviceId)
 				goto reupgrade;
-			conn->service_id = sp->hdr.serviceId;
 		}
 
 		if (sp->hdr.callNumber == 0) {
@@ -1305,7 +1343,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 			if (rxrpc_to_client(sp))
 				goto reject_packet;
 			if (call)
-				rxrpc_input_implicit_end_call(conn, call);
+				rxrpc_input_implicit_end_call(rx, conn, call);
 			call = NULL;
 		}
 
@@ -1325,7 +1363,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
 			goto bad_message;
 		if (sp->hdr.seq != 1)
 			goto discard;
-		call = rxrpc_new_incoming_call(local, rx, peer, conn, skb);
+		call = rxrpc_new_incoming_call(local, rx, skb);
 		if (!call)
 			goto reject_packet;
 		rxrpc_send_ping(call, skb, skew);
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index f3e6fc670da2..05b51bdbdd41 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -301,6 +301,8 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
 	if (rtt < 0)
 		return;
 
+	spin_lock(&peer->rtt_input_lock);
+
 	/* Replace the oldest datum in the RTT buffer */
 	sum -= peer->rtt_cache[cursor];
 	sum += rtt;
@@ -312,6 +314,8 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
 		peer->rtt_usage = usage;
 	}
 
+	spin_unlock(&peer->rtt_input_lock);
+
 	/* Now recalculate the average */
 	if (usage == RXRPC_RTT_CACHE_SIZE) {
 		avg = sum / RXRPC_RTT_CACHE_SIZE;
@@ -320,6 +324,7 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
 		do_div(avg, usage);
 	}
 
+	/* Don't need to update this under lock */
 	peer->rtt = avg;
 	trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
 			   usage, avg);
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 2d39eaf19620..5691b7d266ca 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -225,6 +225,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
 		peer->service_conns = RB_ROOT;
 		seqlock_init(&peer->service_conn_lock);
 		spin_lock_init(&peer->lock);
+		spin_lock_init(&peer->rtt_input_lock);
 		peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
 
 		if (RXRPC_TX_SMSS > 2190)
-- 
cgit v1.2.3-59-g8ed1b


From f355cfcdb251e22b9dfb78c0eef4005a9d902a35 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 10 Oct 2018 16:09:25 +0300
Subject: devlink: Fix param set handling for string type

In case devlink param type is string, it needs to copy the string value
it got from the input to devlink_param_value.

Fixes: e3b7ca18ad7b ("devlink: Add param set command")
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  2 +-
 net/core/devlink.c    | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b9b89d6604d4..b0e17c025fdc 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -311,7 +311,7 @@ union devlink_param_value {
 	u8 vu8;
 	u16 vu16;
 	u32 vu32;
-	const char *vstr;
+	char vstr[DEVLINK_PARAM_MAX_STRING_VALUE];
 	bool vbool;
 };
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8c0ed225e280..d808af7a5c52 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2995,6 +2995,8 @@ devlink_param_value_get_from_info(const struct devlink_param *param,
 				  struct genl_info *info,
 				  union devlink_param_value *value)
 {
+	int len;
+
 	if (param->type != DEVLINK_PARAM_TYPE_BOOL &&
 	    !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])
 		return -EINVAL;
@@ -3010,10 +3012,13 @@ devlink_param_value_get_from_info(const struct devlink_param *param,
 		value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
 		break;
 	case DEVLINK_PARAM_TYPE_STRING:
-		if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) >
-		    DEVLINK_PARAM_MAX_STRING_VALUE)
+		len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]),
+			      nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
+		if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) ||
+		    len >= DEVLINK_PARAM_MAX_STRING_VALUE)
 			return -EINVAL;
-		value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+		strcpy(value->vstr,
+		       nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
 		break;
 	case DEVLINK_PARAM_TYPE_BOOL:
 		value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ?
-- 
cgit v1.2.3-59-g8ed1b


From 1276534c988ba752fa01bf090412a877ee783829 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 10 Oct 2018 16:09:26 +0300
Subject: devlink: Fix param cmode driverinit for string type

Driverinit configuration mode value is held by devlink to enable the
driver fetch the value after reload command. In case the param type is
string devlink should copy the value from driver string buffer to
devlink string buffer on devlink_param_driverinit_value_set() and
vice-versa on devlink_param_driverinit_value_get().

Fixes: ec01aeb1803e ("devlink: Add support for get/set driverinit value")
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index d808af7a5c52..1a0de1677197 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3105,7 +3105,10 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
 		return -EOPNOTSUPP;
 
 	if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
-		param_item->driverinit_value = value;
+		if (param->type == DEVLINK_PARAM_TYPE_STRING)
+			strcpy(param_item->driverinit_value.vstr, value.vstr);
+		else
+			param_item->driverinit_value = value;
 		param_item->driverinit_value_valid = true;
 	} else {
 		if (!param->set)
@@ -4545,7 +4548,10 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
 					      DEVLINK_PARAM_CMODE_DRIVERINIT))
 		return -EOPNOTSUPP;
 
-	*init_val = param_item->driverinit_value;
+	if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+		strcpy(init_val->vstr, param_item->driverinit_value.vstr);
+	else
+		*init_val = param_item->driverinit_value;
 
 	return 0;
 }
@@ -4576,7 +4582,10 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 					      DEVLINK_PARAM_CMODE_DRIVERINIT))
 		return -EOPNOTSUPP;
 
-	param_item->driverinit_value = init_val;
+	if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+		strcpy(param_item->driverinit_value.vstr, init_val.vstr);
+	else
+		param_item->driverinit_value = init_val;
 	param_item->driverinit_value_valid = true;
 
 	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
-- 
cgit v1.2.3-59-g8ed1b


From bde74ad10eb55aaa472c37b107934e6b8563c25e Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Wed, 10 Oct 2018 16:09:27 +0300
Subject: devlink: Add helper function for safely copy string param

Devlink string param buffer is allocated at the size of
DEVLINK_PARAM_MAX_STRING_VALUE. Add helper function which makes sure
this size is not exceeded.
Renamed DEVLINK_PARAM_MAX_STRING_VALUE to
__DEVLINK_PARAM_MAX_STRING_VALUE to emphasize that it should be used by
devlink only. The driver should use the helper function instead to
verify it doesn't exceed the allowed length.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 12 ++++++++++--
 net/core/devlink.c    | 19 ++++++++++++++++++-
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b0e17c025fdc..99efc156a309 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -298,7 +298,7 @@ struct devlink_resource {
 
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
 
-#define DEVLINK_PARAM_MAX_STRING_VALUE 32
+#define __DEVLINK_PARAM_MAX_STRING_VALUE 32
 enum devlink_param_type {
 	DEVLINK_PARAM_TYPE_U8,
 	DEVLINK_PARAM_TYPE_U16,
@@ -311,7 +311,7 @@ union devlink_param_value {
 	u8 vu8;
 	u16 vu16;
 	u32 vu32;
-	char vstr[DEVLINK_PARAM_MAX_STRING_VALUE];
+	char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE];
 	bool vbool;
 };
 
@@ -553,6 +553,8 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
 int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
 				       union devlink_param_value init_val);
 void devlink_param_value_changed(struct devlink *devlink, u32 param_id);
+void devlink_param_value_str_fill(union devlink_param_value *dst_val,
+				  const char *src);
 struct devlink_region *devlink_region_create(struct devlink *devlink,
 					     const char *region_name,
 					     u32 region_max_snapshots,
@@ -789,6 +791,12 @@ devlink_param_value_changed(struct devlink *devlink, u32 param_id)
 {
 }
 
+static inline void
+devlink_param_value_str_fill(union devlink_param_value *dst_val,
+			     const char *src)
+{
+}
+
 static inline struct devlink_region *
 devlink_region_create(struct devlink *devlink,
 		      const char *region_name,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1a0de1677197..6bc42933be4a 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3015,7 +3015,7 @@ devlink_param_value_get_from_info(const struct devlink_param *param,
 		len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]),
 			      nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
 		if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) ||
-		    len >= DEVLINK_PARAM_MAX_STRING_VALUE)
+		    len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
 			return -EINVAL;
 		strcpy(value->vstr,
 		       nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
@@ -4617,6 +4617,23 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
 }
 EXPORT_SYMBOL_GPL(devlink_param_value_changed);
 
+/**
+ *	devlink_param_value_str_fill - Safely fill-up the string preventing
+ *				       from overflow of the preallocated buffer
+ *
+ *	@dst_val: destination devlink_param_value
+ *	@src: source buffer
+ */
+void devlink_param_value_str_fill(union devlink_param_value *dst_val,
+				  const char *src)
+{
+	size_t len;
+
+	len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE);
+	WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE);
+}
+EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
+
 /**
  *	devlink_region_create - create a new address region
  *
-- 
cgit v1.2.3-59-g8ed1b


From 52b5d6f5dcf0e5201392f7d417148ccb537dbf6f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 10 Oct 2018 06:59:35 -0700
Subject: net: make skb_partial_csum_set() more robust against overflows

syzbot managed to crash in skb_checksum_help() [1] :

        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

Root cause is the following check in skb_partial_csum_set()

	if (unlikely(start > skb_headlen(skb)) ||
	    unlikely((int)start + off > skb_headlen(skb) - 2))
		return false;

If skb_headlen(skb) is 1, then (skb_headlen(skb) - 2) becomes 0xffffffff
and the check fails to detect that ((int)start + off) is off the limit,
since the compare is unsigned.

When we fix that, then the first condition (start > skb_headlen(skb))
becomes obsolete.

Then we should also check that (skb_headroom(skb) + start) wont
overflow 16bit field.

[1]
kernel BUG at net/core/dev.c:2880!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 7330 Comm: syz-executor4 Not tainted 4.19.0-rc6+ #253
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:skb_checksum_help+0x9e3/0xbb0 net/core/dev.c:2880
Code: 85 00 ff ff ff 48 c1 e8 03 42 80 3c 28 00 0f 84 09 fb ff ff 48 8b bd 00 ff ff ff e8 97 a8 b9 fb e9 f8 fa ff ff e8 2d 09 76 fb <0f> 0b 48 8b bd 28 ff ff ff e8 1f a8 b9 fb e9 b1 f6 ff ff 48 89 cf
RSP: 0018:ffff8801d83a6f60 EFLAGS: 00010293
RAX: ffff8801b9834380 RBX: ffff8801b9f8d8c0 RCX: ffffffff8608c6d7
RDX: 0000000000000000 RSI: ffffffff8608cc63 RDI: 0000000000000006
RBP: ffff8801d83a7068 R08: ffff8801b9834380 R09: 0000000000000000
R10: ffff8801d83a76d8 R11: 0000000000000000 R12: 0000000000000001
R13: 0000000000010001 R14: 000000000000ffff R15: 00000000000000a8
FS:  00007f1a66db5700(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f7d77f091b0 CR3: 00000001ba252000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 skb_csum_hwoffload_help+0x8f/0xe0 net/core/dev.c:3269
 validate_xmit_skb+0xa2a/0xf30 net/core/dev.c:3312
 __dev_queue_xmit+0xc2f/0x3950 net/core/dev.c:3797
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3838
 packet_snd net/packet/af_packet.c:2928 [inline]
 packet_sendmsg+0x422d/0x64c0 net/packet/af_packet.c:2953

Fixes: 5ff8dda3035d ("net: Ensure partial checksum offset is inside the skb head")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2c807f67aba..428094b577fc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4452,14 +4452,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
  */
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
 {
-	if (unlikely(start > skb_headlen(skb)) ||
-	    unlikely((int)start + off > skb_headlen(skb) - 2)) {
-		net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
-				     start, off, skb_headlen(skb));
+	u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
+	u32 csum_start = skb_headroom(skb) + (u32)start;
+
+	if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
+		net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
+				     start, off, skb_headroom(skb), skb_headlen(skb));
 		return false;
 	}
 	skb->ip_summed = CHECKSUM_PARTIAL;
-	skb->csum_start = skb_headroom(skb) + start;
+	skb->csum_start = csum_start;
 	skb->csum_offset = off;
 	skb_set_transport_header(skb, start);
 	return true;
-- 
cgit v1.2.3-59-g8ed1b


From 9a4890bd6d6325a1c88564a20ab310b2d56f6094 Mon Sep 17 00:00:00 2001
From: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Date: Mon, 8 Oct 2018 09:17:11 -0700
Subject: rds: RDS (tcp) hangs on sendto() to unresponding address

In rds_send_mprds_hash(), if the calculated hash value is non-zero and
the MPRDS connections are not yet up, it will wait.  But it should not
wait if the send is non-blocking.  In this case, it should just use the
base c_path for sending the message.

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/send.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 57b3d5a8b2db..fe785ee819dd 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1007,7 +1007,8 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 	return ret;
 }
 
-static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
+static int rds_send_mprds_hash(struct rds_sock *rs,
+			       struct rds_connection *conn, int nonblock)
 {
 	int hash;
 
@@ -1023,10 +1024,16 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
 		 * used.  But if we are interrupted, we have to use the zero
 		 * c_path in case the connection ends up being non-MP capable.
 		 */
-		if (conn->c_npaths == 0)
+		if (conn->c_npaths == 0) {
+			/* Cannot wait for the connection be made, so just use
+			 * the base c_path.
+			 */
+			if (nonblock)
+				return 0;
 			if (wait_event_interruptible(conn->c_hs_waitq,
 						     conn->c_npaths != 0))
 				hash = 0;
+		}
 		if (conn->c_npaths == 1)
 			hash = 0;
 	}
@@ -1256,7 +1263,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	}
 
 	if (conn->c_trans->t_mp_capable)
-		cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
+		cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
 	else
 		cpath = &conn->c_path[0];
 
-- 
cgit v1.2.3-59-g8ed1b


From 7abab7b9b498650404800a08765f44929fee8f31 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Tue, 9 Oct 2018 07:02:01 +0300
Subject: net/ipv6: stop leaking percpu memory in fib6 info

The fib6_info_alloc() function allocates percpu memory to hold per CPU
pointers to rt6_info, but this memory is never freed. Fix it.

Fixes: a64efe142f5e ("net/ipv6: introduce fib6_info struct and helpers")
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5516f55e214b..cbe46175bb59 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -196,6 +196,8 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
 				*ppcpu_rt = NULL;
 			}
 		}
+
+		free_percpu(f6i->rt6i_pcpu);
 	}
 
 	lwtstate_put(f6i->fib6_nh.nh_lwtstate);
-- 
cgit v1.2.3-59-g8ed1b


From af7d6cce53694a88d6a1bb60c9a239a6a5144459 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 9 Oct 2018 17:48:14 +0200
Subject: net: ipv4: update fnhe_pmtu when first hop's MTU changes

Since commit 5aad1de5ea2c ("ipv4: use separate genid for next hop
exceptions"), exceptions get deprecated separately from cached
routes. In particular, administrative changes don't clear PMTU anymore.

As Stefano described in commit e9fa1495d738 ("ipv6: Reflect MTU changes
on PMTU of exceptions for MTU-less routes"), the PMTU discovered before
the local MTU change can become stale:
 - if the local MTU is now lower than the PMTU, that PMTU is now
   incorrect
 - if the local MTU was the lowest value in the path, and is increased,
   we might discover a higher PMTU

Similarly to what commit e9fa1495d738 did for IPv6, update PMTU in those
cases.

If the exception was locked, the discovered PMTU was smaller than the
minimal accepted PMTU. In that case, if the new local MTU is smaller
than the current PMTU, let PMTU discovery figure out if locking of the
exception is still needed.

To do this, we need to know the old link MTU in the NETDEV_CHANGEMTU
notifier. By the time the notifier is called, dev->mtu has been
changed. This patch adds the old MTU as additional information in the
notifier structure, and a new call_netdevice_notifiers_u32() function.

Fixes: 5aad1de5ea2c ("ipv4: use separate genid for next hop exceptions")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 +++++++
 include/net/ip_fib.h      |  1 +
 net/core/dev.c            | 28 ++++++++++++++++++++++++--
 net/ipv4/fib_frontend.c   | 12 ++++++++----
 net/ipv4/fib_semantics.c  | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 92 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c7861e4b402c..d837dad24b4c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2458,6 +2458,13 @@ struct netdev_notifier_info {
 	struct netlink_ext_ack	*extack;
 };
 
+struct netdev_notifier_info_ext {
+	struct netdev_notifier_info info; /* must be first */
+	union {
+		u32 mtu;
+	} ext;
+};
+
 struct netdev_notifier_change_info {
 	struct netdev_notifier_info info; /* must be first */
 	unsigned int flags_changed;
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 69c91d1934c1..c9b7b136939d 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -394,6 +394,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
+void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
diff --git a/net/core/dev.c b/net/core/dev.c
index 82114e1111e6..93243479085f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1752,6 +1752,28 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 }
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
+/**
+ *	call_netdevice_notifiers_mtu - call all network notifier blocks
+ *	@val: value passed unmodified to notifier function
+ *	@dev: net_device pointer passed unmodified to notifier function
+ *	@arg: additional u32 argument passed to the notifier function
+ *
+ *	Call all network notifier blocks.  Parameters and return value
+ *	are as for raw_notifier_call_chain().
+ */
+static int call_netdevice_notifiers_mtu(unsigned long val,
+					struct net_device *dev, u32 arg)
+{
+	struct netdev_notifier_info_ext info = {
+		.info.dev = dev,
+		.ext.mtu = arg,
+	};
+
+	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
+
+	return call_netdevice_notifiers_info(val, &info.info);
+}
+
 #ifdef CONFIG_NET_INGRESS
 static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 
@@ -7574,14 +7596,16 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 	err = __dev_set_mtu(dev, new_mtu);
 
 	if (!err) {
-		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+						   orig_mtu);
 		err = notifier_to_errno(err);
 		if (err) {
 			/* setting mtu back and notifying everyone again,
 			 * so that they have a chance to revert changes.
 			 */
 			__dev_set_mtu(dev, orig_mtu);
-			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+						     new_mtu);
 		}
 	}
 	return err;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2998b0e47d4b..0113993e9b2c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1243,7 +1243,8 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct netdev_notifier_changeupper_info *info;
+	struct netdev_notifier_changeupper_info *upper_info = ptr;
+	struct netdev_notifier_info_ext *info_ext = ptr;
 	struct in_device *in_dev;
 	struct net *net = dev_net(dev);
 	unsigned int flags;
@@ -1278,16 +1279,19 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 			fib_sync_up(dev, RTNH_F_LINKDOWN);
 		else
 			fib_sync_down_dev(dev, event, false);
-		/* fall through */
+		rt_cache_flush(net);
+		break;
 	case NETDEV_CHANGEMTU:
+		fib_sync_mtu(dev, info_ext->ext.mtu);
 		rt_cache_flush(net);
 		break;
 	case NETDEV_CHANGEUPPER:
-		info = ptr;
+		upper_info = ptr;
 		/* flush all routes if dev is linked to or unlinked from
 		 * an L3 master device (e.g., VRF)
 		 */
-		if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+		if (upper_info->upper_dev &&
+		    netif_is_l3_master(upper_info->upper_dev))
 			fib_disable_ip(dev, NETDEV_DOWN, true);
 		break;
 	}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f3c89ccf14c5..446204ca7406 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1470,6 +1470,56 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
 	return NOTIFY_DONE;
 }
 
+/* Update the PMTU of exceptions when:
+ * - the new MTU of the first hop becomes smaller than the PMTU
+ * - the old MTU was the same as the PMTU, and it limited discovery of
+ *   larger MTUs on the path. With that limit raised, we can now
+ *   discover larger MTUs
+ * A special case is locked exceptions, for which the PMTU is smaller
+ * than the minimal accepted PMTU:
+ * - if the new MTU is greater than the PMTU, don't make any change
+ * - otherwise, unlock and set PMTU
+ */
+static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig)
+{
+	struct fnhe_hash_bucket *bucket;
+	int i;
+
+	bucket = rcu_dereference_protected(nh->nh_exceptions, 1);
+	if (!bucket)
+		return;
+
+	for (i = 0; i < FNHE_HASH_SIZE; i++) {
+		struct fib_nh_exception *fnhe;
+
+		for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
+		     fnhe;
+		     fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
+			if (fnhe->fnhe_mtu_locked) {
+				if (new <= fnhe->fnhe_pmtu) {
+					fnhe->fnhe_pmtu = new;
+					fnhe->fnhe_mtu_locked = false;
+				}
+			} else if (new < fnhe->fnhe_pmtu ||
+				   orig == fnhe->fnhe_pmtu) {
+				fnhe->fnhe_pmtu = new;
+			}
+		}
+	}
+}
+
+void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+	struct hlist_head *head = &fib_info_devhash[hash];
+	struct fib_nh *nh;
+
+	hlist_for_each_entry(nh, head, nh_hash) {
+		if (nh->nh_dev == dev)
+			nh_update_mtu(nh, dev->mtu, orig_mtu);
+	}
+}
+
 /* Event              force Flags           Description
  * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
  * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
-- 
cgit v1.2.3-59-g8ed1b


From 28d35bcdd3925e7293408cdb8aa5f2aac5f0d6e3 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 9 Oct 2018 17:48:15 +0200
Subject: net: ipv4: don't let PMTU updates increase route MTU

When an MTU update with PMTU smaller than net.ipv4.route.min_pmtu is
received, we must clamp its value. However, we can receive a PMTU
exception with PMTU < old_mtu < ip_rt_min_pmtu, which would lead to an
increase in PMTU.

To fix this, take the smallest of the old MTU and ip_rt_min_pmtu.

Before this patch, in case of an update, the exception's MTU would
always change. Now, an exception can have only its lock flag updated,
but not the MTU, so we need to add a check on locking to the following
"is this exception getting updated, or close to expiring?" test.

Fixes: d52e5a7e7ca4 ("ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b678466da451..8501554e96a4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1001,21 +1001,22 @@ out:	kfree_skb(skb);
 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
 	struct dst_entry *dst = &rt->dst;
+	u32 old_mtu = ipv4_mtu(dst);
 	struct fib_result res;
 	bool lock = false;
 
 	if (ip_mtu_locked(dst))
 		return;
 
-	if (ipv4_mtu(dst) < mtu)
+	if (old_mtu < mtu)
 		return;
 
 	if (mtu < ip_rt_min_pmtu) {
 		lock = true;
-		mtu = ip_rt_min_pmtu;
+		mtu = min(old_mtu, ip_rt_min_pmtu);
 	}
 
-	if (rt->rt_pmtu == mtu &&
+	if (rt->rt_pmtu == mtu && !lock &&
 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 		return;
 
-- 
cgit v1.2.3-59-g8ed1b


From 047491ea334a454fa0647ec99dadcc6dd38417e0 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 10 Oct 2018 17:34:01 +0200
Subject: tipc: set link tolerance correctly in broadcast link

In the patch referred to below we added link tolerance as an additional
criteria for declaring broadcast transmission "stale" and resetting the
affected links.

However, the 'tolerance' field of the broadcast link is never set, and
remains at zero. This renders the whole commit without the intended
improving effect, but luckily also with no negative effect.

In this commit we add the missing initialization.

Fixes: a4dc70d46cf1 ("tipc: extend link reset criteria for stale packet retransmission")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index fb886b525d95..d229a36968da 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -477,6 +477,8 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 	l->in_session = false;
 	l->bearer_id = bearer_id;
 	l->tolerance = tolerance;
+	if (bc_rcvlink)
+		bc_rcvlink->tolerance = tolerance;
 	l->net_plane = net_plane;
 	l->advertised_mtu = mtu;
 	l->mtu = mtu;
@@ -1031,7 +1033,7 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
 	/* Detect repeated retransmit failures on same packet */
 	if (r->last_retransm != buf_seqno(skb)) {
 		r->last_retransm = buf_seqno(skb);
-		r->stale_limit = jiffies + msecs_to_jiffies(l->tolerance);
+		r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
 	} else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
 		link_retransmit_failure(l, skb);
 		if (link_is_bc_sndlink(l))
@@ -1576,9 +1578,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
 		strncpy(if_name, data, TIPC_MAX_IF_NAME);
 
 		/* Update own tolerance if peer indicates a non-zero value */
-		if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+		if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
 			l->tolerance = peers_tol;
-
+			l->bc_rcvlink->tolerance = peers_tol;
+		}
 		/* Update own priority if peer's priority is higher */
 		if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI))
 			l->priority = peers_prio;
@@ -1604,9 +1607,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
 		l->rcv_nxt_state = msg_seqno(hdr) + 1;
 
 		/* Update own tolerance if peer indicates a non-zero value */
-		if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+		if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
 			l->tolerance = peers_tol;
-
+			l->bc_rcvlink->tolerance = peers_tol;
+		}
 		/* Update own prio if peer indicates a different value */
 		if ((peers_prio != l->priority) &&
 		    in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) {
@@ -2223,6 +2227,8 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
 			     struct sk_buff_head *xmitq)
 {
 	l->tolerance = tol;
+	if (l->bc_rcvlink)
+		l->bc_rcvlink->tolerance = tol;
 	if (link_is_up(l))
 		tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq);
 }
-- 
cgit v1.2.3-59-g8ed1b


From e7eb05823806502747eadc31039cecfd7836ddeb Mon Sep 17 00:00:00 2001
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Date: Wed, 10 Oct 2018 17:50:23 +0200
Subject: tipc: queue socket protocol error messages into socket receive buffer

In tipc_sk_filter_rcv(), when we detect protocol messages with error we
call tipc_sk_conn_proto_rcv() and let it reset the connection and notify
the socket by calling sk->sk_state_change().

However, tipc_sk_filter_rcv() may have been called from the function
tipc_backlog_rcv(), in which case the socket lock is held and the socket
already awake. This means that the sk_state_change() call is ignored and
the error notification lost. Now the receive queue will remain empty and
the socket sleeps forever.

In this commit, we convert the protocol message into a connection abort
message and enqueue it into the socket's receive queue. By this addition
to the above state change we cover all conditions.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index b6f99b021d09..49810fdff4c5 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1196,6 +1196,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
  * @skb: pointer to message buffer.
  */
 static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
+				   struct sk_buff_head *inputq,
 				   struct sk_buff_head *xmitq)
 {
 	struct tipc_msg *hdr = buf_msg(skb);
@@ -1213,7 +1214,16 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
 		tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
 				      tsk_peer_port(tsk));
 		sk->sk_state_change(sk);
-		goto exit;
+
+		/* State change is ignored if socket already awake,
+		 * - convert msg to abort msg and add to inqueue
+		 */
+		msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE);
+		msg_set_type(hdr, TIPC_CONN_MSG);
+		msg_set_size(hdr, BASIC_H_SIZE);
+		msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+		__skb_queue_tail(inputq, skb);
+		return;
 	}
 
 	tsk->probe_unacked = false;
@@ -1936,7 +1946,7 @@ static void tipc_sk_proto_rcv(struct sock *sk,
 
 	switch (msg_user(hdr)) {
 	case CONN_MANAGER:
-		tipc_sk_conn_proto_rcv(tsk, skb, xmitq);
+		tipc_sk_conn_proto_rcv(tsk, skb, inputq, xmitq);
 		return;
 	case SOCK_WAKEUP:
 		tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0);
-- 
cgit v1.2.3-59-g8ed1b


From a1f8dd34e64af689e95122921fb2ca83dedd4c4e Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Thu, 11 Oct 2018 19:57:56 +0800
Subject: tipc: eliminate possible recursive locking detected by LOCKDEP

When booting kernel with LOCKDEP option, below warning info was found:

WARNING: possible recursive locking detected
4.19.0-rc7+ #14 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850

but task is already holding lock:
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&(&list->lock)->rlock#4);
  lock(&(&list->lock)->rlock#4);

 *** DEADLOCK ***

 May be due to missing lock nesting notation

2 locks held by swapper/0/1:
 #0: 00000000f7539d34 (pernet_ops_rwsem){+.+.}, at:
register_pernet_subsys+0x19/0x40 net/core/net_namespace.c:1051
 #1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
spin_lock_bh include/linux/spinlock.h:334 [inline]
 #1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849

stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc7+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1af/0x295 lib/dump_stack.c:113
 print_deadlock_bug kernel/locking/lockdep.c:1759 [inline]
 check_deadlock kernel/locking/lockdep.c:1803 [inline]
 validate_chain kernel/locking/lockdep.c:2399 [inline]
 __lock_acquire+0xf1e/0x3c60 kernel/locking/lockdep.c:3411
 lock_acquire+0x1db/0x520 kernel/locking/lockdep.c:3900
 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
 _raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168
 spin_lock_bh include/linux/spinlock.h:334 [inline]
 tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
 tipc_link_bc_create+0xb5/0x1f0 net/tipc/link.c:526
 tipc_bcast_init+0x59b/0xab0 net/tipc/bcast.c:521
 tipc_init_net+0x472/0x610 net/tipc/core.c:82
 ops_init+0xf7/0x520 net/core/net_namespace.c:129
 __register_pernet_operations net/core/net_namespace.c:940 [inline]
 register_pernet_operations+0x453/0xac0 net/core/net_namespace.c:1011
 register_pernet_subsys+0x28/0x40 net/core/net_namespace.c:1052
 tipc_init+0x83/0x104 net/tipc/core.c:140
 do_one_initcall+0x109/0x70a init/main.c:885
 do_initcall_level init/main.c:953 [inline]
 do_initcalls init/main.c:961 [inline]
 do_basic_setup init/main.c:979 [inline]
 kernel_init_freeable+0x4bd/0x57f init/main.c:1144
 kernel_init+0x13/0x180 init/main.c:1063
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413

The reason why the noise above was complained by LOCKDEP is because we
nested to hold l->wakeupq.lock and l->inputq->lock in tipc_link_reset
function. In fact it's unnecessary to move skb buffer from l->wakeupq
queue to l->inputq queue while holding the two locks at the same time.
Instead, we can move skb buffers in l->wakeupq queue to a temporary
list first and then move the buffers of the temporary list to l->inputq
queue, which is also safe for us.

Fixes: 3f32d0be6c16 ("tipc: lock wakeup & inputq at tipc_link_reset()")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index d229a36968da..f6552e4f4b43 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -845,14 +845,21 @@ static void link_prepare_wakeup(struct tipc_link *l)
 
 void tipc_link_reset(struct tipc_link *l)
 {
+	struct sk_buff_head list;
+
+	__skb_queue_head_init(&list);
+
 	l->in_session = false;
 	l->session++;
 	l->mtu = l->advertised_mtu;
+
 	spin_lock_bh(&l->wakeupq.lock);
+	skb_queue_splice_init(&l->wakeupq, &list);
+	spin_unlock_bh(&l->wakeupq.lock);
+
 	spin_lock_bh(&l->inputq->lock);
-	skb_queue_splice_init(&l->wakeupq, l->inputq);
+	skb_queue_splice_init(&list, l->inputq);
 	spin_unlock_bh(&l->inputq->lock);
-	spin_unlock_bh(&l->wakeupq.lock);
 
 	__skb_queue_purge(&l->transmq);
 	__skb_queue_purge(&l->deferdq);
-- 
cgit v1.2.3-59-g8ed1b