From 5648b5e1169ff1d6d6a46c35c0b5fbebd2a5cbb2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Oct 2021 18:08:10 +0200 Subject: netfilter: nfnetlink_queue: fix OOB when mac header was cleared On 64bit platforms the MAC header is set to 0xffff on allocation and also when a helper like skb_unset_mac_header() is called. dev_parse_header may call skb_mac_header() which assumes valid mac offset: BUG: KASAN: use-after-free in eth_header_parse+0x75/0x90 Read of size 6 at addr ffff8881075a5c05 by task nf-queue/1364 Call Trace: memcpy+0x20/0x60 eth_header_parse+0x75/0x90 __nfqnl_enqueue_packet+0x1a61/0x3380 __nf_queue+0x597/0x1300 nf_queue+0xf/0x40 nf_hook_slow+0xed/0x190 nf_hook+0x184/0x440 ip_output+0x1c0/0x2a0 nf_reinject+0x26f/0x700 nfqnl_recv_verdict+0xa16/0x18b0 nfnetlink_rcv_msg+0x506/0xe70 The existing code only works if the skb has a mac header. Fixes: 2c38de4c1f8da7 ("netfilter: fix looped (broad|multi)cast's MAC handling") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 4c3fbaaeb103..4acc4b8e9fe5 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -560,7 +560,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nla_put_failure; if (indev && entskb->dev && - entskb->mac_header != entskb->network_header) { + skb_mac_header_was_set(entskb)) { struct nfqnl_msg_packet_hw phw; int len; -- cgit v1.2.3-59-g8ed1b From 2199f562730dd1382946e0a2532afc38cd444129 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 21 Oct 2021 15:02:55 +0200 Subject: ipvs: autoload ipvs on genl access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel provides the functionality to automatically load modules providing genl families. Use this to remove the need for users to manually load the module. Signed-off-by: Thomas Weißschuh Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 29ec3ef63edc..0ff94c66641f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -48,6 +48,8 @@ #include +MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); + /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ static DEFINE_MUTEX(__ip_vs_mutex); -- cgit v1.2.3-59-g8ed1b From c4777efa751d293e369aec464ce6875e957be255 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Nov 2021 17:45:55 -0700 Subject: net: add and use skb_unclone_keeptruesize() helper While commit 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") fixed immediate issues found when KFENCE was enabled/tested, there are still similar issues, when tcp_trim_head() hits KFENCE while the master skb is cloned. This happens under heavy networking TX workloads, when the TX completion might be delayed after incoming ACK. This patch fixes the WARNING in sk_stream_kill_queues when sk->sk_mem_queued/sk->sk_forward_alloc are not zero. 
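As a rough illustration (not a hunk from this patch), the save/restore
pattern that the new helper wraps is sketched below; skb_cloned(),
pskb_expand_head() and skb->truesize are the real kernel symbols, while
the function name is made up. The helper added in the diff additionally
keeps the might_sleep_if() annotation from skb_unclone().

    /* Sketch only: kernel context, needs <linux/skbuff.h>; name is made up. */
    static inline int sketch_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
    {
            int res = 0;

            if (skb_cloned(skb)) {
                    /* pskb_expand_head() may reallocate the head; under KFENCE
                     * the new ksize() can differ from the old one, silently
                     * changing skb->truesize after the old value was already
                     * charged to sk_wmem_queued/sk_forward_alloc.
                     */
                    unsigned int save = skb->truesize;

                    res = pskb_expand_head(skb, 0, 0, pri);
                    skb->truesize = save;   /* charge and uncharge stay equal */
            }
            return res;
    }

Keeping truesize stable means the uncharge done when the skb leaves the
write queue matches the charge done when it was queued, so
sk_stream_kill_queues() no longer finds a leftover balance at close.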
Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Eric Dumazet Acked-by: Marco Elver Link: https://lore.kernel.org/r/20211102004555.1359210-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++++++++++++ net/core/skbuff.c | 14 +------------- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bd6520329f6..a63e13082397 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1675,6 +1675,22 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } +/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + might_sleep_if(gfpflags_allow_blocking(pri)); + + if (skb_cloned(skb)) { + unsigned int save = skb->truesize; + int res; + + res = pskb_expand_head(skb, 0, 0, pri); + skb->truesize = save; + return res; + } + return 0; +} + /** * skb_header_cloned - is the header a clone * @skb: buffer to check diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 67a9188d8a49..3ec42cdee16a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3449,19 +3449,7 @@ EXPORT_SYMBOL(skb_split); */ static int skb_prepare_for_shift(struct sk_buff *skb) { - int ret = 0; - - if (skb_cloned(skb)) { - /* Save and restore truesize: pskb_expand_head() may reallocate - * memory where ksize(kmalloc(S)) != ksize(kmalloc(S)), but we - * cannot change truesize at this point. - */ - unsigned int save_truesize = skb->truesize; - - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - skb->truesize = save_truesize; - } - return ret; + return skb_unclone_keeptruesize(skb, GFP_ATOMIC); } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6fbbf1558033..76cc1641beb4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1559,7 +1559,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; } - if (skb_unclone(skb, gfp)) + if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ @@ -1667,7 +1667,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); @@ -3166,7 +3166,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { - if (skb_unclone(skb, GFP_ATOMIC)) + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); -- cgit v1.2.3-59-g8ed1b From c081d53f97a1a90a38e4296dd3d6fda5e38dca2c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:47 -0400 Subject: security: pass asoc to sctp_assoc_request and sctp_sk_clone This patch is to move secid and peer_secid from endpoint to association, and pass asoc to sctp_assoc_request and sctp_sk_clone instead of ep. As ep is the local endpoint and asoc represents a connection, and in SCTP one sk/ep could have multiple asoc/connection, saving secid/peer_secid for new asoc will overwrite the old asoc's. Note that since asoc can be passed as NULL, security_sctp_assoc_request() is moved to the place right after the new_asoc is created in sctp_sf_do_5_1B_init() and sctp_sf_do_unexpected_init(). v1->v2: - fix the description of selinux_netlbl_skbuff_setsid(), as Jakub noticed. 
- fix the annotation in selinux_sctp_assoc_request(), as Richard Noticed. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 28 ++++++++++++++-------------- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/lsm_hooks.h | 8 ++++---- include/linux/security.h | 10 +++++----- include/net/sctp/structs.h | 20 ++++++++++---------- net/sctp/sm_statefuns.c | 26 +++++++++++++------------- net/sctp/socket.c | 5 ++--- security/security.c | 8 ++++---- security/selinux/hooks.c | 22 +++++++++++----------- security/selinux/include/netlabel.h | 4 ++-- security/selinux/netlabel.c | 18 +++++++++--------- 11 files changed, 76 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 0bcf6c1245ee..415b548d9ce0 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -26,11 +26,11 @@ described in the `SCTP SELinux Support`_ chapter. security_sctp_assoc_request() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Passes the ``@ep`` and ``@chunk->skb`` of the association INIT packet to the +Passes the ``@asoc`` and ``@chunk->skb`` of the association INIT packet to the security module. Returns 0 on success, error on failure. :: - @ep - pointer to sctp endpoint structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of association packet. @@ -117,9 +117,9 @@ Called whenever a new socket is created by **accept**\(2) calls **sctp_peeloff**\(3). :: - @ep - pointer to current sctp endpoint structure. + @asoc - pointer to current sctp association structure. @sk - pointer to current sock structure. - @sk - pointer to new sock structure. + @newsk - pointer to new sock structure. security_inet_conn_established() @@ -200,22 +200,22 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Passes the ``@ep`` and ``@chunk->skb`` of the association INIT packet to the +Passes the ``@asoc`` and ``@chunk->skb`` of the association INIT packet to the security module. Returns 0 on success, error on failure. :: - @ep - pointer to sctp endpoint structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of association packet. The security module performs the following operations: - IF this is the first association on ``@ep->base.sk``, then set the peer + IF this is the first association on ``@asoc->base.sk``, then set the peer sid to that in ``@skb``. This will ensure there is only one peer sid - assigned to ``@ep->base.sk`` that may support multiple associations. + assigned to ``@asoc->base.sk`` that may support multiple associations. - ELSE validate the ``@ep->base.sk peer_sid`` against the ``@skb peer sid`` + ELSE validate the ``@asoc->base.sk peer_sid`` against the ``@skb peer sid`` to determine whether the association should be allowed or denied. - Set the sctp ``@ep sid`` to socket's sid (from ``ep->base.sk``) with + Set the sctp ``@asoc sid`` to socket's sid (from ``asoc->base.sk``) with MLS portion taken from ``@skb peer sid``. This will be used by SCTP TCP style sockets and peeled off connections as they cause a new socket to be generated. @@ -259,13 +259,13 @@ security_sctp_sk_clone() Called whenever a new socket is created by **accept**\(2) (i.e. 
a TCP style socket) or when a socket is 'peeled off' e.g userspace calls **sctp_peeloff**\(3). ``security_sctp_sk_clone()`` will set the new -sockets sid and peer sid to that contained in the ``@ep sid`` and -``@ep peer sid`` respectively. +sockets sid and peer sid to that contained in the ``@asoc sid`` and +``@asoc peer sid`` respectively. :: - @ep - pointer to current sctp endpoint structure. + @asoc - pointer to current sctp association structure. @sk - pointer to current sock structure. - @sk - pointer to new sock structure. + @newsk - pointer to new sock structure. security_inet_conn_established() diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index a9ac70ae01ab..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -329,11 +329,11 @@ LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) LSM_HOOK(int, 0, tun_dev_open, void *security) -LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_endpoint *ep, +LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) -LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_endpoint *ep, +LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) #endif /* CONFIG_SECURITY_NETWORK */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0bada4df23fc..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1027,9 +1027,9 @@ * Security hooks for SCTP * * @sctp_assoc_request: - * Passes the @ep and @chunk->skb of the association INIT packet to + * Passes the @asoc and @chunk->skb of the association INIT packet to * the security module. - * @ep pointer to sctp endpoint structure. + * @asoc pointer to sctp association structure. * @skb pointer to skbuff of association packet. * Return 0 on success, error on failure. * @sctp_bind_connect: @@ -1047,9 +1047,9 @@ * Called whenever a new socket is created by accept(2) (i.e. a TCP * style socket) or when a socket is 'peeled off' e.g userspace * calls sctp_peeloff(3). - * @ep pointer to current sctp endpoint structure. + * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. - * @sk pointer to new sock structure. + * @newsk pointer to new sock structure. 
* * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 7e0ba63b5dde..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -179,7 +179,7 @@ struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; struct seq_file; -struct sctp_endpoint; +struct sctp_association; #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; @@ -1425,10 +1425,10 @@ int security_tun_dev_create(void); int security_tun_dev_attach_queue(void *security); int security_tun_dev_attach(struct sock *sk, void *security); int security_tun_dev_open(void *security); -int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb); +int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb); int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); -void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); #else /* CONFIG_SECURITY_NETWORK */ @@ -1631,7 +1631,7 @@ static inline int security_tun_dev_open(void *security) return 0; } -static inline int security_sctp_assoc_request(struct sctp_endpoint *ep, +static inline int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return 0; @@ -1644,7 +1644,7 @@ static inline int security_sctp_bind_connect(struct sock *sk, int optname, return 0; } -static inline void security_sctp_sk_clone(struct sctp_endpoint *ep, +static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 651bba654d77..899c29c326ba 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1355,16 +1355,6 @@ struct sctp_endpoint { reconf_enable:1; __u8 strreset_enable; - - /* Security identifiers from incoming (INIT). These are set by - * security_sctp_assoc_request(). These will only be used by - * SCTP TCP type sockets and peeled off connections as they - * cause a new socket to be generated. security_sctp_sk_clone() - * will then plug these into the new socket. - */ - - u32 secid; - u32 peer_secid; }; /* Recover the outter endpoint structure. */ @@ -2104,6 +2094,16 @@ struct sctp_association { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + /* Security identifiers from incoming (INIT). These are set by + * security_sctp_assoc_request(). These will only be used by + * SCTP TCP type sockets and peeled off connections as they + * cause a new socket to be generated. security_sctp_sk_clone() + * will then plug these into the new socket. + */ + + u32 secid; + u32 peer_secid; + struct rcu_head rcu; }; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index fb3da4d8f4a3..3206374209bc 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -326,11 +326,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. 
@@ -415,6 +410,12 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) @@ -780,7 +781,6 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, } } - /* Delay state machine commands until later. * * Re-build the bind address for the association is done in @@ -1517,11 +1517,6 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( struct sctp_packet *packet; int len; - /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* 6.10 Bundling * An endpoint MUST NOT bundle INIT, INIT ACK or * SHUTDOWN COMPLETE with any other chunks. @@ -1594,6 +1589,12 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!new_asoc) goto nomem; + /* Update socket peer label if first association. */ + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) goto nomem; @@ -2255,8 +2256,7 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( } /* Update socket peer label if first association. */ - if (security_sctp_assoc_request((struct sctp_endpoint *)ep, - chunk->skb)) { + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { sctp_association_free(new_asoc); return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6b937bfd4751..33391254fa82 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9412,7 +9412,6 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; struct sctp_sock *sp = sctp_sk(sk); - struct sctp_endpoint *ep = sp->ep; newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; @@ -9457,9 +9456,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, net_enable_timestamp(); /* Set newsk security attributes from original sk and connection - * security attribute from ep. + * security attribute from asoc. 
*/ - security_sctp_sk_clone(ep, sk, newsk); + security_sctp_sk_clone(asoc, sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, diff --git a/security/security.c b/security/security.c index 95e30fadba78..c88167a414b4 100644 --- a/security/security.c +++ b/security/security.c @@ -2367,9 +2367,9 @@ int security_tun_dev_open(void *security) } EXPORT_SYMBOL(security_tun_dev_open); -int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb) +int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { - return call_int_hook(sctp_assoc_request, 0, ep, skb); + return call_int_hook(sctp_assoc_request, 0, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); @@ -2381,10 +2381,10 @@ int security_sctp_bind_connect(struct sock *sk, int optname, } EXPORT_SYMBOL(security_sctp_bind_connect); -void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { - call_void_hook(sctp_sk_clone, ep, sk, newsk); + call_void_hook(sctp_sk_clone, asoc, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ea7b2876a5ae..62d30c0a30c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5339,10 +5339,10 @@ static void selinux_sock_graft(struct sock *sk, struct socket *parent) * connect(2), sctp_connectx(3) or sctp_sendmsg(3) (with no association * already present). */ -static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, +static int selinux_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { - struct sk_security_struct *sksec = ep->base.sk->sk_security; + struct sk_security_struct *sksec = asoc->base.sk->sk_security; struct common_audit_data ad; struct lsm_network_audit net = {0,}; u8 peerlbl_active; @@ -5359,7 +5359,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, /* This will return peer_sid = SECSID_NULL if there are * no peer labels, see security_net_peersid_resolve(). */ - err = selinux_skb_peerlbl_sid(skb, ep->base.sk->sk_family, + err = selinux_skb_peerlbl_sid(skb, asoc->base.sk->sk_family, &peer_sid); if (err) return err; @@ -5383,7 +5383,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, */ ad.type = LSM_AUDIT_DATA_NET; ad.u.net = &net; - ad.u.net->sk = ep->base.sk; + ad.u.net->sk = asoc->base.sk; err = avc_has_perm(&selinux_state, sksec->peer_sid, peer_sid, sksec->sclass, SCTP_SOCKET__ASSOCIATION, &ad); @@ -5392,7 +5392,7 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, } /* Compute the MLS component for the connection and store - * the information in ep. This will be used by SCTP TCP type + * the information in asoc. This will be used by SCTP TCP type * sockets and peeled off connections as they cause a new * socket to be generated. selinux_sctp_sk_clone() will then * plug this into the new socket. @@ -5401,11 +5401,11 @@ static int selinux_sctp_assoc_request(struct sctp_endpoint *ep, if (err) return err; - ep->secid = conn_sid; - ep->peer_secid = peer_sid; + asoc->secid = conn_sid; + asoc->peer_secid = peer_sid; /* Set any NetLabel labels including CIPSO/CALIPSO options. 
*/ - return selinux_netlbl_sctp_assoc_request(ep, skb); + return selinux_netlbl_sctp_assoc_request(asoc, skb); } /* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting @@ -5490,7 +5490,7 @@ static int selinux_sctp_bind_connect(struct sock *sk, int optname, } /* Called whenever a new socket is created by accept(2) or sctp_peeloff(3). */ -static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { struct sk_security_struct *sksec = sk->sk_security; @@ -5502,8 +5502,8 @@ static void selinux_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); - newsksec->sid = ep->secid; - newsksec->peer_sid = ep->peer_secid; + newsksec->sid = asoc->secid; + newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); } diff --git a/security/selinux/include/netlabel.h b/security/selinux/include/netlabel.h index 0c58f62dc6ab..4d0456d3d459 100644 --- a/security/selinux/include/netlabel.h +++ b/security/selinux/include/netlabel.h @@ -39,7 +39,7 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, int selinux_netlbl_skbuff_setsid(struct sk_buff *skb, u16 family, u32 sid); -int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb); int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family); void selinux_netlbl_inet_csk_clone(struct sock *sk, u16 family); @@ -98,7 +98,7 @@ static inline int selinux_netlbl_skbuff_setsid(struct sk_buff *skb, return 0; } -static inline int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +static inline int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return 0; diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index 29b88e81869b..1321f15799e2 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -261,30 +261,30 @@ skbuff_setsid_return: /** * selinux_netlbl_sctp_assoc_request - Label an incoming sctp association. - * @ep: incoming association endpoint. + * @asoc: incoming association. * @skb: the packet. * * Description: - * A new incoming connection is represented by @ep, ...... + * A new incoming connection is represented by @asoc, ...... * Returns zero on success, negative values on failure. 
* */ -int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, +int selinux_netlbl_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { int rc; struct netlbl_lsm_secattr secattr; - struct sk_security_struct *sksec = ep->base.sk->sk_security; + struct sk_security_struct *sksec = asoc->base.sk->sk_security; struct sockaddr_in addr4; struct sockaddr_in6 addr6; - if (ep->base.sk->sk_family != PF_INET && - ep->base.sk->sk_family != PF_INET6) + if (asoc->base.sk->sk_family != PF_INET && + asoc->base.sk->sk_family != PF_INET6) return 0; netlbl_secattr_init(&secattr); rc = security_netlbl_sid_to_secattr(&selinux_state, - ep->secid, &secattr); + asoc->secid, &secattr); if (rc != 0) goto assoc_request_return; @@ -294,11 +294,11 @@ int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep, if (ip_hdr(skb)->version == 4) { addr4.sin_family = AF_INET; addr4.sin_addr.s_addr = ip_hdr(skb)->saddr; - rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr4, &secattr); + rc = netlbl_conn_setattr(asoc->base.sk, (void *)&addr4, &secattr); } else if (IS_ENABLED(CONFIG_IPV6) && ip_hdr(skb)->version == 6) { addr6.sin6_family = AF_INET6; addr6.sin6_addr = ipv6_hdr(skb)->saddr; - rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr6, &secattr); + rc = netlbl_conn_setattr(asoc->base.sk, (void *)&addr6, &secattr); } else { rc = -EAFNOSUPPORT; } -- cgit v1.2.3-59-g8ed1b From e215dab1c49070cd75620afd801f777207a5b65c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:48 -0400 Subject: security: call security_sctp_assoc_request in sctp_sf_do_5_1D_ce The asoc created when receives the INIT chunk is a temporary one, it will be deleted after INIT_ACK chunk is replied. So for the real asoc created in sctp_sf_do_5_1D_ce() when the COOKIE_ECHO chunk is received, security_sctp_assoc_request() should also be called. v1->v2: - fix some typo and grammar errors, noticed by Ondrej. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 15 +++++++++------ net/sctp/sm_statefuns.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 415b548d9ce0..d5fd6ccc3dcb 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -151,9 +151,9 @@ establishing an association. INIT ---------------------------------------------> sctp_sf_do_5_1B_init() Respond to an INIT chunk. - SCTP peer endpoint "A" is - asking for an association. Call - security_sctp_assoc_request() + SCTP peer endpoint "A" is asking + for a temporary association. + Call security_sctp_assoc_request() to set the peer label if first association. If not first association, check @@ -163,9 +163,12 @@ establishing an association. | discard the packet. | COOKIE ECHO ------------------------------------------> - | - | - | + sctp_sf_do_5_1D_ce() + Respond to an COOKIE ECHO chunk. + Confirm the cookie and create a + permanent association. + Call security_sctp_assoc_request() to + do the same as for INIT chunk Response. 
<------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3206374209bc..b818532c3fc2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -781,6 +781,11 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, } } + if (security_sctp_assoc_request(new_asoc, chunk->skb)) { + sctp_association_free(new_asoc); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + /* Delay state machine commands until later. * * Re-build the bind address for the association is done in -- cgit v1.2.3-59-g8ed1b From 7c2ef0240e6abfd3cc59511339517358350a8910 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:49 -0400 Subject: security: add sctp_assoc_established hook security_sctp_assoc_established() is added to replace security_inet_conn_established() called in sctp_sf_do_5_1E_ca(), so that asoc can be accessed in security subsystem and save the peer secid to asoc->peer_secid. v1->v2: - fix the return value of security_sctp_assoc_established() in security.h, found by kernel test robot and Ondrej. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 22 ++++++++++------------ include/linux/lsm_hook_defs.h | 2 ++ include/linux/lsm_hooks.h | 5 +++++ include/linux/security.h | 7 +++++++ net/sctp/sm_statefuns.c | 2 +- security/security.c | 7 +++++++ 6 files changed, 32 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index d5fd6ccc3dcb..406cc68b8808 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -15,10 +15,7 @@ For security module support, three SCTP specific hooks have been implemented:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - -Also the following security hook has been utilised:: - - security_inet_conn_established() + security_sctp_assoc_established() The usage of these hooks are described below with the SELinux implementation described in the `SCTP SELinux Support`_ chapter. @@ -122,11 +119,12 @@ calls **sctp_peeloff**\(3). @newsk - pointer to new sock structure. -security_inet_conn_established() +security_sctp_assoc_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Called when a COOKIE ACK is received:: +Called when a COOKIE ACK is received, and the peer secid will be +saved into ``@asoc->peer_secid`` for client:: - @sk - pointer to sock structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of the COOKIE ACK packet. @@ -134,7 +132,7 @@ Security Hooks used for Association Establishment ------------------------------------------------- The following diagram shows the use of ``security_sctp_bind_connect()``, -``security_sctp_assoc_request()``, ``security_inet_conn_established()`` when +``security_sctp_assoc_request()``, ``security_sctp_assoc_established()`` when establishing an association. :: @@ -172,7 +170,7 @@ establishing an association. <------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | - Call security_inet_conn_established() | + Call security_sctp_assoc_established() | to set the peer label. 
| | | | If SCTP_SOCKET_TCP or peeled off @@ -198,7 +196,7 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - security_inet_conn_established() + security_sctp_assoc_established() security_sctp_assoc_request() @@ -271,12 +269,12 @@ sockets sid and peer sid to that contained in the ``@asoc sid`` and @newsk - pointer to new sock structure. -security_inet_conn_established() +security_sctp_assoc_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Called when a COOKIE ACK is received where it sets the connection's peer sid to that in ``@skb``:: - @sk - pointer to sock structure. + @asoc - pointer to sctp association structure. @skb - pointer to skbuff of the COOKIE ACK packet. diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..442a611fa0fb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,6 +335,8 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) +LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, + struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d45b6f6e27fd..d6823214d5c1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,6 +1050,11 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. + * @sctp_assoc_established: + * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet + * to the security module. + * @asoc pointer to sctp association structure. + * @skb pointer to skbuff of association packet. * * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index bbf44a466832..06eac4e61a13 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,6 +1430,8 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); +void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1649,6 +1651,11 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } + +static inline void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ +} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index b818532c3fc2..5fabaa54b77d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -946,7 +946,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); /* Set peer label for connection. 
*/ - security_inet_conn_established(ep->base.sk, chunk->skb); + security_sctp_assoc_established((struct sctp_association *)asoc, chunk->skb); /* RFC 2960 5.1 Normal Establishment of an Association * diff --git a/security/security.c b/security/security.c index c88167a414b4..779a9edea0a0 100644 --- a/security/security.c +++ b/security/security.c @@ -2388,6 +2388,13 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, } EXPORT_SYMBOL(security_sctp_sk_clone); +void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ + call_void_hook(sctp_assoc_established, asoc, skb); +} +EXPORT_SYMBOL(security_sctp_assoc_established); + #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND -- cgit v1.2.3-59-g8ed1b From aedddb4e45b34426cfbfa84454b6f203712733c5 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 2 Nov 2021 16:10:21 +0800 Subject: NFC: add necessary privilege flags in netlink layer The CAP_NET_ADMIN checks are needed to prevent attackers faking a device under NCIUARTSETDRIVER and exploit privileged commands. This patch add GENL_ADMIN_PERM flags in genl_ops to fulfill the check. Except for commands like NFC_CMD_GET_DEVICE, NFC_CMD_GET_TARGET, NFC_CMD_LLC_GET_PARAMS, and NFC_CMD_GET_SE, which are mainly information- read operations. Signed-off-by: Lin Ma Signed-off-by: David S. Miller --- net/nfc/netlink.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 49089c50872e..334f63c9529e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1664,31 +1664,37 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, @@ -1706,26 +1712,31 @@ static const struct genl_ops nfc_genl_ops[] = { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, @@ -1737,21 +1748,25 @@ static const struct genl_ops 
nfc_genl_ops[] = { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, + .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, + .flags = GENL_ADMIN_PERM, }, }; -- cgit v1.2.3-59-g8ed1b From acaea0d5a63406c052444ad3a7cb54241adaf805 Mon Sep 17 00:00:00 2001 From: Zhang Mingyu Date: Wed, 3 Nov 2021 06:46:17 +0000 Subject: net:ipv6:Remove unneeded semicolon Eliminate the following coccinelle check warning: net/ipv6/seg6.c:381:2-3 Reported-by: Zeal Robot Signed-off-by: Zhang Mingyu Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 5daa1c3ed83b..a8b5784afb1a 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -378,7 +378,7 @@ static int __net_init seg6_net_init(struct net *net) kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); return -ENOMEM; - }; + } #endif return 0; -- cgit v1.2.3-59-g8ed1b From 9b65b17db72313b7a4fe9bc9502928c88be57986 Mon Sep 17 00:00:00 2001 From: Talal Ahmad Date: Tue, 2 Nov 2021 22:58:44 -0400 Subject: net: avoid double accounting for pure zerocopy skbs Track skbs containing only zerocopy data and avoid charging them to kernel memory to correctly account the memory utilization for msg_zerocopy. All of the data in such skbs is held in user pages which are already accounted to user. Before this change, they are charged again in kernel in __zerocopy_sg_from_iter. The charging in kernel is excessive because data is not being copied into skb frags. This excessive charging can lead to kernel going into memory pressure state which impacts all sockets in the system adversely. Mark pure zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove charge/uncharge for data in such skbs. Initially, an skb is marked pure zerocopy when it is empty and in zerocopy path. skb can then change from a pure zerocopy skb to mixed data skb (zerocopy and copy data) if it is at tail of write queue and there is room available in it and non-zerocopy data is being sent in the next sendmsg call. At this time sk_mem_charge is done for the pure zerocopied data and the pure zerocopy flag is unmarked. We found that this happens very rarely on workloads that pass MSG_ZEROCOPY. A pure zerocopy skb can later be coalesced into normal skb if they are next to each other in queue but this patch prevents coalescing from happening. This avoids complexity of charging when skb downgrades from pure zerocopy to mixed. This is also rare. In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge for SKB_TRUESIZE(skb_end_offset(skb)) is done for sk_mem_charge in tcp_skb_entail for an skb without data. Testing with the msg_zerocopy.c benchmark between two hosts(100G nics) with zerocopy showed that before this patch the 'sock' variable in memory.stat for cgroup2 that tracks sum of sk_forward_alloc, sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this change it is 0. This is due to no charge to sk_forward_alloc for zerocopy data and shows memory utilization for kernel is lowered. 
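To summarize the accounting rule (an annotated sketch, not a hunk from
the patch; skb_zcopy_pure(), SKB_TRUESIZE() and skb_end_offset() are the
real symbols, while the function name is made up):

    /* Sketch only: kernel context, needs <net/sock.h> and <linux/skbuff.h>.
     * A pure zerocopy skb carries nothing but user pages in its frags, so
     * only the linear head was ever charged to the socket; a mixed skb was
     * charged its full truesize.
     */
    static void sketch_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
    {
            sk_wmem_queued_add(sk, -skb->truesize);
            if (!skb_zcopy_pure(skb))
                    /* mixed or copied data: the whole truesize was charged */
                    sk_mem_uncharge(sk, skb->truesize);
            else
                    /* pure zerocopy: only the head was charged */
                    sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb)));
            __kfree_skb(skb);
    }

The flag is set when zerocopy data is appended to an empty skb and
cleared (after charging skb->data_len) if copied data is later appended
to the same skb, so the uncharge above always matches what was charged.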
With this commit we don't see the warning we saw in previous commit which resulted in commit 84882cf72cd774cf16fd338bdbf00f69ac9f9194. Signed-off-by: Talal Ahmad Acked-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Signed-off-by: Willem de Bruijn Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 19 ++++++++++++++++++- include/net/tcp.h | 8 ++++++-- net/core/datagram.c | 3 ++- net/core/skbuff.c | 3 ++- net/ipv4/tcp.c | 22 ++++++++++++++++++++-- net/ipv4/tcp_output.c | 7 +++++-- 6 files changed, 53 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a63e13082397..686a666d073d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,9 +454,15 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), + + /* segment contains only zerocopy data and should not be + * charged to the kernel memory. + */ + SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline bool skb_zcopy_pure(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; +} + +static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ + return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); +} + static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } diff --git a/include/net/tcp.h b/include/net/tcp.h index 70972f3ac8fa..4da22b41bde6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -293,7 +293,10 @@ static inline bool tcp_out_of_memory(struct sock *sk) static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); - sk_mem_uncharge(sk, skb->truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, skb->truesize); + else + sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } @@ -974,7 +977,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && - mptcp_skb_can_collapse(to, from)); + mptcp_skb_can_collapse(to, from) && + skb_pure_zcopy_same(to, from)); } /* Events passed to congestion control interface */ diff --git a/net/core/datagram.c b/net/core/datagram.c index 15ab9ffb27fe..ee290776c661 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -646,7 +646,8 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); - sk_mem_charge(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ec42cdee16a..ba2f38246f07 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3433,8 +3433,9 @@ static inline void skb_split_no_header(struct sk_buff *skb, void 
skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); + const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; - skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bc7f419184aa..b461ae573afc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -863,6 +863,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, if (likely(skb)) { bool mem_scheduled; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); @@ -1319,6 +1320,15 @@ new_segment: copy = min_t(int, copy, pfrag->size - pfrag->offset); + /* skb changing from pure zc to mixed, must charge zc */ + if (unlikely(skb_zcopy_pure(skb))) { + if (!sk_wmem_schedule(sk, skb->data_len)) + goto wait_for_space; + + sk_mem_charge(sk, skb->data_len); + skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; + } + if (!sk_wmem_schedule(sk, copy)) goto wait_for_space; @@ -1339,8 +1349,16 @@ new_segment: } pfrag->offset += copy; } else { - if (!sk_wmem_schedule(sk, copy)) - goto wait_for_space; + /* First append to a fragless skb builds initial + * pure zerocopy skb + */ + if (!skb->len) + skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; + + if (!skb_zcopy_pure(skb)) { + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_space; + } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 76cc1641beb4..6f7860e283c6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) if (delta_truesize) { skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); - sk_mem_uncharge(sk, delta_truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_uncharge(sk, delta_truesize); } /* Any change of skb->len requires recalculation of tso factor. */ @@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb)) + if (unlikely(TCP_SKB_CB(skb)->eor) || + tcp_has_tx_tstamp(skb) || + !skb_pure_zcopy_same(skb, next)) return false; len -= skb->len; -- cgit v1.2.3-59-g8ed1b From 1aabe578dd86e9f2867c4db4fba9a15f4ba1825d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Nov 2021 15:02:36 -0700 Subject: ethtool: fix ethtool msg len calculation for pause stats ETHTOOL_A_PAUSE_STAT_MAX is the MAX attribute id, so we need to subtract non-stats and add one to get a count (IOW -2+1 == -1). Otherwise we'll see: ethnl cmd 21: calculated reply length 40, but consumed 52 Fixes: 9a27a33027f2 ("ethtool: add standard pause stats") Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- include/linux/ethtool_netlink.h | 3 +++ include/uapi/linux/ethtool_netlink.h | 4 +++- net/ethtool/pause.c | 3 +-- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 1e7bf78cb382..aba348d58ff6 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -10,6 +10,9 @@ #define __ETHTOOL_LINK_MODE_MASK_NWORDS \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) +#define ETHTOOL_PAUSE_STAT_CNT (__ETHTOOL_A_PAUSE_STAT_CNT - \ + ETHTOOL_A_PAUSE_STAT_TX_FRAMES) + enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index ca5fbb59fa42..999777d32dcf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -411,7 +411,9 @@ enum { ETHTOOL_A_PAUSE_STAT_TX_FRAMES, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - /* add new constants above here */ + /* add new constants above here + * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! + */ __ETHTOOL_A_PAUSE_STAT_CNT, ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 9009f412151e..ee1e5806bc93 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -56,8 +56,7 @@ static int pause_reply_size(const struct ethnl_req_info *req_base, if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ - nla_total_size_64bit(sizeof(u64)) * - (ETHTOOL_A_PAUSE_STAT_MAX - 2); + nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } -- cgit v1.2.3-59-g8ed1b From 250962e4684678629afd2feeaefdc40c5db501f4 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 3 Nov 2021 16:28:43 +0800 Subject: net: udp6: replace __UDP_INC_STATS() with __UDP6_INC_STATS() __UDP_INC_STATS() is used in udpv6_queue_rcv_one_skb() when encap_rcv() fails. __UDP6_INC_STATS() should be used here, so replace it with __UDP6_INC_STATS(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- net/ipv6/udp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 12c12619ee35..e43b31d25fb6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -700,9 +700,9 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - __UDP_INC_STATS(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP6_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } -- cgit v1.2.3-59-g8ed1b From 563bcbae3ba233c275c244bfce2efe12938f5363 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 2 Nov 2021 10:12:18 +0800 Subject: net: vlan: fix a UAF in vlan_dev_real_dev() The real_dev of a vlan net_device may be freed after unregister_vlan_dev(). Access the real_dev continually by vlan_dev_real_dev() will trigger the UAF problem for the real_dev like following: ================================================================== BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120 Call Trace: kasan_report.cold+0x83/0xdf vlan_dev_real_dev+0xf9/0x120 is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0 is_eth_port_of_netdev_filter+0x28/0x40 ib_enum_roce_netdev+0x1a3/0x300 ib_enum_all_roce_netdevs+0xc7/0x140 netdevice_event_work_handler+0x9d/0x210 ... 
Freed by task 9288: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0xfc/0x130 slab_free_freelist_hook+0xdd/0x240 kfree+0xe4/0x690 kvfree+0x42/0x50 device_release+0x9f/0x240 kobject_put+0x1c8/0x530 put_device+0x1b/0x30 free_netdev+0x370/0x540 ppp_destroy_interface+0x313/0x3d0 ... Move the put_device(real_dev) to vlan_dev_free(). Ensure real_dev not be freed before vlan_dev unregistered. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com Signed-off-by: Ziyang Xuan Reviewed-by: Jason Gunthorpe Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 --- net/8021q/vlan_dev.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 55275ef9a31a..a3a0a5e994f5 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -123,9 +123,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); - - /* Get rid of the vlan's reference to real_dev */ - dev_put(real_dev); } int vlan_check_real_dev(struct net_device *real_dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 90330b893134..ab6dee28536d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -843,6 +843,9 @@ static void vlan_dev_free(struct net_device *dev) free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; + + /* Get rid of the vlan's reference to real_dev */ + dev_put(vlan->real_dev); } void vlan_setup(struct net_device *dev) -- cgit v1.2.3-59-g8ed1b From 92f62485b3715882cd397b0cbd80a96d179b86d6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 2 Nov 2021 21:31:22 +0200 Subject: net: dsa: felix: fix broken VLAN-tagged PTP under VLAN-aware bridge Normally it is expected that the dsa_device_ops :: rcv() method finishes parsing the DSA tag and consumes it, then never looks at it again. But commit c0bcf537667c ("net: dsa: ocelot: add hardware timestamping support for Felix") added support for RX timestamping in a very unconventional way. On this switch, a partial timestamp is available in the DSA header, but the driver got away with not parsing that timestamp right away, but instead delayed that parsing for a little longer: dsa_switch_rcv(): nskb = cpu_dp->rcv(skb, dev); <------------- not here -> ocelot_rcv() ... skb = nskb; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); ... if (dsa_skb_defer_rx_timestamp(p, skb)) <--- but here -> felix_rxtstamp() return 0; When in felix_rxtstamp(), this driver accounted for the fact that eth_type_trans() happened in the meanwhile, so it got a hold of the extraction header again by subtracting (ETH_HLEN + OCELOT_TAG_LEN) bytes from the current skb->data. This worked for quite some time but was quite fragile from the very beginning. Not to mention that having DSA tag parsing split in two different files, under different folders (net/dsa/tag_ocelot.c vs drivers/net/dsa/ocelot/felix.c) made it quite non-obvious for patches to come that they might break this. Finally, the blamed commit does the following: at the end of ocelot_rcv(), it checks whether the skb payload contains a VLAN header. If it does, and this port is under a VLAN-aware bridge, that VLAN ID might not be correct in the sense that the packet might have suffered VLAN rewriting due to TCAM rules (VCAP IS1). 
So we consume the VLAN ID from the skb payload using __skb_vlan_pop(), and take the classified VLAN ID from the DSA tag, and construct a hwaccel VLAN tag with the classified VLAN, and the skb payload is VLAN-untagged. The big problem is that __skb_vlan_pop() does: memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); __skb_pull(skb, VLAN_HLEN); aka it moves the Ethernet header 4 bytes to the right, and pulls 4 bytes from the skb headroom (effectively also moving skb->data, by definition). So for felix_rxtstamp()'s fragile logic, all bets are off now. Instead of having the "extraction" pointer point to the DSA header, it actually points to 4 bytes _inside_ the extraction header. Corollary, the last 4 bytes of the "extraction" header are in fact 4 stale bytes of the destination MAC address from the Ethernet header, from prior to the __skb_vlan_pop() movement. So of course, RX timestamps are completely bogus when the system is configured in this way. The fix is actually very simple: just don't structure the code like that. For better or worse, the DSA PTP timestamping API does not offer a straightforward way for drivers to present their RX timestamps, but other drivers (sja1105) have established a simple mechanism to carry their RX timestamp from dsa_device_ops :: rcv() all the way to dsa_switch_ops :: port_rxtstamp() and even later. That mechanism is to simply save the partial timestamp to the skb->cb, and complete it later. Question: why don't we simply populate the skb's struct skb_shared_hwtstamps from ocelot_rcv(), and bother with this complication of propagating the timestamp to felix_rxtstamp()? Answer: dsa_switch_ops :: port_rxtstamp() answers the question whether PTP packets need sleepable context to retrieve the full RX timestamp. Currently felix_rxtstamp() answers "no, thanks" to that question, and calls ocelot_ptp_gettime64() from softirq atomic context. This is understandable, since Felix VSC9959 is a PCIe memory-mapped switch, so hardware access does not require sleeping. But the felix driver is preparing for the introduction of other switches where hardware access is over a slow bus like SPI or MDIO: https://lore.kernel.org/lkml/20210814025003.2449143-1-colin.foster@in-advantage.com/ So I would like to keep this code structure, so the rework needed when that driver will need PTP support will be minimal (answer "yes, I need deferred context for this skb's RX timestamp", then the partial timestamp will still be found in the skb->cb. Fixes: ea440cd2d9b2 ("net: dsa: tag_ocelot: use VLAN information from tagging header when available") Reported-by: Po Liu Cc: Yangbo Lu Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/ocelot/felix.c | 9 +++------ include/linux/dsa/ocelot.h | 1 + net/dsa/tag_ocelot.c | 3 +++ 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 83808e7dbdda..327cc4654806 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1370,12 +1370,12 @@ out: static bool felix_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type) { - u8 *extraction = skb->data - ETH_HLEN - OCELOT_TAG_LEN; + u32 tstamp_lo = OCELOT_SKB_CB(skb)->tstamp_lo; struct skb_shared_hwtstamps *shhwtstamps; struct ocelot *ocelot = ds->priv; - u32 tstamp_lo, tstamp_hi; struct timespec64 ts; - u64 tstamp, val; + u32 tstamp_hi; + u64 tstamp; /* If the "no XTR IRQ" workaround is in use, tell DSA to defer this skb * for RX timestamping. Then free it, and poll for its copy through @@ -1390,9 +1390,6 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port, ocelot_ptp_gettime64(&ocelot->ptp_info, &ts); tstamp = ktime_set(ts.tv_sec, ts.tv_nsec); - ocelot_xfh_get_rew_val(extraction, &val); - tstamp_lo = (u32)val; - tstamp_hi = tstamp >> 32; if ((tstamp & 0xffffffff) < tstamp_lo) tstamp_hi--; diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index d42010cf5468..7ee708ad7df2 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -12,6 +12,7 @@ struct ocelot_skb_cb { struct sk_buff *clone; unsigned int ptp_class; /* valid only for clones */ + u32 tstamp_lo; u8 ptp_cmd; u8 ts_id; }; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index cd60b94fc175..de1c849a0a70 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -101,6 +101,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; + u64 rew_val; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -130,6 +131,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, ocelot_xfh_get_qos_class(extraction, &qos_class); ocelot_xfh_get_tag_type(extraction, &tag_type); ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); + ocelot_xfh_get_rew_val(extraction, &rew_val); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) @@ -143,6 +145,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, dsa_default_offload_fwd_mark(skb); skb->priority = qos_class; + OCELOT_SKB_CB(skb)->tstamp_lo = rew_val; /* Ocelot switches copy frames unmodified to the CPU. However, it is * possible for the user to request a VLAN modification through -- cgit v1.2.3-59-g8ed1b From 3b65abb8d8a650e50ff5448ac38992ef8a74c584 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Thu, 4 Nov 2021 00:17:51 +0200 Subject: tcp: Use BIT() for OPTION_* constants Extending these flags using the existing (1 << x) pattern triggers complaints from checkpatch. Instead of ignoring checkpatch modify the existing values to use BIT(x) style in a separate commit. Signed-off-by: Leonard Crestez Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f7860e283c6..2e6e5a70168e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -408,13 +408,13 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) return tp->snd_una != tp->snd_up; } -#define OPTION_SACK_ADVERTISE (1 << 0) -#define OPTION_TS (1 << 1) -#define OPTION_MD5 (1 << 2) -#define OPTION_WSCALE (1 << 3) -#define OPTION_FAST_OPEN_COOKIE (1 << 8) -#define OPTION_SMC (1 << 9) -#define OPTION_MPTCP (1 << 10) +#define OPTION_SACK_ADVERTISE BIT(0) +#define OPTION_TS BIT(1) +#define OPTION_MD5 BIT(2) +#define OPTION_WSCALE BIT(3) +#define OPTION_FAST_OPEN_COOKIE BIT(8) +#define OPTION_SMC BIT(9) +#define OPTION_MPTCP BIT(10) static void smc_options_write(__be32 *ptr, u16 *options) { -- cgit v1.2.3-59-g8ed1b From d00c8ee31729248ba40b4ab25cd3b3b580c6f87c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Nov 2021 16:49:11 -0700 Subject: net: fix possible NULL deref in sock_reserve_memory Sanity check in sock_reserve_memory() was not enough to prevent malicious user to trigger a NULL deref. In this case, the isse is that sk_prot->memory_allocated is NULL. Use standard sk_has_account() helper to deal with this. BUG: KASAN: null-ptr-deref in instrument_atomic_read_write include/linux/instrumented.h:101 [inline] BUG: KASAN: null-ptr-deref in atomic_long_add_return include/linux/atomic/atomic-instrumented.h:1218 [inline] BUG: KASAN: null-ptr-deref in sk_memory_allocated_add include/net/sock.h:1371 [inline] BUG: KASAN: null-ptr-deref in sock_reserve_memory net/core/sock.c:994 [inline] BUG: KASAN: null-ptr-deref in sock_setsockopt+0x22ab/0x2b30 net/core/sock.c:1443 Write of size 8 at addr 0000000000000000 by task syz-executor.0/11270 CPU: 1 PID: 11270 Comm: syz-executor.0 Not tainted 5.15.0-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 __kasan_report mm/kasan/report.c:446 [inline] kasan_report.cold+0x66/0xdf mm/kasan/report.c:459 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189 instrument_atomic_read_write include/linux/instrumented.h:101 [inline] atomic_long_add_return include/linux/atomic/atomic-instrumented.h:1218 [inline] sk_memory_allocated_add include/net/sock.h:1371 [inline] sock_reserve_memory net/core/sock.c:994 [inline] sock_setsockopt+0x22ab/0x2b30 net/core/sock.c:1443 __sys_setsockopt+0x4f8/0x610 net/socket.c:2172 __do_sys_setsockopt net/socket.c:2187 [inline] __se_sys_setsockopt net/socket.c:2184 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2184 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f56076d5ae9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f5604c4b188 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007f56077e8f60 RCX: 00007f56076d5ae9 RDX: 0000000000000049 RSI: 0000000000000001 RDI: 0000000000000003 RBP: 00007f560772ff25 R08: 000000000000fec7 R09: 0000000000000000 R10: 0000000020000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffb61a100f R14: 
00007f5604c4b300 R15: 0000000000022000 Fixes: 2bb2f5fb21b0 ("net: add new socket option SO_RESERVE_MEM") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 9862eefce21e..8f2b2f2c0e7b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -976,7 +976,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg) + if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) -- cgit v1.2.3-59-g8ed1b From 96d0c9be432dfd4908e96dde7cab860368a348ab Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Wed, 3 Nov 2021 20:16:06 +0800 Subject: devlink: fix flexible_array.cocci warning Fix following coccicheck warning: ./net/core/devlink.c:69:6-10: WARNING use flexible-array member instead Signed-off-by: Guo Zhengkui Link: https://lore.kernel.org/r/20211103121607.27490-1-guozhengkui@vivo.com Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 6b5ee862429e..5ba4f9434acd 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -66,7 +66,7 @@ struct devlink { u8 reload_failed:1; refcount_t refcount; struct completion comp; - char priv[0] __aligned(NETDEV_ALIGN); + char priv[] __aligned(NETDEV_ALIGN); }; void *devlink_priv(struct devlink *devlink) -- cgit v1.2.3-59-g8ed1b From 1e4b50f06d970d8da3474d2a0354450416710bda Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 3 Nov 2021 20:09:42 +0100 Subject: mctp: handle the struct sockaddr_mctp padding fields In order to have the padding fields actually usable in the future, there have to be checks that user space doesn't supply non-zero garbage there. It is also worth setting these padding fields to zero, unless it is known that they have been already zeroed. 
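On the user-space side, the simplest way to satisfy these checks is to zero the whole sockaddr before filling in the individual fields. A minimal sketch, assuming the MCTP UAPI definitions (<linux/mctp.h>, AF_MCTP) are available on the system; the helper name and the network/EID/type values are illustrative, not taken from this patch:

#include <string.h>
#include <sys/socket.h>
#include <linux/mctp.h>

static ssize_t mctp_send_sketch(int sd, const void *buf, size_t len)
{
	struct sockaddr_mctp addr;

	/* memset() guarantees __smctp_pad0/__smctp_pad1 are zero, so the
	 * new mctp_sockaddr_is_ok() check in sendmsg() passes.
	 */
	memset(&addr, 0, sizeof(addr));
	addr.smctp_family = AF_MCTP;
	addr.smctp_network = 1;		/* example network ID */
	addr.smctp_addr.s_addr = 8;	/* example endpoint ID */
	addr.smctp_type = 1;		/* example message type */
	addr.smctp_tag = MCTP_TAG_OWNER;

	return sendto(sd, buf, len, 0,
		      (struct sockaddr *)&addr, sizeof(addr));
}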
Cc: stable@vger.kernel.org # v5.15 Fixes: 5a20dd46b8b84593 ("mctp: Be explicit about struct sockaddr_mctp padding") Signed-off-by: Eugene Syromiatnikov Acked-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index d344b02a1cde..bc88159f8844 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -33,6 +33,12 @@ static int mctp_release(struct socket *sock) return 0; } +/* Generic sockaddr checks, padding checks only so far */ +static bool mctp_sockaddr_is_ok(const struct sockaddr_mctp *addr) +{ + return !addr->__smctp_pad0 && !addr->__smctp_pad1; +} + static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; @@ -52,6 +58,9 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) /* it's a valid sockaddr for MCTP, cast and do protocol checks */ smctp = (struct sockaddr_mctp *)addr; + if (!mctp_sockaddr_is_ok(smctp)) + return -EINVAL; + lock_sock(sk); /* TODO: allow rebind */ @@ -87,6 +96,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) return -EINVAL; if (addr->smctp_family != AF_MCTP) return -EINVAL; + if (!mctp_sockaddr_is_ok(addr)) + return -EINVAL; if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) return -EINVAL; @@ -198,11 +209,13 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, addr = msg->msg_name; addr->smctp_family = AF_MCTP; + addr->__smctp_pad0 = 0; addr->smctp_network = cb->net; addr->smctp_addr.s_addr = hdr->src; addr->smctp_type = type; addr->smctp_tag = hdr->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + addr->__smctp_pad1 = 0; msg->msg_namelen = sizeof(*addr); if (msk->addr_ext) { -- cgit v1.2.3-59-g8ed1b From e9ea574ec1c27e555e7f78cbbcd28af91889d529 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Wed, 3 Nov 2021 20:09:46 +0100 Subject: mctp: handle the struct sockaddr_mctp_ext padding field struct sockaddr_mctp_ext.__smctp_paddin0 has to be checked for being set to zero, otherwise it cannot be utilised in the future. 
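The receive side is similar; a rough sketch of reading the extended address back (illustrative only: it assumes extended addressing has already been enabled on the socket, which is what the msk->addr_ext condition in the hunk below refers to, and the function name is made up):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/mctp.h>

static void mctp_recv_ext_sketch(int sd)
{
	struct sockaddr_mctp_ext ext;
	socklen_t addrlen = sizeof(ext);
	unsigned char buf[1024];
	ssize_t n;

	n = recvfrom(sd, buf, sizeof(buf), 0,
		     (struct sockaddr *)&ext, &addrlen);
	if (n < 0 || addrlen < sizeof(ext))
		return;

	/* Only the first smctp_halen bytes of smctp_haddr are valid; with
	 * this fix the reserved __smctp_pad0[] bytes come back zeroed.
	 */
	printf("ifindex %d, lladdr length %d\n",
	       ext.smctp_ifindex, ext.smctp_halen);
}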
Fixes: 99ce45d5e7dbde39 ("mctp: Implement extended addressing") Signed-off-by: Eugene Syromiatnikov Acked-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index bc88159f8844..871cf6266125 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -39,6 +39,13 @@ static bool mctp_sockaddr_is_ok(const struct sockaddr_mctp *addr) return !addr->__smctp_pad0 && !addr->__smctp_pad1; } +static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr) +{ + return !addr->__smctp_pad0[0] && + !addr->__smctp_pad0[1] && + !addr->__smctp_pad0[2]; +} + static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; @@ -135,7 +142,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, extaddr, msg->msg_name); - if (extaddr->smctp_halen > sizeof(cb->haddr)) { + if (!mctp_sockaddr_ext_is_ok(extaddr) || + extaddr->smctp_halen > sizeof(cb->haddr)) { rc = -EINVAL; goto err_free; } @@ -224,6 +232,7 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, msg->msg_namelen = sizeof(*ae); ae->smctp_ifindex = cb->ifindex; ae->smctp_halen = cb->halen; + memset(ae->__smctp_pad0, 0x0, sizeof(ae->__smctp_pad0)); memset(ae->smctp_haddr, 0x0, sizeof(ae->smctp_haddr)); memcpy(ae->smctp_haddr, cb->haddr, cb->halen); } -- cgit v1.2.3-59-g8ed1b From af1877b6cad16bdd8d8d93ca1c7b37e8f21ef4e3 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Wed, 3 Nov 2021 20:48:37 +0800 Subject: net/smc: Print function name in smcr_link_down tracepoint This makes the output of smcr_link_down tracepoint easier to use and understand without additional translating function's pointer address. It prints the function name with offset: -0 [000] ..s. 69.087164: smcr_link_down: lnk=00000000dab41cdc lgr=000000007d5d8e24 state=0 rc=1 dev=mlx5_0 location=smc_wr_tx_tasklet_fn+0x5ef/0x6f0 [smc] Link: https://lore.kernel.org/netdev/11f17a34-fd35-f2ec-3f20-dd0c34e55fde@linux.ibm.com/ Signed-off-by: Tony Lu Reviewed-by: Wen Gu Signed-off-by: David S. Miller --- net/smc/smc_tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index b4c36795a928..ec17f29646f5 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -99,7 +99,7 @@ TRACE_EVENT(smcr_link_down, __entry->location = location; ), - TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p", + TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%pS", __entry->lnk, __entry->lgr, __entry->state, __get_str(name), __entry->location) -- cgit v1.2.3-59-g8ed1b From 70bf363d7adb3a428773bc905011d0ff923ba747 Mon Sep 17 00:00:00 2001 From: Nghia Le Date: Thu, 4 Nov 2021 21:37:40 +0700 Subject: ipv6: remove useless assignment to newinet in tcp_v6_syn_recv_sock() The newinet value is initialized with inet_sk() in a block code to handle sockets for the ETH_P_IP protocol. Along this code path, newinet is never read. Thus, assignment to newinet is needless and can be removed. 
Signed-off-by: Nghia Le Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211104143740.32446-1-nghialm78@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2cc9b0e53ad1..551fce49841d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1263,7 +1263,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); -- cgit v1.2.3-59-g8ed1b From c0f49d98006f2db3333b917caac65bce2af9865c Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:25 +0800 Subject: can: j1939: j1939_tp_cmd_recv(): ignore abort message in the BAM transport This patch prevents BAM transport from being closed by receiving abort message, as specified in SAE-J1939-82 2015 (A.3.3 Row 4). Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-2-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 6c0a0ebdd024..05eb3d059e17 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2085,6 +2085,12 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) break; case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ + if (j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } + if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_abort(priv, skb, true); -- cgit v1.2.3-59-g8ed1b From a79305e156db3d24fcd8eb649cdb3c3b2350e5c2 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:26 +0800 Subject: can: j1939: j1939_can_recv(): ignore messages with invalid source address According to SAE-J1939-82 2015 (A.3.6 Row 2), a receiver should never send TP.CM_CTS to the global address, so we can add a check in j1939_can_recv() to drop messages with invalid source address. 
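For readers less familiar with J1939 addressing, the sketch below shows where that source address comes from: it is the lowest byte of the 29-bit CAN ID, with the 18-bit PGN in the bytes above it, matching the shifts visible in the j1939_can_recv() context of the hunk below. This is an illustration of the standard ID layout only, not code from this patch; 0xFF is the broadcast ("global") address and is never a legal source.

#include <stdbool.h>
#include <stdint.h>

struct j1939_id_sketch {
	uint32_t pgn;
	uint8_t sa;	/* source address: lowest byte of the CAN ID */
	uint8_t da;	/* destination address, PDU1 format only */
};

static bool j1939_sa_is_valid_sketch(uint8_t sa)
{
	return sa != 0xff;	/* 0xFF = broadcast/global, invalid as a source */
}

static struct j1939_id_sketch j1939_decode_sketch(uint32_t can_id)
{
	struct j1939_id_sketch id = {
		.pgn = (can_id >> 8) & 0x3ffff,	/* 18-bit PGN field */
		.sa = can_id & 0xff,
		.da = 0xff,			/* default: broadcast */
	};

	/* PDU1 format (PF byte below 0xF0): the low PGN byte is really a
	 * destination address and is masked out of the PGN itself.
	 */
	if (((id.pgn >> 8) & 0xff) < 0xf0) {
		id.da = id.pgn & 0xff;
		id.pgn &= 0x3ff00;
	}
	return id;
}

Decoding the 18EC0102 frame quoted in the next patch this way gives PGN 0xEC00 (TP.CM) sent from source 0x02 to destination 0x01, which is exactly the kind of directed transport-protocol traffic these checks reason about.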
Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-3-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 9bc55ecb37f9..8452b0fbb78c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -75,6 +75,13 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; + + if (!j1939_address_is_valid(skcb->addr.sa)) { + netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", + __func__); + goto done; + } + if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; -- cgit v1.2.3-59-g8ed1b From 164051a6ab5445bd97f719f50b16db8b32174269 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Thu, 28 Oct 2021 22:38:27 +0800 Subject: can: j1939: j1939_tp_cmd_recv(): check the dst address of TP.CM_BAM The TP.CM_BAM message must be sent to the global address [1], so add a check to drop TP.CM_BAM sent to a non-global address. Without this patch, the receiver will treat the following packets as normal RTS/CTS transport: 18EC0102#20090002FF002301 18EB0102#0100000000000000 18EB0102#020000FFFFFFFFFF [1] SAE-J1939-82 2015 A.3.3 Row 1. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/all/1635431907-15617-4-git-send-email-zhangchangzhong@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 05eb3d059e17..a271688780a2 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2023,6 +2023,11 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: + if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) { + netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n", + __func__, skcb->addr.sa); + return; + } fallthrough; case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) -- cgit v1.2.3-59-g8ed1b From e7ea51cd879c8214a824717d28a169b5f2262c02 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 5 Nov 2021 20:30:27 +0300 Subject: sctp: remove unreachable code from sctp_sf_violation_chunk() sctp_sf_violation_chunk() is not called with asoc argument equal to NULL, but if that happens it would lead to NULL pointer dereference in sctp_vtag_verify(). The patch removes code that handles NULL asoc in sctp_sf_violation_chunk(). Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Alexey Khoroshilov Proposed-by: Xin Long Signed-off-by: David S. 
Miller --- net/sctp/sm_statefuns.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5fabaa54b77d..39ba82ee87ce 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4898,9 +4898,6 @@ static enum sctp_disposition sctp_sf_violation_chunk( { static const char err_str[] = "The following chunk violates protocol:"; - if (!asoc) - return sctp_sf_violation(net, ep, asoc, type, arg, commands); - return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, sizeof(err_str)); } -- cgit v1.2.3-59-g8ed1b From 40a34121ac1dc52ed9cd34a8f4e48e32517a52fd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:32 -0700 Subject: bpf, sockmap: Use stricter sk state checks in sk_lookup_assign In order to fix an issue with sockets in TCP sockmap redirect cases we plan to allow CLOSE state sockets to exist in the sockmap. However, the check in bpf_sk_lookup_assign() currently only invalidates sockets in the TCP_ESTABLISHED case relying on the checks on sockmap insert to ensure we never SOCK_CLOSE state sockets in the map. To prepare for this change we flip the logic in bpf_sk_lookup_assign() to explicitly test for the accepted cases. Namely, a tcp socket in TCP_LISTEN or a udp socket in TCP_CLOSE state. This also makes the code more resilent to future changes. Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-2-john.fastabend@gmail.com --- include/linux/skmsg.h | 12 ++++++++++++ net/core/filter.c | 6 ++++-- net/core/sock_map.c | 6 ------ 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index b4256847c707..584d94be9c8b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -507,6 +507,18 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +static inline bool sk_is_tcp(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) diff --git a/net/core/filter.c b/net/core/filter.c index 8e8d3b49c297..a68418268e92 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10423,8 +10423,10 @@ BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ - if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) - return -ESOCKTNOSUPPORT; /* reject connected sockets */ + if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) + return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ + if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) + return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e252b8ec2b85..f39ef79ced67 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -511,12 +511,6 @@ static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } -static bool sk_is_tcp(const struct sock *sk) -{ - return sk->sk_type == 
SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP; -} - static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) -- cgit v1.2.3-59-g8ed1b From b8b8315e39ffaca82e79d86dde26e9144addf66b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:33 -0700 Subject: bpf, sockmap: Remove unhash handler for BPF sockmap usage We do not need to handle unhash from BPF side we can simply wait for the close to happen. The original concern was a socket could transition from ESTABLISHED state to a new state while the BPF hook was still attached. But, we convinced ourself this is no longer possible and we also improved BPF sockmap to handle listen sockets so this is no longer a problem. More importantly though there are cases where unhash is called when data is in the receive queue. The BPF unhash logic will flush this data which is wrong. To be correct it should keep the data in the receive queue and allow a receiving application to continue reading the data. This may happen when tcp_abort() is received for example. Instead of complicating the logic in unhash simply moving all this to tcp_close() hook solves this. Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-3-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5f4d6f45d87f..246f725b78c9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -475,7 +475,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; -- cgit v1.2.3-59-g8ed1b From c5d2177a72a1659554922728fc407f59950aa929 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:34 -0700 Subject: bpf, sockmap: Fix race in ingress receive verdict with redirect to self A socket in a sockmap may have different combinations of programs attached depending on configuration. There can be no programs in which case the socket acts as a sink only. There can be a TX program in this case a BPF program is attached to sending side, but no RX program is attached. There can be an RX program only where sends have no BPF program attached, but receives are hooked with BPF. And finally, both TX and RX programs may be attached. Giving us the permutations: None, Tx, Rx, and TxRx To date most of our use cases have been TX case being used as a fast datapath to directly copy between local application and a userspace proxy. Or Rx cases and TxRX applications that are operating an in kernel based proxy. The traffic in the first case where we hook applications into a userspace application looks like this: AppA redirect AppB Tx <-----------> Rx | | + + TCP <--> lo <--> TCP In this case all traffic from AppA (after 3whs) is copied into the AppB ingress queue and no traffic is ever on the TCP recieive_queue. In the second case the application never receives, except in some rare error cases, traffic on the actual user space socket. Instead the send happens in the kernel. 
AppProxy socket pool sk0 ------------->{sk1,sk2, skn} ^ | | | | v ingress lb egress TCP TCP Here because traffic is never read off the socket with userspace recv() APIs there is only ever one reader on the sk receive_queue. Namely the BPF programs. However, we've started to introduce a third configuration where the BPF program on receive should process the data, but then the normal case is to push the data into the receive queue of AppB. AppB recv() (userspace) ----------------------- tcp_bpf_recvmsg() (kernel) | | | | | | ingress_msgQ | | | RX_BPF | | | v v sk->receive_queue This is different from the App{A,B} redirect because traffic is first received on the sk->receive_queue. Now for the issue. The tcp_bpf_recvmsg() handler first checks the ingress_msg queue for any data handled by the BPF rx program and returned with PASS code so that it was enqueued on the ingress msg queue. Then if no data exists on that queue it checks the socket receive queue. Unfortunately, this is the same receive_queue the BPF program is reading data off of. So we get a race. Its possible for the recvmsg() hook to pull data off the receive_queue before the BPF hook has a chance to read it. It typically happens when an application is banging on recv() and getting EAGAINs. Until they manage to race with the RX BPF program. To fix this we note that before this patch at attach time when the socket is loaded into the map we check if it needs a TX program or just the base set of proto bpf hooks. Then it uses the above general RX hook regardless of if we have a BPF program attached at rx or not. This patch now extends this check to handle all cases enumerated above, TX, RX, TXRX, and none. And to fix above race when an RX program is attached we use a new hook that is nearly identical to the old one except now we do not let the recv() call skip the RX BPF program. Now only the BPF program pulls data from sk->receive_queue and recv() only pulls data from the ingress msgQ post BPF program handling. With this resolved our AppB from above has been up and running for many hours without detecting any errors. We do this by correlating counters in RX BPF events and the AppB to ensure data is never skipping the BPF program. Selftests, was not able to detect this because we only run them for a short period of time on well ordered send/recvs so we don't get any of the noise we see in real application environments. 
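For reference, the RX-only configuration described above is roughly what the following user-space setup produces. This is a sketch only: it assumes libbpf with a loaded skeleton named skel, a SOCKMAP named sock_map, an sk_skb verdict program named prog_verdict and a connected TCP socket client_fd; none of those names come from this patch.

	__u32 key = 0;
	__u64 value = client_fd;	/* connected TCP socket, map value is its fd */
	int map_fd = bpf_map__fd(skel->maps.sock_map);
	int prog_fd = bpf_program__fd(skel->progs.prog_verdict);
	int err;

	/* Attach a verdict program on RX but no msg_parser program on TX, so
	 * tcp_bpf_update_proto() in the hunk below picks TCP_BPF_RX and
	 * installs tcp_bpf_recvmsg_parser() as the socket's recvmsg handler.
	 */
	err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
	if (!err)
		err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);

With that in place the application's recv() only ever sees data that the verdict program explicitly queued to the ingress message queue, which is the invariant this patch restores.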
Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-4-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 246f725b78c9..f70aa0932bd6 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -172,6 +172,41 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } +static int tcp_bpf_recvmsg_parser(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct sk_psock *psock; + int copied; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + long timeo; + int data; + + timeo = sock_rcvtimeo(sk, nonblock); + data = tcp_msg_wait_data(sk, psock, timeo); + if (data && !sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + copied = -EAGAIN; + } + release_sock(sk); + sk_psock_put(sk, psock); + return copied; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -464,6 +499,8 @@ enum { enum { TCP_BPF_BASE, TCP_BPF_TX, + TCP_BPF_RX, + TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; @@ -482,6 +519,12 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; + + prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; + prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; + + prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; + prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -519,6 +562,10 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (psock->progs.stream_verdict || psock->progs.skb_verdict) { + config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; + } + if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, -- cgit v1.2.3-59-g8ed1b From e0dc3b93bd7bcff8c3813d1df43e0908499c7cf0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:35 -0700 Subject: bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding Strparser is reusing the qdisc_skb_cb struct to stash the skb message handling progress, e.g. offset and length of the skb. First this is poorly named and inherits a struct from qdisc that doesn't reflect the actual usage of cb[] at this layer. But, more importantly strparser is using the following to access its metadata. (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)) Where _strp_msg is defined as: struct _strp_msg { struct strp_msg strp; /* 0 8 */ int accum_len; /* 8 4 */ /* size: 12, cachelines: 1, members: 2 */ /* last cacheline: 12 bytes */ }; So we use 12 bytes of ->data[] in struct. However in BPF code running parser and verdict the user has read capabilities into the data[] array as well. 
Its not too problematic, but we should not be exposing internal state to BPF program. If its really needed then we can use the probe_read() APIs which allow reading kernel memory. And I don't believe cb[] layer poses any API breakage by moving this around because programs can't depend on cb[] across layers. In order to fix another issue with a ctx rewrite we need to stash a temp variable somewhere. To make this work cleanly this patch builds a cb struct for sk_skb types called sk_skb_cb struct. Then we can use this consistently in the strparser, sockmap space. Additionally we can start allowing ->cb[] write access after this. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Jussi Maki Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-5-john.fastabend@gmail.com --- include/net/strparser.h | 16 +++++++++++++++- net/core/filter.c | 22 ++++++++++++++++++++++ net/strparser/strparser.c | 10 +--------- 3 files changed, 38 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/strparser.h b/include/net/strparser.h index 1d20b98493a1..bec1439bd3be 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -54,10 +54,24 @@ struct strp_msg { int offset; }; +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing + * to upper layer. + */ + struct strp_msg strp; + int accum_len; +}; + +struct sk_skb_cb { +#define SK_SKB_CB_PRIV_LEN 20 + unsigned char data[SK_SKB_CB_PRIV_LEN]; + struct _strp_msg strp; +}; + static inline struct strp_msg *strp_msg(struct sk_buff *skb) { return (struct strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Structure for an attached lower socket */ diff --git a/net/core/filter.c b/net/core/filter.c index a68418268e92..c3936d0724b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9782,11 +9782,33 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; + case offsetof(struct __sk_buff, cb[0]) ... + offsetofend(struct __sk_buff, cb[4]) - 1: + BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, data)) % + sizeof(__u64)); + + prog->cb_access = 1; + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct sk_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + else + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); + break; + + default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 9c0343568d2a..1a72c67afed5 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -27,18 +27,10 @@ static struct workqueue_struct *strp_wq; -struct _strp_msg { - /* Internal cb structure. struct strp_msg must be first for passing - * to upper layer. 
- */ - struct strp_msg strp; - int accum_len; -}; - static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { return (struct _strp_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); + offsetof(struct sk_skb_cb, strp)); } /* Lower lock held */ -- cgit v1.2.3-59-g8ed1b From b2c4618162ec615a15883a804cce7e27afecfa58 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Wed, 3 Nov 2021 13:47:36 -0700 Subject: bpf, sockmap: sk_skb data_end access incorrect when src_reg = dst_reg The current conversion of skb->data_end reads like this: ; data_end = (void*)(long)skb->data_end; 559: (79) r1 = *(u64 *)(r2 +200) ; r1 = skb->data 560: (61) r11 = *(u32 *)(r2 +112) ; r11 = skb->len 561: (0f) r1 += r11 562: (61) r11 = *(u32 *)(r2 +116) 563: (1f) r1 -= r11 But similar to the case in 84f44df664e9 ("bpf: sock_ops sk access may stomp registers when dst_reg = src_reg"), the code will read an incorrect skb->len when src == dst. In this case we end up generating this xlated code: ; data_end = (void*)(long)skb->data_end; 559: (79) r1 = *(u64 *)(r1 +200) ; r1 = skb->data 560: (61) r11 = *(u32 *)(r1 +112) ; r11 = (skb->data)->len 561: (0f) r1 += r11 562: (61) r11 = *(u32 *)(r1 +116) 563: (1f) r1 -= r11 ... where line 560 is the reading 4B of (skb->data + 112) instead of the intended skb->len Here the skb pointer in r1 gets set to skb->data and the later deref for skb->len ends up following skb->data instead of skb. This fixes the issue similarly to the patch mentioned above by creating an additional temporary variable and using to store the register when dst_reg = src_reg. We name the variable bpf_temp_reg and place it in the cb context for sk_skb. Then we restore from the temp to ensure nothing is lost. Fixes: 16137b09a66f2 ("bpf: Compute data_end dynamically with JIT code") Signed-off-by: Jussi Maki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-6-john.fastabend@gmail.com --- include/net/strparser.h | 4 ++++ net/core/filter.c | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/strparser.h b/include/net/strparser.h index bec1439bd3be..732b7097d78e 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -66,6 +66,10 @@ struct sk_skb_cb { #define SK_SKB_CB_PRIV_LEN 20 unsigned char data[SK_SKB_CB_PRIV_LEN]; struct _strp_msg strp; + /* temp_reg is a temporary register used for bpf_convert_data_end_access + * when dst_reg == src_reg. + */ + u64 temp_reg; }; static inline struct strp_msg *strp_msg(struct sk_buff *skb) diff --git a/net/core/filter.c b/net/core/filter.c index c3936d0724b8..e471c9b09670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9756,22 +9756,46 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { - /* si->dst_reg = skb->data */ + int reg; + int temp_reg_off = offsetof(struct sk_buff, cb) + + offsetof(struct sk_skb_cb, temp_reg); + + if (si->src_reg == si->dst_reg) { + /* We need an extra register, choose and save a register. 
*/ + reg = BPF_REG_9; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + if (si->src_reg == reg || si->dst_reg == reg) + reg--; + *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); + } else { + reg = si->dst_reg; + } + + /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - si->dst_reg, si->src_reg, + reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); - /* si->dst_reg = skb->data + skb->len */ - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); - /* si->dst_reg = skb->data + skb->len - skb->data_len */ - *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + /* reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); + + if (si->src_reg == si->dst_reg) { + /* Restore the saved register */ + *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, reg); + *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); + } return insn; } -- cgit v1.2.3-59-g8ed1b From 6dc25401cba4d428328eade8ceae717633fdd702 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Nov 2021 10:08:15 -0800 Subject: net/sched: sch_taprio: fix undefined behavior in ktime_mono_to_any 1) if q->tk_offset == TK_OFFS_MAX, then get_tcp_tstamp() calls ktime_mono_to_any() with out-of-bound value. 2) if q->tk_offset is changed in taprio_parse_clockid(), taprio_get_time() might also call ktime_mono_to_any() with out-of-bound value as sysbot found: UBSAN: array-index-out-of-bounds in kernel/time/timekeeping.c:908:27 index 3 is out of range for type 'ktime_t *[3]' CPU: 1 PID: 25668 Comm: kworker/u4:0 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: bat_events batadv_iv_send_outstanding_bat_ogm_packet Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151 __ubsan_handle_out_of_bounds.cold+0x62/0x6c lib/ubsan.c:291 ktime_mono_to_any+0x1d4/0x1e0 kernel/time/timekeeping.c:908 get_tcp_tstamp net/sched/sch_taprio.c:322 [inline] get_packet_txtime net/sched/sch_taprio.c:353 [inline] taprio_enqueue_one+0x5b0/0x1460 net/sched/sch_taprio.c:420 taprio_enqueue+0x3b1/0x730 net/sched/sch_taprio.c:485 dev_qdisc_enqueue+0x40/0x300 net/core/dev.c:3785 __dev_xmit_skb net/core/dev.c:3869 [inline] __dev_queue_xmit+0x1f6e/0x3630 net/core/dev.c:4194 batadv_send_skb_packet+0x4a9/0x5f0 net/batman-adv/send.c:108 batadv_iv_ogm_send_to_if net/batman-adv/bat_iv_ogm.c:393 [inline] batadv_iv_ogm_emit net/batman-adv/bat_iv_ogm.c:421 [inline] batadv_iv_send_outstanding_bat_ogm_packet+0x6d7/0x8e0 net/batman-adv/bat_iv_ogm.c:1701 process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298 worker_thread+0x658/0x11f0 kernel/workqueue.c:2445 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Fixes: 7ede7b03484b ("taprio: make clock reference conversions easier") Fixes: 54002066100b ("taprio: Adjust timestamps for TCP packets") Signed-off-by: Eric Dumazet Cc: Vedang Patel Reported-by: syzbot Reviewed-by: Vinicius Costa Gomes Link: 
https://lore.kernel.org/r/20211108180815.1822479-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9ab068fa2672..377f896bdedc 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -95,18 +95,22 @@ static ktime_t sched_base_time(const struct sched_gate_list *sched) return ns_to_ktime(sched->base_time); } -static ktime_t taprio_get_time(struct taprio_sched *q) +static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { - ktime_t mono = ktime_get(); + /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ + enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); - switch (q->tk_offset) { + switch (tk_offset) { case TK_OFFS_MAX: return mono; default: - return ktime_mono_to_any(mono, q->tk_offset); + return ktime_mono_to_any(mono, tk_offset); } +} - return KTIME_MAX; +static ktime_t taprio_get_time(const struct taprio_sched *q) +{ + return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) @@ -319,7 +323,7 @@ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) return 0; } - return ktime_mono_to_any(skb->skb_mstamp_ns, q->tk_offset); + return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from @@ -1352,6 +1356,7 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. @@ -1366,22 +1371,24 @@ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, switch (clockid) { case CLOCK_REALTIME: - q->tk_offset = TK_OFFS_REAL; + tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: - q->tk_offset = TK_OFFS_MAX; + tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: - q->tk_offset = TK_OFFS_BOOT; + tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: - q->tk_offset = TK_OFFS_TAI; + tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } + /* This pairs with READ_ONCE() in taprio_mono_to_any */ + WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { -- cgit v1.2.3-59-g8ed1b From c7cd82b90599fa10915f41e3dd9098a77d0aa7b6 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Tue, 9 Nov 2021 00:15:02 +0000 Subject: vsock: prevent unnecessary refcnt inc for nonblocking connect Currently vosck_connect() increments sock refcount for nonblocking socket each time it's called, which can lead to memory leak if it's called multiple times because connect timeout function decrements sock refcount only once. Fixes it by making vsock_connect() return -EALREADY immediately when sock state is already SS_CONNECTING. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reviewed-by: Stefano Garzarella Signed-off-by: Eiichi Tsukata Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7d851eb3a683..ed0df839c38c 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1322,6 +1322,8 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * non-blocking call. 
*/ err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || -- cgit v1.2.3-59-g8ed1b From e5d5aadcf3cd59949316df49c27cb21788d7efe4 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Wed, 10 Nov 2021 15:02:34 +0800 Subject: net/smc: fix sk_refcnt underflow on linkdown and fallback We got the following WARNING when running ab/nginx test with RDMA link flapping (up-down-up). The reason is when smc_sock fallback and at linkdown happens simultaneously, we may got the following situation: __smc_lgr_terminate() --> smc_conn_kill() --> smc_close_active_abort() smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) smc_sock was set to SMC_CLOSED and sock_put() been called when terminate the link group. But later application call close() on the socket, then we got: __smc_release(): if (smc_sock->fallback) smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) Again we set the smc_sock to CLOSED through it's already in CLOSED state, and double put the refcnt, so the following warning happens: refcount_t: underflow; use-after-free. WARNING: CPU: 5 PID: 860 at lib/refcount.c:28 refcount_warn_saturate+0x8d/0xf0 Modules linked in: CPU: 5 PID: 860 Comm: nginx Not tainted 5.10.46+ #403 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/01/2014 RIP: 0010:refcount_warn_saturate+0x8d/0xf0 Code: 05 5c 1e b5 01 01 e8 52 25 bc ff 0f 0b c3 80 3d 4f 1e b5 01 00 75 ad 48 RSP: 0018:ffffc90000527e50 EFLAGS: 00010286 RAX: 0000000000000026 RBX: ffff8881300df2c0 RCX: 0000000000000027 RDX: 0000000000000000 RSI: ffff88813bd58040 RDI: ffff88813bd58048 RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000001 R10: ffff8881300df2c0 R11: ffffc90000527c78 R12: ffff8881300df340 R13: ffff8881300df930 R14: ffff88810b3dad80 R15: ffff8881300df4f8 FS: 00007f739de8fb80(0000) GS:ffff88813bd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000a01b008 CR3: 0000000111b64003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: smc_release+0x353/0x3f0 __sock_release+0x3d/0xb0 sock_close+0x11/0x20 __fput+0x93/0x230 task_work_run+0x65/0xa0 exit_to_user_mode_prepare+0xf9/0x100 syscall_exit_to_user_mode+0x27/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch adds check in __smc_release() to make sure we won't do an extra sock_put() and set the socket to CLOSED when its already in CLOSED state. Fixes: 51f1de79ad8e (net/smc: replace sock_put worker by socket refcounting) Signed-off-by: Dust Li Reviewed-by: Tony Lu Signed-off-by: Dust Li Acked-by: Karsten Graul Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0cf7ed2f5d41..59284da9116d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -149,14 +149,18 @@ static int __smc_release(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { - if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) - sock_put(sk); /* passive closing */ - if (sk->sk_state == SMC_LISTEN) { - /* wake up clcsock accept */ - rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + if (sk->sk_state != SMC_CLOSED) { + if (sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ + if (sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + } + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); } - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); smc_restore_fallback_changes(smc); } -- cgit v1.2.3-59-g8ed1b From 0315a075f1343966ea2d9a085666a88a69ea6a3d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 10 Nov 2021 20:56:05 +0100 Subject: net: fix premature exit from NAPI state polling in napi_disable() Commit 719c57197010 ("net: make napi_disable() symmetric with enable") accidentally introduced a bug sometimes leading to a kernel BUG when bringing an iface up/down under heavy traffic load. Prior to this commit, napi_disable() was polling n->state until none of (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC) is set and then always flip them. Now there's a possibility to get away with the NAPIF_STATE_SCHE unset as 'continue' drops us to the cmpxchg() call with an uninitialized variable, rather than straight to another round of the state check. Error path looks like: napi_disable(): unsigned long val, new; /* new is uninitialized */ do { val = READ_ONCE(n->state); /* NAPIF_STATE_NPSVC and/or NAPIF_STATE_SCHED is set */ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { /* true */ usleep_range(20, 200); continue; /* go straight to the condition check */ } new = val | <...> } while (cmpxchg(&n->state, val, new) != val); /* state == val, cmpxchg() writes garbage */ napi_enable(): do { val = READ_ONCE(n->state); BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); /* 50/50 boom */ <...> while the typical BUG splat is like: [ 172.652461] ------------[ cut here ]------------ [ 172.652462] kernel BUG at net/core/dev.c:6937! [ 172.656914] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 172.661966] CPU: 36 PID: 2829 Comm: xdp_redirect_cp Tainted: G I 5.15.0 #42 [ 172.670222] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0014.082620210524 08/26/2021 [ 172.680646] RIP: 0010:napi_enable+0x5a/0xd0 [ 172.684832] Code: 07 49 81 cc 00 01 00 00 4c 89 e2 48 89 d8 80 e6 fb f0 48 0f b1 55 10 48 39 c3 74 10 48 8b 5d 10 f6 c7 04 75 3d f6 c3 01 75 b4 <0f> 0b 5b 5d 41 5c c3 65 ff 05 b8 e5 61 53 48 c7 c6 c0 f3 34 ad 48 [ 172.703578] RSP: 0018:ffffa3c9497477a8 EFLAGS: 00010246 [ 172.708803] RAX: ffffa3c96615a014 RBX: 0000000000000000 RCX: ffff8a4b575301a0 < snip > [ 172.782403] Call Trace: [ 172.784857] [ 172.786963] ice_up_complete+0x6f/0x210 [ice] [ 172.791349] ice_xdp+0x136/0x320 [ice] [ 172.795108] ? ice_change_mtu+0x180/0x180 [ice] [ 172.799648] dev_xdp_install+0x61/0xe0 [ 172.803401] dev_xdp_attach+0x1e0/0x550 [ 172.807240] dev_change_xdp_fd+0x1e6/0x220 [ 172.811338] do_setlink+0xee8/0x1010 [ 172.814917] rtnl_setlink+0xe5/0x170 [ 172.818499] ? 
bpf_lsm_binder_set_context_mgr+0x10/0x10 [ 172.823732] ? security_capable+0x36/0x50 < snip > Fix this by replacing 'do { } while (cmpxchg())' with an "infinite" for-loop with an explicit break. From v1 [0]: - just use a for-loop to simplify both the fix and the existing code (Eric). [0] https://lore.kernel.org/netdev/20211110191126.1214-1-alexandr.lobakin@intel.com Fixes: 719c57197010 ("net: make napi_disable() symmetric with enable") Suggested-by: Eric Dumazet # for-loop Signed-off-by: Alexander Lobakin Reviewed-by: Jesse Brandeburg Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211110195605.1304-1-alexandr.lobakin@intel.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index edeb811c454e..15ac064b5562 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6928,7 +6928,7 @@ void napi_disable(struct napi_struct *n) might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - do { + for ( ; ; ) { val = READ_ONCE(n->state); if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); @@ -6937,7 +6937,10 @@ void napi_disable(struct napi_struct *n) new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); - } while (cmpxchg(&n->state, val, new) != val); + + if (cmpxchg(&n->state, val, new) == val) + break; + } hrtimer_cancel(&n->timer); -- cgit v1.2.3-59-g8ed1b
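The underlying trap is a plain C control-flow detail: in a do/while loop, continue transfers control to the controlling expression, not to the top of the body, so the cmpxchg() in the condition was evaluated with an uninitialized new. A stand-alone user-space illustration of that semantics (nothing kernel-specific; the variable here is initialized so the demo stays well-defined, unlike new in the original loop):

#include <stdio.h>

static int check(int prepared)
{
	printf("condition evaluated with prepared=%d\n", prepared);
	return prepared < 4;
}

int main(void)
{
	int i = 0, prepared = 0;

	do {
		i++;
		if (i < 3)
			continue;	/* skips the line below... */
		prepared = i;		/* ...the step meant to feed check() */
	} while (check(prepared));	/* ...yet check() still runs, with a stale value */

	printf("left the loop at i=%d\n", i);
	return 0;
}

The first two iterations print "prepared=0" even though the body never reached the prepared = i line, which is exactly the shape of the bug; the fix above avoids it by using an explicit for (;;) loop in which cmpxchg() can only be reached after new has been computed.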