From 04f08eb44b5011493d77b602fdec29ff0f5c6cd5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Sep 2021 17:00:29 -0700
Subject: net/af_unix: fix a data-race in unix_dgram_poll

syzbot reported another data-race in af_unix [1]

Lets change __skb_insert() to use WRITE_ONCE() when changing
skb head qlen.

Also, change unix_dgram_poll() to use lockless version
of unix_recvq_full()

It is verry possible we can switch all/most unix_recvq_full()
to the lockless version, this will be done in a future kernel version.

[1] HEAD commit: 8596e589b787732c8346f0482919e83cc9362db1

BUG: KCSAN: data-race in skb_queue_tail / unix_dgram_poll

write to 0xffff88814eeb24e0 of 4 bytes by task 25815 on cpu 0:
 __skb_insert include/linux/skbuff.h:1938 [inline]
 __skb_queue_before include/linux/skbuff.h:2043 [inline]
 __skb_queue_tail include/linux/skbuff.h:2076 [inline]
 skb_queue_tail+0x80/0xa0 net/core/skbuff.c:3264
 unix_dgram_sendmsg+0xff2/0x1600 net/unix/af_unix.c:1850
 sock_sendmsg_nosec net/socket.c:703 [inline]
 sock_sendmsg net/socket.c:723 [inline]
 ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392
 ___sys_sendmsg net/socket.c:2446 [inline]
 __sys_sendmmsg+0x315/0x4b0 net/socket.c:2532
 __do_sys_sendmmsg net/socket.c:2561 [inline]
 __se_sys_sendmmsg net/socket.c:2558 [inline]
 __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2558
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff88814eeb24e0 of 4 bytes by task 25834 on cpu 1:
 skb_queue_len include/linux/skbuff.h:1869 [inline]
 unix_recvq_full net/unix/af_unix.c:194 [inline]
 unix_dgram_poll+0x2bc/0x3e0 net/unix/af_unix.c:2777
 sock_poll+0x23e/0x260 net/socket.c:1288
 vfs_poll include/linux/poll.h:90 [inline]
 ep_item_poll fs/eventpoll.c:846 [inline]
 ep_send_events fs/eventpoll.c:1683 [inline]
 ep_poll fs/eventpoll.c:1798 [inline]
 do_epoll_wait+0x6ad/0xf00 fs/eventpoll.c:2226
 __do_sys_epoll_wait fs/eventpoll.c:2238 [inline]
 __se_sys_epoll_wait fs/eventpoll.c:2233 [inline]
 __x64_sys_epoll_wait+0xf6/0x120 fs/eventpoll.c:2233
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x0000001b -> 0x00000001

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 25834 Comm: syz-executor.1 Tainted: G        W         5.14.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: 86b18aaa2b5b ("skbuff: fix a data race in skb_queue_len()")
Cc: Qian Cai <cai@lca.pw>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6bdb0db3e825..841e2f0f5240 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1940,7 +1940,7 @@ static inline void __skb_insert(struct sk_buff *newsk,
 	WRITE_ONCE(newsk->prev, prev);
 	WRITE_ONCE(next->prev, newsk);
 	WRITE_ONCE(prev->next, newsk);
-	list->qlen++;
+	WRITE_ONCE(list->qlen, list->qlen + 1);
 }
 
 static inline void __skb_queue_splice(const struct sk_buff_head *list,
-- 
cgit v1.2.3-59-g8ed1b


From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 9 Sep 2021 08:49:59 -0700
Subject: bpf, mm: Fix lockdep warning triggered by
 stack_map_get_build_id_offset()

Currently the bpf selftest "get_stack_raw_tp" triggered the warning:

  [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0
  [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod]
  [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G        W  O      5.14.0+ #53
  [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0
  [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8
  [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246
  [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000
  [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e
  [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe
  [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0
  [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0
  [ 1411.304501] FS:  00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000
  [ 1411.304504] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0
  [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [ 1411.304512] Call Trace:
  [ 1411.304517]  stack_map_get_build_id_offset+0x17c/0x260
  [ 1411.304528]  __bpf_get_stack+0x18f/0x230
  [ 1411.304541]  bpf_get_stack_raw_tp+0x5a/0x70
  [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000
  [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e
  [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04
  [ 1411.306978]  bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c
  [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0
  [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108
  [ 1411.309074]  bpf_trace_run2+0x8f/0x1a0
  [ 1411.309891] FS:  00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000
  [ 1411.309896] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [ 1411.311221]  syscall_trace_enter.isra.20+0x161/0x1f0
  [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0
  [ 1411.312291]  do_syscall_64+0x15/0x80
  [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [ 1411.313803]  entry_SYSCALL_64_after_hwframe+0x44/0xae
  [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [ 1411.315082] RIP: 0033:0x7f00ad80a0e0
  [ 1411.315626] Call Trace:
  [ 1411.315632]  stack_map_get_build_id_offset+0x17c/0x260

To reproduce, first build `test_progs` binary:

  make -C tools/testing/selftests/bpf -j60

and then run the binary at tools/testing/selftests/bpf directory:

  ./test_progs -t get_stack_raw_tp

The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked()
annotations to find_vma*()") which added mmap_assert_locked() in find_vma()
function. The mmap_assert_locked() function asserts that mm->mmap_lock needs
to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid()
helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner()
instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case,
the above warning is emitted during test run.

This patch fixed the issue by (1). using mmap_read_trylock() instead of
mmap_read_trylock_non_owner() to satisfy lockdep checking in find_vma(), and
(2). droping lockdep for mmap_lock right before the irq_work_queue(). The
function mmap_read_trylock_non_owner() is also removed since after this
patch nobody calls it any more.

Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()")
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Luigi Rizzo <lrizzo@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com
---
 include/linux/mmap_lock.h |  9 ---------
 kernel/bpf/stackmap.c     | 10 ++++++++--
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 0540f0156f58..3af8f7fb067d 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -144,15 +144,6 @@ static inline void mmap_read_unlock(struct mm_struct *mm)
 	__mmap_lock_trace_released(mm, false);
 }
 
-static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
-{
-	if (mmap_read_trylock(mm)) {
-		rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
-		return true;
-	}
-	return false;
-}
-
 static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 {
 	up_read_non_owner(&mm->mmap_lock);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index e8eefdf8cf3e..09a3fd97d329 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	 * with build_id.
 	 */
 	if (!user || !current || !current->mm || irq_work_busy ||
-	    !mmap_read_trylock_non_owner(current->mm)) {
+	    !mmap_read_trylock(current->mm)) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	}
 
 	if (!work) {
-		mmap_read_unlock_non_owner(current->mm);
+		mmap_read_unlock(current->mm);
 	} else {
 		work->mm = current->mm;
+
+		/* The lock will be released once we're out of interrupt
+		 * context. Tell lockdep that we've released it now so
+		 * it doesn't complain that we forgot to release it.
+		 */
+		rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
 		irq_work_queue(&work->irq_work);
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 14 Sep 2021 01:07:57 +0200
Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode

Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used.
Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
embedded per-socket cgroup information into sock->sk_cgrp_data and in order
to save 8 bytes in struct sock made both mutually exclusive, that is, when
cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2
falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp).

The assumption made was "there is no reason to mix the two and this is in line
with how legacy and v2 compatibility is handled" as stated in bd1060a1d671.
However, with Kubernetes more widely supporting cgroups v2 as well nowadays,
this assumption no longer holds, and the possibility of the v1/v2 mixed mode
with the v2 root fallback being hit becomes a real security issue.

Many of the cgroup v2 BPF programs are also used for policy enforcement, just
to pick _one_ example, that is, to programmatically deny socket related system
calls like connect(2) or bind(2). A v2 root fallback would implicitly cause
a policy bypass for the affected Pods.

In production environments, we have recently seen this case due to various
circumstances: i) a different 3rd party agent and/or ii) a container runtime
such as [0] in the user's environment configuring legacy cgroup v1 net_cls
tags, which triggered implicitly mentioned root fallback. Another case is
Kubernetes projects like kind [1] which create Kubernetes nodes in a container
and also add cgroup namespaces to the mix, meaning programs which are attached
to the cgroup v2 root of the cgroup namespace get attached to a non-root
cgroup v2 path from init namespace point of view. And the latter's root is
out of reach for agents on a kind Kubernetes node to configure. Meaning, any
entity on the node setting cgroup v1 net_cls tag will trigger the bypass
despite cgroup v2 BPF programs attached to the namespace root.

Generally, this mutual exclusiveness does not hold anymore in today's user
environments and makes cgroup v2 usage from BPF side fragile and unreliable.
This fix adds proper struct cgroup pointer for the cgroup v2 case to struct
sock_cgroup_data in order to address these issues; this implicitly also fixes
the tradeoffs being made back then with regards to races and refcount leaks
as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF
programs always operate as expected.

  [0] https://github.com/nestybox/sysbox/
  [1] https://kind.sigs.k8s.io/

Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net
---
 include/linux/cgroup-defs.h  | 107 +++++++++++--------------------------------
 include/linux/cgroup.h       |  22 +--------
 kernel/cgroup/cgroup.c       |  50 ++++----------------
 net/core/netclassid_cgroup.c |   7 +--
 net/core/netprio_cgroup.c    |  10 +---
 5 files changed, 41 insertions(+), 155 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e1c705fdfa7c..db2e147e069f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
  * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains
  * per-socket cgroup information except for memcg association.
  *
- * On legacy hierarchies, net_prio and net_cls controllers directly set
- * attributes on each sock which can then be tested by the network layer.
- * On the default hierarchy, each sock is associated with the cgroup it was
- * created in and the networking layer can match the cgroup directly.
- *
- * To avoid carrying all three cgroup related fields separately in sock,
- * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer.
- * On boot, sock_cgroup_data records the cgroup that the sock was created
- * in so that cgroup2 matches can be made; however, once either net_prio or
- * net_cls starts being used, the area is overridden to carry prioidx and/or
- * classid.  The two modes are distinguished by whether the lowest bit is
- * set.  Clear bit indicates cgroup pointer while set bit prioidx and
- * classid.
- *
- * While userland may start using net_prio or net_cls at any time, once
- * either is used, cgroup2 matching no longer works.  There is no reason to
- * mix the two and this is in line with how legacy and v2 compatibility is
- * handled.  On mode switch, cgroup references which are already being
- * pointed to by socks may be leaked.  While this can be remedied by adding
- * synchronization around sock_cgroup_data, given that the number of leaked
- * cgroups is bound and highly unlikely to be high, this seems to be the
- * better trade-off.
+ * On legacy hierarchies, net_prio and net_cls controllers directly
+ * set attributes on each sock which can then be tested by the network
+ * layer. On the default hierarchy, each sock is associated with the
+ * cgroup it was created in and the networking layer can match the
+ * cgroup directly.
  */
 struct sock_cgroup_data {
-	union {
-#ifdef __LITTLE_ENDIAN
-		struct {
-			u8	is_data : 1;
-			u8	no_refcnt : 1;
-			u8	unused : 6;
-			u8	padding;
-			u16	prioidx;
-			u32	classid;
-		} __packed;
-#else
-		struct {
-			u32	classid;
-			u16	prioidx;
-			u8	padding;
-			u8	unused : 6;
-			u8	no_refcnt : 1;
-			u8	is_data : 1;
-		} __packed;
+	struct cgroup	*cgroup; /* v2 */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	u32		classid; /* v1 */
+#endif
+#ifdef CONFIG_CGROUP_NET_PRIO
+	u16		prioidx; /* v1 */
 #endif
-		u64		val;
-	};
 };
 
-/*
- * There's a theoretical window where the following accessors race with
- * updaters and return part of the previous pointer as the prioidx or
- * classid.  Such races are short-lived and the result isn't critical.
- */
 static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
 {
-	/* fallback to 1 which is always the ID of the root cgroup */
-	return (skcd->is_data & 1) ? skcd->prioidx : 1;
+#ifdef CONFIG_CGROUP_NET_PRIO
+	return READ_ONCE(skcd->prioidx);
+#else
+	return 1;
+#endif
 }
 
 static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
-	/* fallback to 0 which is the unconfigured default classid */
-	return (skcd->is_data & 1) ? skcd->classid : 0;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	return READ_ONCE(skcd->classid);
+#else
+	return 0;
+#endif
 }
 
-/*
- * If invoked concurrently, the updaters may clobber each other.  The
- * caller is responsible for synchronization.
- */
 static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 					   u16 prioidx)
 {
-	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
-	if (sock_cgroup_prioidx(&skcd_buf) == prioidx)
-		return;
-
-	if (!(skcd_buf.is_data & 1)) {
-		skcd_buf.val = 0;
-		skcd_buf.is_data = 1;
-	}
-
-	skcd_buf.prioidx = prioidx;
-	WRITE_ONCE(skcd->val, skcd_buf.val);	/* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_PRIO
+	WRITE_ONCE(skcd->prioidx, prioidx);
+#endif
 }
 
 static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
 					   u32 classid)
 {
-	struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }};
-
-	if (sock_cgroup_classid(&skcd_buf) == classid)
-		return;
-
-	if (!(skcd_buf.is_data & 1)) {
-		skcd_buf.val = 0;
-		skcd_buf.is_data = 1;
-	}
-
-	skcd_buf.classid = classid;
-	WRITE_ONCE(skcd->val, skcd_buf.val);	/* see sock_cgroup_ptr() */
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	WRITE_ONCE(skcd->classid, classid);
+#endif
 }
 
 #else	/* CONFIG_SOCK_CGROUP_DATA */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7bf60454a313..75c151413fda 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task,
  */
 #ifdef CONFIG_SOCK_CGROUP_DATA
 
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-extern spinlock_t cgroup_sk_update_lock;
-#endif
-
-void cgroup_sk_alloc_disable(void);
 void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
 void cgroup_sk_clone(struct sock_cgroup_data *skcd);
 void cgroup_sk_free(struct sock_cgroup_data *skcd);
 
 static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
 {
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-	unsigned long v;
-
-	/*
-	 * @skcd->val is 64bit but the following is safe on 32bit too as we
-	 * just need the lower ulong to be written and read atomically.
-	 */
-	v = READ_ONCE(skcd->val);
-
-	if (v & 3)
-		return &cgrp_dfl_root.cgrp;
-
-	return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp;
-#else
-	return (struct cgroup *)(unsigned long)skcd->val;
-#endif
+	return skcd->cgroup;
 }
 
 #else	/* CONFIG_CGROUP_DATA */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..8afa8690d288 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
  */
 #ifdef CONFIG_SOCK_CGROUP_DATA
 
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
-	if (cgroup_sk_alloc_disabled)
-		return;
-	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
-	cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled	false
-
-#endif
-
 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 {
-	if (cgroup_sk_alloc_disabled) {
-		skcd->no_refcnt = 1;
-		return;
-	}
-
 	/* Don't associate the sock with unrelated interrupted task's cgroup. */
 	if (in_interrupt())
 		return;
 
 	rcu_read_lock();
-
 	while (true) {
 		struct css_set *cset;
 
 		cset = task_css_set(current);
 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
-			skcd->val = (unsigned long)cset->dfl_cgrp;
+			skcd->cgroup = cset->dfl_cgrp;
 			cgroup_bpf_get(cset->dfl_cgrp);
 			break;
 		}
 		cpu_relax();
 	}
-
 	rcu_read_unlock();
 }
 
 void cgroup_sk_clone(struct sock_cgroup_data *skcd)
 {
-	if (skcd->val) {
-		if (skcd->no_refcnt)
-			return;
-		/*
-		 * We might be cloning a socket which is left in an empty
-		 * cgroup and the cgroup might have already been rmdir'd.
-		 * Don't use cgroup_get_live().
-		 */
-		cgroup_get(sock_cgroup_ptr(skcd));
-		cgroup_bpf_get(sock_cgroup_ptr(skcd));
-	}
+	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+	/*
+	 * We might be cloning a socket which is left in an empty
+	 * cgroup and the cgroup might have already been rmdir'd.
+	 * Don't use cgroup_get_live().
+	 */
+	cgroup_get(cgrp);
+	cgroup_bpf_get(cgrp);
 }
 
 void cgroup_sk_free(struct sock_cgroup_data *skcd)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
 
-	if (skcd->no_refcnt)
-		return;
 	cgroup_bpf_put(cgrp);
 	cgroup_put(cgrp);
 }
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b49c57d35a88..1a6a86693b74 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n)
 	struct update_classid_context *ctx = (void *)v;
 	struct socket *sock = sock_from_file(file);
 
-	if (sock) {
-		spin_lock(&cgroup_sk_update_lock);
+	if (sock)
 		sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
-		spin_unlock(&cgroup_sk_update_lock);
-	}
 	if (--ctx->batch == 0) {
 		ctx->batch = UPDATE_CLASSID_BATCH;
 		return n + 1;
@@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
 	struct css_task_iter it;
 	struct task_struct *p;
 
-	cgroup_sk_alloc_disable();
-
 	cs->classid = (u32)value;
 
 	css_task_iter_start(css, 0, &it);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
 	if (!dev)
 		return -ENODEV;
 
-	cgroup_sk_alloc_disable();
-
 	rtnl_lock();
 
 	ret = netprio_set_prio(of_css(of), dev, prio);
@@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
 static int update_netprio(const void *v, struct file *file, unsigned n)
 {
 	struct socket *sock = sock_from_file(file);
-	if (sock) {
-		spin_lock(&cgroup_sk_update_lock);
+
+	if (sock)
 		sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
 					(unsigned long)v);
-		spin_unlock(&cgroup_sk_update_lock);
-	}
 	return 0;
 }
 
@@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset)
 	struct task_struct *p;
 	struct cgroup_subsys_state *css;
 
-	cgroup_sk_alloc_disable();
-
 	cgroup_taskset_for_each(p, css, tset) {
 		void *v = (void *)(unsigned long)css->id;
 
-- 
cgit v1.2.3-59-g8ed1b