aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/nscommon.c
blob: bdc3c86231d38e0c46e6c9b118a83256bcb332d0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */

#include <linux/ns_common.h>
#include <linux/nstree.h>
#include <linux/proc_ns.h>
#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>

#ifdef CONFIG_DEBUG_VFS
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
	switch (ns->ns_type) {
#ifdef CONFIG_CGROUPS
	case CLONE_NEWCGROUP:
		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
		break;
#endif
#ifdef CONFIG_IPC_NS
	case CLONE_NEWIPC:
		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
		break;
#endif
	case CLONE_NEWNS:
		VFS_WARN_ON_ONCE(ops != &mntns_operations);
		break;
#ifdef CONFIG_NET_NS
	case CLONE_NEWNET:
		VFS_WARN_ON_ONCE(ops != &netns_operations);
		break;
#endif
#ifdef CONFIG_PID_NS
	case CLONE_NEWPID:
		VFS_WARN_ON_ONCE(ops != &pidns_operations);
		break;
#endif
#ifdef CONFIG_TIME_NS
	case CLONE_NEWTIME:
		VFS_WARN_ON_ONCE(ops != &timens_operations);
		break;
#endif
#ifdef CONFIG_USER_NS
	case CLONE_NEWUSER:
		VFS_WARN_ON_ONCE(ops != &userns_operations);
		break;
#endif
#ifdef CONFIG_UTS_NS
	case CLONE_NEWUTS:
		VFS_WARN_ON_ONCE(ops != &utsns_operations);
		break;
#endif
	}
}
#endif

/*
 * Initialize the common part shared by all namespace types.
 *
 * @ns:      namespace to initialize
 * @ns_type: type value stored in @ns->ns_type
 * @ops:     operations table stored in @ns->ops
 * @inum:    inode number to use; when 0 a number is requested from
 *           proc_alloc_inum() instead
 *
 * The regular reference count is set to 1. The active reference count
 * is set to 1 for initial namespaces (as decided by is_ns_init_inum())
 * and to 0 for every other namespace.
 *
 * Returns 0 on success or the error returned by proc_alloc_inum(). On
 * failure the active reference count has not been initialized.
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	/* Plain identity fields. */
	ns->ns_type = ns_type;
	ns->ns_id = 0;
	ns->ops = ops;
	ns->stashed = NULL;
	refcount_set(&ns->__ns_ref, 1);

	/* Tree linkage. */
	ns_tree_node_init(&ns->ns_tree_node);
	ns_tree_node_init(&ns->ns_unified_node);
	ns_tree_node_init(&ns->ns_owner_node);
	ns_tree_root_init(&ns->ns_owner_root);

#ifdef CONFIG_DEBUG_VFS
	/* Needs ns->ns_type to be set already. */
	ns_debug(ns, ops);
#endif

	if (inum) {
		/* Caller supplied a fixed inode number. */
		ns->inum = inum;
	} else {
		int err = proc_alloc_inum(&ns->inum);

		if (err)
			return err;
	}

	/*
	 * Only initial namespaces start out with an active reference;
	 * all others start at 0. Must run after ns->inum is set because
	 * is_ns_init_inum() is evaluated on the initialized namespace.
	 */
	atomic_set(&ns->__ns_ref_active, is_ns_init_inum(ns) ? 1 : 0);
	return 0;
}

/*
 * Release the inode number stored in @ns->inum by handing it to
 * proc_free_inum(). Nothing else in @ns is touched.
 *
 * NOTE(review): the number is passed on unconditionally; whether this
 * is also intended for namespaces that were initialized with a
 * caller-supplied inum is not visible from this file - confirm with
 * callers.
 */
void __ns_common_free(struct ns_common *ns)
{
	proc_free_inum(ns->inum);
}

/*
 * Look up the owning user namespace of @ns for active reference
 * propagation.
 *
 * Returns the owner as a struct ns_common, or NULL when:
 *  - @ns has no operations table,
 *  - the ->owner() callback reports no owner, or
 *  - the owner is init_user_ns, which never needs its active reference
 *    count adjusted.
 */
struct ns_common *__must_check ns_owner(struct ns_common *ns)
{
	const struct proc_ns_operations *ops = ns->ops;
	struct user_namespace *user_ns;

	if (unlikely(!ops))
		return NULL;

	VFS_WARN_ON_ONCE(!ops->owner);
	user_ns = ops->owner(ns);

	/* Only init_user_ns itself is expected to be ownerless. */
	VFS_WARN_ON_ONCE(!user_ns && ns != to_ns_common(&init_user_ns));

	/* No owner, or an owner that is always active: nothing to walk to. */
	if (!user_ns || user_ns == &init_user_ns)
		return NULL;

	return to_ns_common(user_ns);
}

/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * active count itself goes down.
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * The iteration stops once we reach a namespace that still has active
 * references.
 */
void __ns_ref_active_put(struct ns_common *ns)
{
	struct ns_common *owner;

	/* The active count of initial namespaces is never touched. */
	if (is_ns_init_id(ns))
		return;

	/* Other active references remain: no cascade needed. */
	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
		return;
	}

	/*
	 * That was the last active reference. The namespace must not be
	 * an initial one and __ns_ref_read() is expected to be non-zero.
	 */
	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));

	/*
	 * Give back the single active reference held on each owner in
	 * turn. Stop at the first owner that stays active, or when
	 * ns_owner() reports there is nothing further up to adjust.
	 */
	for (owner = ns_owner(ns); owner; owner = ns_owner(owner)) {
		VFS_WARN_ON_ONCE(is_ns_init_id(owner));

		/* This owner dropped to zero as well: keep climbing. */
		if (atomic_dec_and_test(&owner->__ns_ref_active))
			continue;

		VFS_WARN_ON_ONCE(__ns_ref_active_read(owner) < 0);
		break;
	}
}

/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * active count itself goes down. This makes it possible to efficiently
 * resurrect a namespace tree:
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Assume the whole tree is dead, i.e. no namespace holds an active
 * reference anymore, but all namespaces still exist:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   -   -
 *                        user_ns2 (0)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      -
 *                        user_ns1 (1)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   +   -
 *                        user_ns2 (1)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If net_ns had a zero reference count and we bumped it we also need to
 * take another reference on its owning user namespace. Similarly, if
 * pid_ns had a zero reference count it also needs to take another
 * reference on its owning user namespace. So both net_ns and pid_ns
 * will each have their own reference on the owning user namespace.
 *
 * If the owning user namespace user_ns1 had a zero reference count then
 * it also needs to take another reference on its owning user namespace
 * and so on.
 */
void __ns_ref_active_get(struct ns_common *ns)
{
	struct ns_common *owner;
	int old;

	/* The active count of initial namespaces is never touched. */
	if (is_ns_init_id(ns))
		return;

	/*
	 * Take the active reference. A non-zero previous value means the
	 * namespace was already active and its owners need no update.
	 */
	old = atomic_fetch_add(1, &ns->__ns_ref_active);
	VFS_WARN_ON_ONCE(old < 0);
	if (likely(old))
		return;

	/*
	 * The namespace went from 0 to 1, i.e. it was resurrected. Take
	 * one active reference on each owner in turn, walking upwards
	 * until we find an owner that was already active or ns_owner()
	 * reports there is nothing further up to adjust.
	 */
	for (owner = ns_owner(ns); owner; owner = ns_owner(owner)) {
		VFS_WARN_ON_ONCE(is_ns_init_id(owner));

		old = atomic_fetch_add(1, &owner->__ns_ref_active);
		VFS_WARN_ON_ONCE(old < 0);

		/* This owner was already active: the rest of the chain is too. */
		if (likely(old))
			break;
	}
}