aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/tools/testing/selftests/cgroup
diff options
context:
space:
mode:
Diffstat (limited to 'tools/testing/selftests/cgroup')
-rw-r--r--tools/testing/selftests/cgroup/.gitignore10
-rw-r--r--tools/testing/selftests/cgroup/Makefile33
-rw-r--r--tools/testing/selftests/cgroup/config6
-rw-r--r--tools/testing/selftests/cgroup/lib/cgroup_util.c (renamed from tools/testing/selftests/cgroup/cgroup_util.c)267
-rw-r--r--tools/testing/selftests/cgroup/lib/include/cgroup_util.h (renamed from tools/testing/selftests/cgroup/cgroup_util.h)29
-rw-r--r--tools/testing/selftests/cgroup/lib/libcgroup.mk19
-rw-r--r--tools/testing/selftests/cgroup/memcg_protection.m89
-rw-r--r--tools/testing/selftests/cgroup/test_core.c172
-rw-r--r--tools/testing/selftests/cgroup/test_cpu.c801
-rw-r--r--tools/testing/selftests/cgroup/test_cpuset.c275
-rwxr-xr-xtools/testing/selftests/cgroup/test_cpuset_prs.sh1193
-rwxr-xr-xtools/testing/selftests/cgroup/test_cpuset_v1_base.sh77
-rwxr-xr-xtools/testing/selftests/cgroup/test_cpuset_v1_hp.sh46
-rw-r--r--tools/testing/selftests/cgroup/test_freezer.c61
-rw-r--r--tools/testing/selftests/cgroup/test_hugetlb_memcg.c234
-rw-r--r--tools/testing/selftests/cgroup/test_kill.c297
-rw-r--r--tools/testing/selftests/cgroup/test_kmem.c453
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c865
-rw-r--r--tools/testing/selftests/cgroup/test_pids.c178
-rwxr-xr-xtools/testing/selftests/cgroup/test_stress.sh2
-rw-r--r--tools/testing/selftests/cgroup/test_zswap.c635
-rw-r--r--tools/testing/selftests/cgroup/wait_inotify.c87
22 files changed, 5442 insertions, 387 deletions
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore
index aa6de65b0838..952e4448bf07 100644
--- a/tools/testing/selftests/cgroup/.gitignore
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -1,4 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-only
-test_memcontrol
test_core
+test_cpu
+test_cpuset
test_freezer
+test_hugetlb_memcg
+test_kill
+test_kmem
+test_memcontrol
+test_pids
+test_zswap
+wait_inotify
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index 967f268fde74..e01584c2189a 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -1,16 +1,35 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -Wall -pthread
-all:
+all: ${HELPER_PROGS}
TEST_FILES := with_stress.sh
-TEST_PROGS := test_stress.sh
-TEST_GEN_PROGS = test_memcontrol
-TEST_GEN_PROGS += test_core
+TEST_PROGS := test_stress.sh test_cpuset_prs.sh test_cpuset_v1_hp.sh
+TEST_GEN_FILES := wait_inotify
+# Keep the lists lexicographically sorted
+TEST_GEN_PROGS = test_core
+TEST_GEN_PROGS += test_cpu
+TEST_GEN_PROGS += test_cpuset
TEST_GEN_PROGS += test_freezer
+TEST_GEN_PROGS += test_hugetlb_memcg
+TEST_GEN_PROGS += test_kill
+TEST_GEN_PROGS += test_kmem
+TEST_GEN_PROGS += test_memcontrol
+TEST_GEN_PROGS += test_pids
+TEST_GEN_PROGS += test_zswap
+
+LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h
include ../lib.mk
+include lib/libcgroup.mk
-$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
-$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
-$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_core: $(LIBCGROUP_O)
+$(OUTPUT)/test_cpu: $(LIBCGROUP_O)
+$(OUTPUT)/test_cpuset: $(LIBCGROUP_O)
+$(OUTPUT)/test_freezer: $(LIBCGROUP_O)
+$(OUTPUT)/test_hugetlb_memcg: $(LIBCGROUP_O)
+$(OUTPUT)/test_kill: $(LIBCGROUP_O)
+$(OUTPUT)/test_kmem: $(LIBCGROUP_O)
+$(OUTPUT)/test_memcontrol: $(LIBCGROUP_O)
+$(OUTPUT)/test_pids: $(LIBCGROUP_O)
+$(OUTPUT)/test_zswap: $(LIBCGROUP_O)
diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config
new file mode 100644
index 000000000000..39f979690dd3
--- /dev/null
+++ b/tools/testing/selftests/cgroup/config
@@ -0,0 +1,6 @@
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_MEMCG=y
+CONFIG_PAGE_COUNTER=y
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c
index 8a637ca7d73a..8832f3d1cb61 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c
@@ -5,54 +5,51 @@
#include <errno.h>
#include <fcntl.h>
#include <linux/limits.h>
+#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/inotify.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include "cgroup_util.h"
-#include "../clone3/clone3_selftests.h"
+#include "../../clone3/clone3_selftests.h"
-static ssize_t read_text(const char *path, char *buf, size_t max_len)
+/* Returns read len on success, or -errno on failure. */
+ssize_t read_text(const char *path, char *buf, size_t max_len)
{
ssize_t len;
int fd;
fd = open(path, O_RDONLY);
if (fd < 0)
- return fd;
+ return -errno;
len = read(fd, buf, max_len - 1);
- if (len < 0)
- goto out;
- buf[len] = 0;
-out:
+ if (len >= 0)
+ buf[len] = 0;
+
close(fd);
- return len;
+ return len < 0 ? -errno : len;
}
-static ssize_t write_text(const char *path, char *buf, ssize_t len)
+/* Returns written len on success, or -errno on failure. */
+ssize_t write_text(const char *path, char *buf, ssize_t len)
{
int fd;
fd = open(path, O_WRONLY | O_APPEND);
if (fd < 0)
- return fd;
+ return -errno;
len = write(fd, buf, len);
- if (len < 0) {
- close(fd);
- return len;
- }
-
close(fd);
-
- return len;
+ return len < 0 ? -errno : len;
}
char *cg_name(const char *root, const char *name)
@@ -85,16 +82,16 @@ char *cg_control(const char *cgroup, const char *control)
return ret;
}
+/* Returns 0 on success, or -errno on failure. */
int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
{
char path[PATH_MAX];
+ ssize_t ret;
snprintf(path, sizeof(path), "%s/%s", cgroup, control);
- if (read_text(path, buf, len) >= 0)
- return 0;
-
- return -1;
+ ret = read_text(path, buf, len);
+ return ret >= 0 ? 0 : ret;
}
int cg_read_strcmp(const char *cgroup, const char *control,
@@ -106,7 +103,7 @@ int cg_read_strcmp(const char *cgroup, const char *control,
/* Handle the case of comparing against empty string */
if (!expected)
- size = 32;
+ return -1;
else
size = strlen(expected) + 1;
@@ -144,6 +141,16 @@ long cg_read_long(const char *cgroup, const char *control)
return atol(buf);
}
+long cg_read_long_fd(int fd)
+{
+ char buf[128];
+
+ if (pread(fd, buf, sizeof(buf), 0) <= 0)
+ return -1;
+
+ return atol(buf);
+}
+
long cg_read_key_long(const char *cgroup, const char *control, const char *key)
{
char buf[PAGE_SIZE];
@@ -175,23 +182,46 @@ long cg_read_lc(const char *cgroup, const char *control)
return cnt;
}
+/* Returns 0 on success, or -errno on failure. */
int cg_write(const char *cgroup, const char *control, char *buf)
{
char path[PATH_MAX];
- ssize_t len = strlen(buf);
+ ssize_t len = strlen(buf), ret;
snprintf(path, sizeof(path), "%s/%s", cgroup, control);
+ ret = write_text(path, buf, len);
+ return ret == len ? 0 : ret;
+}
- if (write_text(path, buf, len) == len)
- return 0;
+/*
+ * Returns fd on success, or -1 on failure.
+ * (fd should be closed with close() as usual)
+ */
+int cg_open(const char *cgroup, const char *control, int flags)
+{
+ char path[PATH_MAX];
- return -1;
+ snprintf(path, sizeof(path), "%s/%s", cgroup, control);
+ return open(path, flags);
+}
+
+int cg_write_numeric(const char *cgroup, const char *control, long value)
+{
+ char buf[64];
+ int ret;
+
+ ret = sprintf(buf, "%lu", value);
+ if (ret < 0)
+ return ret;
+
+ return cg_write(cgroup, control, buf);
}
-int cg_find_unified_root(char *root, size_t len)
+static int cg_find_root(char *root, size_t len, const char *controller,
+ bool *nsdelegate)
{
char buf[10 * PAGE_SIZE];
- char *fs, *mount, *type;
+ char *fs, *mount, *type, *options;
const char delim[] = "\n\t ";
if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
@@ -204,22 +234,43 @@ int cg_find_unified_root(char *root, size_t len)
for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
mount = strtok(NULL, delim);
type = strtok(NULL, delim);
+ options = strtok(NULL, delim);
strtok(NULL, delim);
strtok(NULL, delim);
- strtok(NULL, delim);
-
- if (strcmp(type, "cgroup2") == 0) {
- strncpy(root, mount, len);
- return 0;
+ if (strcmp(type, "cgroup") == 0) {
+ if (!controller || !strstr(options, controller))
+ continue;
+ } else if (strcmp(type, "cgroup2") == 0) {
+ if (controller &&
+ cg_read_strstr(mount, "cgroup.controllers", controller))
+ continue;
+ } else {
+ continue;
}
+ strncpy(root, mount, len);
+
+ if (nsdelegate)
+ *nsdelegate = !!strstr(options, "nsdelegate");
+ return 0;
+
}
return -1;
}
+int cg_find_controller_root(char *root, size_t len, const char *controller)
+{
+ return cg_find_root(root, len, controller, NULL);
+}
+
+int cg_find_unified_root(char *root, size_t len, bool *nsdelegate)
+{
+ return cg_find_root(root, len, NULL, nsdelegate);
+}
+
int cg_create(const char *cgroup)
{
- return mkdir(cgroup, 0644);
+ return mkdir(cgroup, 0755);
}
int cg_wait_for_proc_count(const char *cgroup, int count)
@@ -252,6 +303,10 @@ int cg_killall(const char *cgroup)
char buf[PAGE_SIZE];
char *ptr = buf;
+ /* If cgroup.kill exists use it. */
+ if (!cg_write(cgroup, "cgroup.kill", "1"))
+ return 0;
+
if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
return -1;
@@ -275,6 +330,8 @@ int cg_destroy(const char *cgroup)
{
int ret;
+ if (!cgroup)
+ return 0;
retry:
ret = rmdir(cgroup);
if (ret && errno == EBUSY) {
@@ -337,13 +394,13 @@ pid_t clone_into_cgroup(int cgroup_fd)
#ifdef CLONE_ARGS_SIZE_VER2
pid_t pid;
- struct clone_args args = {
+ struct __clone_args args = {
.flags = CLONE_INTO_CGROUP,
.exit_signal = SIGCHLD,
.cgroup = cgroup_fd,
};
- pid = sys_clone3(&args, sizeof(struct clone_args));
+ pid = sys_clone3(&args, sizeof(struct __clone_args));
/*
* Verify that this is a genuine test failure:
* ENOSYS -> clone3() not available
@@ -451,87 +508,22 @@ int cg_run_nowait(const char *cgroup,
return pid;
}
-int get_temp_fd(void)
+int proc_mount_contains(const char *option)
{
- return open(".", O_TMPFILE | O_RDWR | O_EXCL);
-}
-
-int alloc_pagecache(int fd, size_t size)
-{
- char buf[PAGE_SIZE];
- struct stat st;
- int i;
+ char buf[4 * PAGE_SIZE];
+ ssize_t read;
- if (fstat(fd, &st))
- goto cleanup;
+ read = read_text("/proc/mounts", buf, sizeof(buf));
+ if (read < 0)
+ return read;
- size += st.st_size;
-
- if (ftruncate(fd, size))
- goto cleanup;
-
- for (i = 0; i < size; i += sizeof(buf))
- read(fd, buf, sizeof(buf));
-
- return 0;
-
-cleanup:
- return -1;
-}
-
-int alloc_anon(const char *cgroup, void *arg)
-{
- size_t size = (unsigned long)arg;
- char *buf, *ptr;
-
- buf = malloc(size);
- for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
- *ptr = 0;
-
- free(buf);
- return 0;
-}
-
-int is_swap_enabled(void)
-{
- char buf[PAGE_SIZE];
- const char delim[] = "\n";
- int cnt = 0;
- char *line;
-
- if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
- return -1;
-
- for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
- cnt++;
-
- return cnt > 1;
-}
-
-int set_oom_adj_score(int pid, int score)
-{
- char path[PATH_MAX];
- int fd, len;
-
- sprintf(path, "/proc/%d/oom_score_adj", pid);
-
- fd = open(path, O_WRONLY | O_APPEND);
- if (fd < 0)
- return fd;
-
- len = dprintf(fd, "%d", score);
- if (len < 0) {
- close(fd);
- return len;
- }
-
- close(fd);
- return 0;
+ return strstr(buf, option) != NULL;
}
ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
{
char path[PATH_MAX];
+ ssize_t ret;
if (!pid)
snprintf(path, sizeof(path), "/proc/%s/%s",
@@ -539,7 +531,8 @@ ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t
else
snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);
- return read_text(path, buf, size);
+ ret = read_text(path, buf, size);
+ return ret < 0 ? -1 : ret;
}
int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
@@ -576,3 +569,57 @@ int clone_into_cgroup_run_wait(const char *cgroup)
(void)clone_reap(pid, WEXITED);
return 0;
}
+
+static int __prepare_for_wait(const char *cgroup, const char *filename)
+{
+ int fd, ret = -1;
+
+ fd = inotify_init1(0);
+ if (fd == -1)
+ return fd;
+
+ ret = inotify_add_watch(fd, cg_control(cgroup, filename), IN_MODIFY);
+ if (ret == -1) {
+ close(fd);
+ fd = -1;
+ }
+
+ return fd;
+}
+
+int cg_prepare_for_wait(const char *cgroup)
+{
+ return __prepare_for_wait(cgroup, "cgroup.events");
+}
+
+int memcg_prepare_for_wait(const char *cgroup)
+{
+ return __prepare_for_wait(cgroup, "memory.events");
+}
+
+int cg_wait_for(int fd)
+{
+ int ret = -1;
+ struct pollfd fds = {
+ .fd = fd,
+ .events = POLLIN,
+ };
+
+ while (true) {
+ ret = poll(&fds, 1, 10000);
+
+ if (ret == -1) {
+ if (errno == EINTR)
+ continue;
+
+ break;
+ }
+
+ if (ret > 0 && fds.revents & POLLIN) {
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
index 5a1305dd1f0b..adb2bc193183 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
@@ -2,21 +2,30 @@
#include <stdbool.h>
#include <stdlib.h>
+#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
-
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
#define MB(x) (x << 20)
+#define USEC_PER_SEC 1000000L
+#define NSEC_PER_SEC 1000000000L
+
+#define TEST_UID 65534 /* usually nobody, any !root is fine */
+
/*
* Checks if two given values differ by less than err% of their sum.
*/
static inline int values_close(long a, long b, int err)
{
- return abs(a - b) <= (a + b) / 100 * err;
+ return labs(a - b) <= (a + b) / 100 * err;
}
-extern int cg_find_unified_root(char *root, size_t len);
+extern ssize_t read_text(const char *path, char *buf, size_t max_len);
+extern ssize_t write_text(const char *path, char *buf, ssize_t len);
+
+extern int cg_find_controller_root(char *root, size_t len, const char *controller);
+extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate);
extern char *cg_name(const char *root, const char *name);
extern char *cg_name_indexed(const char *root, const char *name, int index);
extern char *cg_control(const char *cgroup, const char *control);
@@ -29,9 +38,12 @@ extern int cg_read_strcmp(const char *cgroup, const char *control,
extern int cg_read_strstr(const char *cgroup, const char *control,
const char *needle);
extern long cg_read_long(const char *cgroup, const char *control);
+extern long cg_read_long_fd(int fd);
long cg_read_key_long(const char *cgroup, const char *control, const char *key);
extern long cg_read_lc(const char *cgroup, const char *control);
extern int cg_write(const char *cgroup, const char *control, char *buf);
+extern int cg_open(const char *cgroup, const char *control, int flags);
+int cg_write_numeric(const char *cgroup, const char *control, long value);
extern int cg_run(const char *cgroup,
int (*fn)(const char *cgroup, void *arg),
void *arg);
@@ -41,16 +53,15 @@ extern int cg_enter_current_thread(const char *cgroup);
extern int cg_run_nowait(const char *cgroup,
int (*fn)(const char *cgroup, void *arg),
void *arg);
-extern int get_temp_fd(void);
-extern int alloc_pagecache(int fd, size_t size);
-extern int alloc_anon(const char *cgroup, void *arg);
-extern int is_swap_enabled(void);
-extern int set_oom_adj_score(int pid, int score);
extern int cg_wait_for_proc_count(const char *cgroup, int count);
extern int cg_killall(const char *cgroup);
+int proc_mount_contains(const char *option);
extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
extern pid_t clone_into_cgroup(int cgroup_fd);
extern int clone_reap(pid_t pid, int options);
extern int clone_into_cgroup_run_wait(const char *cgroup);
extern int dirfd_open_opath(const char *dir);
+extern int cg_prepare_for_wait(const char *cgroup);
+extern int memcg_prepare_for_wait(const char *cgroup);
+extern int cg_wait_for(int fd);
diff --git a/tools/testing/selftests/cgroup/lib/libcgroup.mk b/tools/testing/selftests/cgroup/lib/libcgroup.mk
new file mode 100644
index 000000000000..7a73007204c3
--- /dev/null
+++ b/tools/testing/selftests/cgroup/lib/libcgroup.mk
@@ -0,0 +1,19 @@
+CGROUP_DIR := $(selfdir)/cgroup
+
+LIBCGROUP_C := lib/cgroup_util.c
+
+LIBCGROUP_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBCGROUP_C))
+
+LIBCGROUP_O_DIRS := $(shell dirname $(LIBCGROUP_O) | uniq)
+
+CFLAGS += -I$(CGROUP_DIR)/lib/include
+
+EXTRA_HDRS := $(selfdir)/clone3/clone3_selftests.h
+
+$(LIBCGROUP_O_DIRS):
+ mkdir -p $@
+
+$(LIBCGROUP_O): $(OUTPUT)/%.o : $(CGROUP_DIR)/%.c $(EXTRA_HDRS) $(LIBCGROUP_O_DIRS)
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+EXTRA_CLEAN += $(LIBCGROUP_O)
diff --git a/tools/testing/selftests/cgroup/memcg_protection.m b/tools/testing/selftests/cgroup/memcg_protection.m
new file mode 100644
index 000000000000..051daa3477b6
--- /dev/null
+++ b/tools/testing/selftests/cgroup/memcg_protection.m
@@ -0,0 +1,89 @@
+% SPDX-License-Identifier: GPL-2.0
+%
+% run as: octave-cli memcg_protection.m
+%
+% This script simulates reclaim protection behavior on a single level of memcg
+% hierarchy to illustrate how overcommitted protection spreads among siblings
+% (as it depends also on their current consumption).
+%
+% Simulation assumes siblings consumed the initial amount of memory (w/out
+% reclaim) and then the reclaim starts, all memory is reclaimable, i.e. treated
+% same. It simulates only non-low reclaim and assumes all memory.min = 0.
+%
+% Input configurations
+% --------------------
+% E number parent effective protection
+% n vector nominal protection of siblings set at the given level (memory.low)
+% c vector current consumption -,,- (memory.current)
+
+% example from testcase (values in GB)
+E = 50 / 1024;
+n = [75 25 0 500 ] / 1024;
+c = [50 50 50 0] / 1024;
+
+% Reclaim parameters
+% ------------------
+
+% Minimal reclaim amount (GB)
+cluster = 32*4 / 2**20;
+
+% Reclaim coefficient (think as 0.5^sc->priority)
+alpha = .1
+
+% Simulation parameters
+% ---------------------
+epsilon = 1e-7;
+timeout = 1000;
+
+% Simulation loop
+% ---------------
+
+ch = [];
+eh = [];
+rh = [];
+
+for t = 1:timeout
+ % low_usage
+ u = min(c, n);
+ siblings = sum(u);
+
+ % effective_protection()
+ protected = min(n, c); % start with nominal
+ e = protected * min(1, E / siblings); % normalize overcommit
+
+ % recursive protection
+ unclaimed = max(0, E - siblings);
+ parent_overuse = sum(c) - siblings;
+ if (unclaimed > 0 && parent_overuse > 0)
+ overuse = max(0, c - protected);
+ e += unclaimed * (overuse / parent_overuse);
+ endif
+
+ % get_scan_count()
+ r = alpha * c; % assume all memory is in a single LRU list
+
+ % commit 1bc63fb1272b ("mm, memcg: make scan aggression always exclude protection")
+ sz = max(e, c);
+ r .*= (1 - (e+epsilon) ./ (sz+epsilon));
+
+ % uncomment to debug prints
+ % e, c, r
+
+ % nothing to reclaim, reached equilibrium
+ if max(r) < epsilon
+ break;
+ endif
+
+ % SWAP_CLUSTER_MAX roundup
+ r = max(r, (r > epsilon) .* cluster);
+ % XXX here I do parallel reclaim of all siblings
+ % in reality reclaim is serialized and each sibling recalculates own residual
+ c = max(c - r, 0);
+
+ ch = [ch ; c];
+ eh = [eh ; e];
+ rh = [rh ; r];
+endfor
+
+t
+c, e
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index 3df648c37876..a5672a91d273 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -1,11 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
#include <linux/limits.h>
+#include <linux/sched.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <fcntl.h>
+#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <signal.h>
@@ -15,6 +18,8 @@
#include "../kselftest.h"
#include "cgroup_util.h"
+static bool nsdelegate;
+
static int touch_anon(char *buf, size_t size)
{
int fd;
@@ -674,6 +679,169 @@ cleanup:
return ret;
}
+/*
+ * cgroup migration permission check should be performed based on the
+ * credentials at the time of open instead of write.
+ */
+static int test_cgcore_lesser_euid_open(const char *root)
+{
+ const uid_t test_euid = TEST_UID;
+ int ret = KSFT_FAIL;
+ char *cg_test_a = NULL, *cg_test_b = NULL;
+ char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+ int cg_test_b_procs_fd = -1;
+ uid_t saved_uid;
+
+ cg_test_a = cg_name(root, "cg_test_a");
+ cg_test_b = cg_name(root, "cg_test_b");
+
+ if (!cg_test_a || !cg_test_b)
+ goto cleanup;
+
+ cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+ cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+ if (!cg_test_a_procs || !cg_test_b_procs)
+ goto cleanup;
+
+ if (cg_create(cg_test_a) || cg_create(cg_test_b))
+ goto cleanup;
+
+ if (cg_enter_current(cg_test_a))
+ goto cleanup;
+
+ if (chown(cg_test_a_procs, test_euid, -1) ||
+ chown(cg_test_b_procs, test_euid, -1))
+ goto cleanup;
+
+ saved_uid = geteuid();
+ if (seteuid(test_euid))
+ goto cleanup;
+
+ cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR);
+
+ if (seteuid(saved_uid))
+ goto cleanup;
+
+ if (cg_test_b_procs_fd < 0)
+ goto cleanup;
+
+ if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_enter_current(root);
+ if (cg_test_b_procs_fd >= 0)
+ close(cg_test_b_procs_fd);
+ if (cg_test_b)
+ cg_destroy(cg_test_b);
+ if (cg_test_a)
+ cg_destroy(cg_test_a);
+ free(cg_test_b_procs);
+ free(cg_test_a_procs);
+ free(cg_test_b);
+ free(cg_test_a);
+ return ret;
+}
+
+struct lesser_ns_open_thread_arg {
+ const char *path;
+ int fd;
+ int err;
+};
+
+static int lesser_ns_open_thread_fn(void *arg)
+{
+ struct lesser_ns_open_thread_arg *targ = arg;
+
+ targ->fd = open(targ->path, O_RDWR);
+ targ->err = errno;
+ return 0;
+}
+
+/*
+ * cgroup migration permission check should be performed based on the cgroup
+ * namespace at the time of open instead of write.
+ */
+static int test_cgcore_lesser_ns_open(const char *root)
+{
+ static char stack[65536];
+ const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
+ int ret = KSFT_FAIL;
+ char *cg_test_a = NULL, *cg_test_b = NULL;
+ char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+ int cg_test_b_procs_fd = -1;
+ struct lesser_ns_open_thread_arg targ = { .fd = -1 };
+ pid_t pid;
+ int status;
+
+ if (!nsdelegate)
+ return KSFT_SKIP;
+
+ cg_test_a = cg_name(root, "cg_test_a");
+ cg_test_b = cg_name(root, "cg_test_b");
+
+ if (!cg_test_a || !cg_test_b)
+ goto cleanup;
+
+ cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+ cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+
+ if (!cg_test_a_procs || !cg_test_b_procs)
+ goto cleanup;
+
+ if (cg_create(cg_test_a) || cg_create(cg_test_b))
+ goto cleanup;
+
+ if (cg_enter_current(cg_test_b))
+ goto cleanup;
+
+ if (chown(cg_test_a_procs, test_euid, -1) ||
+ chown(cg_test_b_procs, test_euid, -1))
+ goto cleanup;
+
+ targ.path = cg_test_b_procs;
+ pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack),
+ CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD,
+ &targ);
+ if (pid < 0)
+ goto cleanup;
+
+ if (waitpid(pid, &status, 0) < 0)
+ goto cleanup;
+
+ if (!WIFEXITED(status))
+ goto cleanup;
+
+ cg_test_b_procs_fd = targ.fd;
+ if (cg_test_b_procs_fd < 0)
+ goto cleanup;
+
+ if (cg_enter_current(cg_test_a))
+ goto cleanup;
+
+ if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_enter_current(root);
+ if (cg_test_b_procs_fd >= 0)
+ close(cg_test_b_procs_fd);
+ if (cg_test_b)
+ cg_destroy(cg_test_b);
+ if (cg_test_a)
+ cg_destroy(cg_test_a);
+ free(cg_test_b_procs);
+ free(cg_test_a_procs);
+ free(cg_test_b);
+ free(cg_test_a);
+ return ret;
+}
+
#define T(x) { x, #x }
struct corecg_test {
int (*fn)(const char *root);
@@ -689,6 +857,8 @@ struct corecg_test {
T(test_cgcore_proc_migration),
T(test_cgcore_thread_migration),
T(test_cgcore_destroy),
+ T(test_cgcore_lesser_euid_open),
+ T(test_cgcore_lesser_ns_open),
};
#undef T
@@ -697,7 +867,7 @@ int main(int argc, char *argv[])
char root[PATH_MAX];
int i, ret = EXIT_SUCCESS;
- if (cg_find_unified_root(root, sizeof(root)))
+ if (cg_find_unified_root(root, sizeof(root), &nsdelegate))
ksft_exit_skip("cgroup v2 isn't mounted\n");
if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
new file mode 100644
index 000000000000..a2b50af8e9ee
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -0,0 +1,801 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <linux/limits.h>
+#include <sys/sysinfo.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+enum hog_clock_type {
+ // Count elapsed time using the CLOCK_PROCESS_CPUTIME_ID clock.
+ CPU_HOG_CLOCK_PROCESS,
+ // Count elapsed time using system wallclock time.
+ CPU_HOG_CLOCK_WALL,
+};
+
+struct cpu_hogger {
+ char *cgroup;
+ pid_t pid;
+ long usage;
+};
+
+struct cpu_hog_func_param {
+ int nprocs;
+ struct timespec ts;
+ enum hog_clock_type clock_type;
+};
+
+/*
+ * This test creates two nested cgroups with and without enabling
+ * the cpu controller.
+ */
+static int test_cpucg_subtree_control(const char *root)
+{
+ char *parent = NULL, *child = NULL, *parent2 = NULL, *child2 = NULL;
+ int ret = KSFT_FAIL;
+
+ // Create two nested cgroups with the cpu controller enabled.
+ parent = cg_name(root, "cpucg_test_0");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+ goto cleanup;
+
+ child = cg_name(parent, "cpucg_test_child");
+ if (!child)
+ goto cleanup;
+
+ if (cg_create(child))
+ goto cleanup;
+
+ if (cg_read_strstr(child, "cgroup.controllers", "cpu"))
+ goto cleanup;
+
+ // Create two nested cgroups without enabling the cpu controller.
+ parent2 = cg_name(root, "cpucg_test_1");
+ if (!parent2)
+ goto cleanup;
+
+ if (cg_create(parent2))
+ goto cleanup;
+
+ child2 = cg_name(parent2, "cpucg_test_child");
+ if (!child2)
+ goto cleanup;
+
+ if (cg_create(child2))
+ goto cleanup;
+
+ if (!cg_read_strstr(child2, "cgroup.controllers", "cpu"))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(child);
+ free(child);
+ cg_destroy(child2);
+ free(child2);
+ cg_destroy(parent);
+ free(parent);
+ cg_destroy(parent2);
+ free(parent2);
+
+ return ret;
+}
+
+static void *hog_cpu_thread_func(void *arg)
+{
+ while (1)
+ ;
+
+ return NULL;
+}
+
+static struct timespec
+timespec_sub(const struct timespec *lhs, const struct timespec *rhs)
+{
+ struct timespec zero = {
+ .tv_sec = 0,
+ .tv_nsec = 0,
+ };
+ struct timespec ret;
+
+ if (lhs->tv_sec < rhs->tv_sec)
+ return zero;
+
+ ret.tv_sec = lhs->tv_sec - rhs->tv_sec;
+
+ if (lhs->tv_nsec < rhs->tv_nsec) {
+ if (ret.tv_sec == 0)
+ return zero;
+
+ ret.tv_sec--;
+ ret.tv_nsec = NSEC_PER_SEC - rhs->tv_nsec + lhs->tv_nsec;
+ } else
+ ret.tv_nsec = lhs->tv_nsec - rhs->tv_nsec;
+
+ return ret;
+}
+
+static int hog_cpus_timed(const char *cgroup, void *arg)
+{
+ const struct cpu_hog_func_param *param =
+ (struct cpu_hog_func_param *)arg;
+ struct timespec ts_run = param->ts;
+ struct timespec ts_remaining = ts_run;
+ struct timespec ts_start;
+ int i, ret;
+
+ ret = clock_gettime(CLOCK_MONOTONIC, &ts_start);
+ if (ret != 0)
+ return ret;
+
+ for (i = 0; i < param->nprocs; i++) {
+ pthread_t tid;
+
+ ret = pthread_create(&tid, NULL, &hog_cpu_thread_func, NULL);
+ if (ret != 0)
+ return ret;
+ }
+
+ while (ts_remaining.tv_sec > 0 || ts_remaining.tv_nsec > 0) {
+ struct timespec ts_total;
+
+ ret = nanosleep(&ts_remaining, NULL);
+ if (ret && errno != EINTR)
+ return ret;
+
+ if (param->clock_type == CPU_HOG_CLOCK_PROCESS) {
+ ret = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts_total);
+ if (ret != 0)
+ return ret;
+ } else {
+ struct timespec ts_current;
+
+ ret = clock_gettime(CLOCK_MONOTONIC, &ts_current);
+ if (ret != 0)
+ return ret;
+
+ ts_total = timespec_sub(&ts_current, &ts_start);
+ }
+
+ ts_remaining = timespec_sub(&ts_run, &ts_total);
+ }
+
+ return 0;
+}
+
+/*
+ * Creates a cpu cgroup, burns a CPU for a few quanta, and verifies that
+ * cpu.stat shows the expected output.
+ */
+static int test_cpucg_stats(const char *root)
+{
+ int ret = KSFT_FAIL;
+ long usage_usec, user_usec, system_usec;
+ long usage_seconds = 2;
+ long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+ char *cpucg;
+
+ cpucg = cg_name(root, "cpucg_test");
+ if (!cpucg)
+ goto cleanup;
+
+ if (cg_create(cpucg))
+ goto cleanup;
+
+ usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+ user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+ system_usec = cg_read_key_long(cpucg, "cpu.stat", "system_usec");
+ if (usage_usec != 0 || user_usec != 0 || system_usec != 0)
+ goto cleanup;
+
+ struct cpu_hog_func_param param = {
+ .nprocs = 1,
+ .ts = {
+ .tv_sec = usage_seconds,
+ .tv_nsec = 0,
+ },
+ .clock_type = CPU_HOG_CLOCK_PROCESS,
+ };
+ if (cg_run(cpucg, hog_cpus_timed, (void *)&param))
+ goto cleanup;
+
+ usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+ user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+ if (user_usec <= 0)
+ goto cleanup;
+
+ if (!values_close(usage_usec, expected_usage_usec, 1))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(cpucg);
+ free(cpucg);
+
+ return ret;
+}
+
+/*
+ * Creates a nice process that consumes CPU and checks that the elapsed
+ * usertime in the cgroup is close to the expected time.
+ */
+static int test_cpucg_nice(const char *root)
+{
+ int ret = KSFT_FAIL;
+ int status;
+ long user_usec, nice_usec;
+ long usage_seconds = 2;
+ long expected_nice_usec = usage_seconds * USEC_PER_SEC;
+ char *cpucg;
+ pid_t pid;
+
+ cpucg = cg_name(root, "cpucg_test");
+ if (!cpucg)
+ goto cleanup;
+
+ if (cg_create(cpucg))
+ goto cleanup;
+
+ user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+ nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
+ if (nice_usec == -1)
+ ret = KSFT_SKIP;
+ if (user_usec != 0 || nice_usec != 0)
+ goto cleanup;
+
+ /*
+ * We fork here to create a new process that can be niced without
+ * polluting the nice value of other selftests
+ */
+ pid = fork();
+ if (pid < 0) {
+ goto cleanup;
+ } else if (pid == 0) {
+ struct cpu_hog_func_param param = {
+ .nprocs = 1,
+ .ts = {
+ .tv_sec = usage_seconds,
+ .tv_nsec = 0,
+ },
+ .clock_type = CPU_HOG_CLOCK_PROCESS,
+ };
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%d", getpid());
+ if (cg_write(cpucg, "cgroup.procs", buf))
+ goto cleanup;
+
+ /* Try to keep niced CPU usage as constrained to hog_cpu as possible */
+ nice(1);
+ hog_cpus_timed(cpucg, &param);
+ exit(0);
+ } else {
+ waitpid(pid, &status, 0);
+ if (!WIFEXITED(status))
+ goto cleanup;
+
+ user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+ nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
+ if (!values_close(nice_usec, expected_nice_usec, 1))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+ }
+
+cleanup:
+ cg_destroy(cpucg);
+ free(cpucg);
+
+ return ret;
+}
+
+static int
+run_cpucg_weight_test(
+ const char *root,
+ pid_t (*spawn_child)(const struct cpu_hogger *child),
+ int (*validate)(const struct cpu_hogger *children, int num_children))
+{
+ int ret = KSFT_FAIL, i;
+ char *parent = NULL;
+ struct cpu_hogger children[3] = {};
+
+ parent = cg_name(root, "cpucg_test_0");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(children); i++) {
+ children[i].cgroup = cg_name_indexed(parent, "cpucg_child", i);
+ if (!children[i].cgroup)
+ goto cleanup;
+
+ if (cg_create(children[i].cgroup))
+ goto cleanup;
+
+ if (cg_write_numeric(children[i].cgroup, "cpu.weight",
+ 50 * (i + 1)))
+ goto cleanup;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(children); i++) {
+ pid_t pid = spawn_child(&children[i]);
+ if (pid <= 0)
+ goto cleanup;
+ children[i].pid = pid;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(children); i++) {
+ int retcode;
+
+ waitpid(children[i].pid, &retcode, 0);
+ if (!WIFEXITED(retcode))
+ goto cleanup;
+ if (WEXITSTATUS(retcode))
+ goto cleanup;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(children); i++)
+ children[i].usage = cg_read_key_long(children[i].cgroup,
+ "cpu.stat", "usage_usec");
+
+ if (validate(children, ARRAY_SIZE(children)))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+cleanup:
+ for (i = 0; i < ARRAY_SIZE(children); i++) {
+ cg_destroy(children[i].cgroup);
+ free(children[i].cgroup);
+ }
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+static pid_t weight_hog_ncpus(const struct cpu_hogger *child, int ncpus)
+{
+ long usage_seconds = 10;
+ struct cpu_hog_func_param param = {
+ .nprocs = ncpus,
+ .ts = {
+ .tv_sec = usage_seconds,
+ .tv_nsec = 0,
+ },
+ .clock_type = CPU_HOG_CLOCK_WALL,
+ };
+ return cg_run_nowait(child->cgroup, hog_cpus_timed, (void *)&param);
+}
+
+static pid_t weight_hog_all_cpus(const struct cpu_hogger *child)
+{
+ return weight_hog_ncpus(child, get_nprocs());
+}
+
+static int
+overprovision_validate(const struct cpu_hogger *children, int num_children)
+{
+ int ret = KSFT_FAIL, i;
+
+ for (i = 0; i < num_children - 1; i++) {
+ long delta;
+
+ if (children[i + 1].usage <= children[i].usage)
+ goto cleanup;
+
+ delta = children[i + 1].usage - children[i].usage;
+ if (!values_close(delta, children[0].usage, 35))
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+cleanup:
+ return ret;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B cpu.weight = 50
+ * A/C cpu.weight = 100
+ * A/D cpu.weight = 150
+ *
+ * A separate process is then created for each child cgroup which spawns as
+ * many threads as there are cores, and hogs each CPU as much as possible
+ * for some time interval.
+ *
+ * Once all of the children have exited, we verify that each child cgroup
+ * was given proportional runtime as informed by their cpu.weight.
+ */
+static int test_cpucg_weight_overprovisioned(const char *root)
+{
+ return run_cpucg_weight_test(root, weight_hog_all_cpus,
+ overprovision_validate);
+}
+
+static pid_t weight_hog_one_cpu(const struct cpu_hogger *child)
+{
+ return weight_hog_ncpus(child, 1);
+}
+
+static int
+underprovision_validate(const struct cpu_hogger *children, int num_children)
+{
+ int ret = KSFT_FAIL, i;
+
+ for (i = 0; i < num_children - 1; i++) {
+ if (!values_close(children[i + 1].usage, children[0].usage, 15))
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+cleanup:
+ return ret;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B cpu.weight = 50
+ * A/C cpu.weight = 100
+ * A/D cpu.weight = 150
+ *
+ * A separate process is then created for each child cgroup which spawns a
+ * single thread that hogs a CPU. The testcase is only run on systems that
+ * have at least one core per-thread in the child processes.
+ *
+ * Once all of the children have exited, we verify that each child cgroup
+ * had roughly the same runtime despite having different cpu.weight.
+ */
+static int test_cpucg_weight_underprovisioned(const char *root)
+{
+ // Only run the test if there are enough cores to avoid overprovisioning
+ // the system.
+ if (get_nprocs() < 4)
+ return KSFT_SKIP;
+
+ return run_cpucg_weight_test(root, weight_hog_one_cpu,
+ underprovision_validate);
+}
+
+static int
+run_cpucg_nested_weight_test(const char *root, bool overprovisioned)
+{
+ int ret = KSFT_FAIL, i;
+ char *parent = NULL, *child = NULL;
+ struct cpu_hogger leaf[3] = {};
+ long nested_leaf_usage, child_usage;
+ int nprocs = get_nprocs();
+
+ if (!overprovisioned) {
+ if (nprocs < 4)
+ /*
+ * Only run the test if there are enough cores to avoid overprovisioning
+ * the system.
+ */
+ return KSFT_SKIP;
+ nprocs /= 4;
+ }
+
+ parent = cg_name(root, "cpucg_test");
+ child = cg_name(parent, "cpucg_child");
+ if (!parent || !child)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+ if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+ goto cleanup;
+
+ if (cg_create(child))
+ goto cleanup;
+ if (cg_write(child, "cgroup.subtree_control", "+cpu"))
+ goto cleanup;
+ if (cg_write(child, "cpu.weight", "1000"))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+ const char *ancestor;
+ long weight;
+
+ if (i == 0) {
+ ancestor = parent;
+ weight = 1000;
+ } else {
+ ancestor = child;
+ weight = 5000;
+ }
+ leaf[i].cgroup = cg_name_indexed(ancestor, "cpucg_leaf", i);
+ if (!leaf[i].cgroup)
+ goto cleanup;
+
+ if (cg_create(leaf[i].cgroup))
+ goto cleanup;
+
+ if (cg_write_numeric(leaf[i].cgroup, "cpu.weight", weight))
+ goto cleanup;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+ pid_t pid;
+ struct cpu_hog_func_param param = {
+ .nprocs = nprocs,
+ .ts = {
+ .tv_sec = 10,
+ .tv_nsec = 0,
+ },
+ .clock_type = CPU_HOG_CLOCK_WALL,
+ };
+
+ pid = cg_run_nowait(leaf[i].cgroup, hog_cpus_timed,
+ (void *)&param);
+ if (pid <= 0)
+ goto cleanup;
+ leaf[i].pid = pid;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+ int retcode;
+
+ waitpid(leaf[i].pid, &retcode, 0);
+ if (!WIFEXITED(retcode))
+ goto cleanup;
+ if (WEXITSTATUS(retcode))
+ goto cleanup;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+ leaf[i].usage = cg_read_key_long(leaf[i].cgroup,
+ "cpu.stat", "usage_usec");
+ if (leaf[i].usage <= 0)
+ goto cleanup;
+ }
+
+ nested_leaf_usage = leaf[1].usage + leaf[2].usage;
+ if (overprovisioned) {
+ if (!values_close(leaf[0].usage, nested_leaf_usage, 15))
+ goto cleanup;
+ } else if (!values_close(leaf[0].usage * 2, nested_leaf_usage, 15))
+ goto cleanup;
+
+
+ child_usage = cg_read_key_long(child, "cpu.stat", "usage_usec");
+ if (child_usage <= 0)
+ goto cleanup;
+ if (!values_close(child_usage, nested_leaf_usage, 1))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+cleanup:
+ for (i = 0; i < ARRAY_SIZE(leaf); i++) {
+ cg_destroy(leaf[i].cgroup);
+ free(leaf[i].cgroup);
+ }
+ cg_destroy(child);
+ free(child);
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B cpu.weight = 1000
+ * A/C cpu.weight = 1000
+ * A/C/D cpu.weight = 5000
+ * A/C/E cpu.weight = 5000
+ *
+ * A separate process is then created for each leaf, which spawn nproc threads
+ * that burn a CPU for a few seconds.
+ *
+ * Once all of those processes have exited, we verify that each of the leaf
+ * cgroups have roughly the same usage from cpu.stat.
+ */
+static int
+test_cpucg_nested_weight_overprovisioned(const char *root)
+{
+ return run_cpucg_nested_weight_test(root, true);
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A
+ * A/B cpu.weight = 1000
+ * A/C cpu.weight = 1000
+ * A/C/D cpu.weight = 5000
+ * A/C/E cpu.weight = 5000
+ *
+ * A separate process is then created for each leaf, which nproc / 4 threads
+ * that burns a CPU for a few seconds.
+ *
+ * Once all of those processes have exited, we verify that each of the leaf
+ * cgroups have roughly the same usage from cpu.stat.
+ */
+static int
+test_cpucg_nested_weight_underprovisioned(const char *root)
+{
+ return run_cpucg_nested_weight_test(root, false);
+}
+
+/*
+ * This test creates a cgroup with some maximum value within a period, and
+ * verifies that a process in the cgroup is not overscheduled.
+ */
+static int test_cpucg_max(const char *root)
+{
+ int ret = KSFT_FAIL;
+ long usage_usec, user_usec;
+ long usage_seconds = 1;
+ long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+ char *cpucg;
+
+ cpucg = cg_name(root, "cpucg_test");
+ if (!cpucg)
+ goto cleanup;
+
+ if (cg_create(cpucg))
+ goto cleanup;
+
+ if (cg_write(cpucg, "cpu.max", "1000"))
+ goto cleanup;
+
+ struct cpu_hog_func_param param = {
+ .nprocs = 1,
+ .ts = {
+ .tv_sec = usage_seconds,
+ .tv_nsec = 0,
+ },
+ .clock_type = CPU_HOG_CLOCK_WALL,
+ };
+ if (cg_run(cpucg, hog_cpus_timed, (void *)&param))
+ goto cleanup;
+
+ usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+ user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
+ if (user_usec <= 0)
+ goto cleanup;
+
+ if (user_usec >= expected_usage_usec)
+ goto cleanup;
+
+ if (values_close(usage_usec, expected_usage_usec, 95))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(cpucg);
+ free(cpucg);
+
+ return ret;
+}
+
+/*
+ * This test verifies that a process inside of a nested cgroup whose parent
+ * group has a cpu.max value set, is properly throttled.
+ */
+static int test_cpucg_max_nested(const char *root)
+{
+ int ret = KSFT_FAIL;
+ long usage_usec, user_usec;
+ long usage_seconds = 1;
+ long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+ char *parent, *child;
+
+ parent = cg_name(root, "cpucg_parent");
+ child = cg_name(parent, "cpucg_child");
+ if (!parent || !child)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+ goto cleanup;
+
+ if (cg_create(child))
+ goto cleanup;
+
+ if (cg_write(parent, "cpu.max", "1000"))
+ goto cleanup;
+
+ struct cpu_hog_func_param param = {
+ .nprocs = 1,
+ .ts = {
+ .tv_sec = usage_seconds,
+ .tv_nsec = 0,
+ },
+ .clock_type = CPU_HOG_CLOCK_WALL,
+ };
+ if (cg_run(child, hog_cpus_timed, (void *)&param))
+ goto cleanup;
+
+ usage_usec = cg_read_key_long(child, "cpu.stat", "usage_usec");
+ user_usec = cg_read_key_long(child, "cpu.stat", "user_usec");
+ if (user_usec <= 0)
+ goto cleanup;
+
+ if (user_usec >= expected_usage_usec)
+ goto cleanup;
+
+ if (values_close(usage_usec, expected_usage_usec, 95))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(child);
+ free(child);
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+#define T(x) { x, #x }
+struct cpucg_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_cpucg_subtree_control),
+ T(test_cpucg_stats),
+ T(test_cpucg_nice),
+ T(test_cpucg_weight_overprovisioned),
+ T(test_cpucg_weight_underprovisioned),
+ T(test_cpucg_nested_weight_overprovisioned),
+ T(test_cpucg_nested_weight_underprovisioned),
+ T(test_cpucg_max),
+ T(test_cpucg_max_nested),
+};
+#undef T
+
+int main(int argc, char *argv[])
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "cpu"))
+ if (cg_write(root, "cgroup.subtree_control", "+cpu"))
+ ksft_exit_skip("Failed to set cpu controller\n");
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c
new file mode 100644
index 000000000000..4034d14ba69a
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpuset.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/limits.h>
+#include <signal.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+static int idle_process_fn(const char *cgroup, void *arg)
+{
+ (void)pause();
+ return 0;
+}
+
+static int do_migration_fn(const char *cgroup, void *arg)
+{
+ int object_pid = (int)(size_t)arg;
+
+ if (setuid(TEST_UID))
+ return EXIT_FAILURE;
+
+ // XXX checking /proc/$pid/cgroup would be quicker than wait
+ if (cg_enter(cgroup, object_pid) ||
+ cg_wait_for_proc_count(cgroup, 1))
+ return EXIT_FAILURE;
+
+ return EXIT_SUCCESS;
+}
+
+static int do_controller_fn(const char *cgroup, void *arg)
+{
+ const char *child = cgroup;
+ const char *parent = arg;
+
+ if (setuid(TEST_UID))
+ return EXIT_FAILURE;
+
+ if (!cg_read_strstr(child, "cgroup.controllers", "cpuset"))
+ return EXIT_FAILURE;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+cpuset"))
+ return EXIT_FAILURE;
+
+ if (cg_read_strstr(child, "cgroup.controllers", "cpuset"))
+ return EXIT_FAILURE;
+
+ if (cg_write(parent, "cgroup.subtree_control", "-cpuset"))
+ return EXIT_FAILURE;
+
+ if (!cg_read_strstr(child, "cgroup.controllers", "cpuset"))
+ return EXIT_FAILURE;
+
+ return EXIT_SUCCESS;
+}
+
+/*
+ * Migrate a process between two sibling cgroups.
+ * The success should only depend on the parent cgroup permissions and not the
+ * migrated process itself (cpuset controller is in place because it uses
+ * security_task_setscheduler() in cgroup v1).
+ *
+ * Deliberately don't set cpuset.cpus in children to avoid definining migration
+ * permissions between two different cpusets.
+ */
+static int test_cpuset_perms_object(const char *root, bool allow)
+{
+ char *parent = NULL, *child_src = NULL, *child_dst = NULL;
+ char *parent_procs = NULL, *child_src_procs = NULL, *child_dst_procs = NULL;
+ const uid_t test_euid = TEST_UID;
+ int object_pid = 0;
+ int ret = KSFT_FAIL;
+
+ parent = cg_name(root, "cpuset_test_0");
+ if (!parent)
+ goto cleanup;
+ parent_procs = cg_name(parent, "cgroup.procs");
+ if (!parent_procs)
+ goto cleanup;
+ if (cg_create(parent))
+ goto cleanup;
+
+ child_src = cg_name(parent, "cpuset_test_1");
+ if (!child_src)
+ goto cleanup;
+ child_src_procs = cg_name(child_src, "cgroup.procs");
+ if (!child_src_procs)
+ goto cleanup;
+ if (cg_create(child_src))
+ goto cleanup;
+
+ child_dst = cg_name(parent, "cpuset_test_2");
+ if (!child_dst)
+ goto cleanup;
+ child_dst_procs = cg_name(child_dst, "cgroup.procs");
+ if (!child_dst_procs)
+ goto cleanup;
+ if (cg_create(child_dst))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+cpuset"))
+ goto cleanup;
+
+ if (cg_read_strstr(child_src, "cgroup.controllers", "cpuset") ||
+ cg_read_strstr(child_dst, "cgroup.controllers", "cpuset"))
+ goto cleanup;
+
+ /* Enable permissions along src->dst tree path */
+ if (chown(child_src_procs, test_euid, -1) ||
+ chown(child_dst_procs, test_euid, -1))
+ goto cleanup;
+
+ if (allow && chown(parent_procs, test_euid, -1))
+ goto cleanup;
+
+ /* Fork a privileged child as a test object */
+ object_pid = cg_run_nowait(child_src, idle_process_fn, NULL);
+ if (object_pid < 0)
+ goto cleanup;
+
+ /* Carry out migration in a child process that can drop all privileges
+ * (including capabilities), the main process must remain privileged for
+ * cleanup.
+ * Child process's cgroup is irrelevant but we place it into child_dst
+ * as hacky way to pass information about migration target to the child.
+ */
+ if (allow ^ (cg_run(child_dst, do_migration_fn, (void *)(size_t)object_pid) == EXIT_SUCCESS))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (object_pid > 0) {
+ (void)kill(object_pid, SIGTERM);
+ (void)clone_reap(object_pid, WEXITED);
+ }
+
+ cg_destroy(child_dst);
+ free(child_dst_procs);
+ free(child_dst);
+
+ cg_destroy(child_src);
+ free(child_src_procs);
+ free(child_src);
+
+ cg_destroy(parent);
+ free(parent_procs);
+ free(parent);
+
+ return ret;
+}
+
+static int test_cpuset_perms_object_allow(const char *root)
+{
+ return test_cpuset_perms_object(root, true);
+}
+
+static int test_cpuset_perms_object_deny(const char *root)
+{
+ return test_cpuset_perms_object(root, false);
+}
+
+/*
+ * Migrate a process between parent and child implicitely
+ * Implicit migration happens when a controller is enabled/disabled.
+ *
+ */
+static int test_cpuset_perms_subtree(const char *root)
+{
+ char *parent = NULL, *child = NULL;
+ char *parent_procs = NULL, *parent_subctl = NULL, *child_procs = NULL;
+ const uid_t test_euid = TEST_UID;
+ int object_pid = 0;
+ int ret = KSFT_FAIL;
+
+ parent = cg_name(root, "cpuset_test_0");
+ if (!parent)
+ goto cleanup;
+ parent_procs = cg_name(parent, "cgroup.procs");
+ if (!parent_procs)
+ goto cleanup;
+ parent_subctl = cg_name(parent, "cgroup.subtree_control");
+ if (!parent_subctl)
+ goto cleanup;
+ if (cg_create(parent))
+ goto cleanup;
+
+ child = cg_name(parent, "cpuset_test_1");
+ if (!child)
+ goto cleanup;
+ child_procs = cg_name(child, "cgroup.procs");
+ if (!child_procs)
+ goto cleanup;
+ if (cg_create(child))
+ goto cleanup;
+
+ /* Enable permissions as in a delegated subtree */
+ if (chown(parent_procs, test_euid, -1) ||
+ chown(parent_subctl, test_euid, -1) ||
+ chown(child_procs, test_euid, -1))
+ goto cleanup;
+
+ /* Put a privileged child in the subtree and modify controller state
+ * from an unprivileged process, the main process remains privileged
+ * for cleanup.
+ * The unprivileged child runs in subtree too to avoid parent and
+ * internal-node constraing violation.
+ */
+ object_pid = cg_run_nowait(child, idle_process_fn, NULL);
+ if (object_pid < 0)
+ goto cleanup;
+
+ if (cg_run(child, do_controller_fn, parent) != EXIT_SUCCESS)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (object_pid > 0) {
+ (void)kill(object_pid, SIGTERM);
+ (void)clone_reap(object_pid, WEXITED);
+ }
+
+ cg_destroy(child);
+ free(child_procs);
+ free(child);
+
+ cg_destroy(parent);
+ free(parent_subctl);
+ free(parent_procs);
+ free(parent);
+
+ return ret;
+}
+
+
+#define T(x) { x, #x }
+struct cpuset_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_cpuset_perms_object_allow),
+ T(test_cpuset_perms_object_deny),
+ T(test_cpuset_perms_subtree),
+};
+#undef T
+
+int main(int argc, char *argv[])
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset"))
+ if (cg_write(root, "cgroup.subtree_control", "+cpuset"))
+ ksft_exit_skip("Failed to set cpuset controller\n");
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
new file mode 100755
index 000000000000..a17256d9f88a
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -0,0 +1,1193 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test for cpuset v2 partition root state (PRS)
+#
+# The sched verbose flag can be optionally set so that the console log
+# can be examined for the correct setting of scheduling domain.
+#
+
+skip_test() {
+ echo "$1"
+ echo "Test SKIPPED"
+ exit 4 # ksft_skip
+}
+
+[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
+
+
+# Get wait_inotify location
+WAIT_INOTIFY=$(cd $(dirname $0); pwd)/wait_inotify
+
+# Find cgroup v2 mount point
+CGROUP2=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+[[ -n "$CGROUP2" ]] || skip_test "Cgroup v2 mount point not found!"
+SUBPARTS_CPUS=$CGROUP2/.__DEBUG__.cpuset.cpus.subpartitions
+CPULIST=$(cat $CGROUP2/cpuset.cpus.effective)
+
+NR_CPUS=$(lscpu | grep "^CPU(s):" | sed -e "s/.*:[[:space:]]*//")
+[[ $NR_CPUS -lt 8 ]] && skip_test "Test needs at least 8 cpus available!"
+
+# Check to see if /dev/console exists and is writable
+if [[ -c /dev/console && -w /dev/console ]]
+then
+ CONSOLE=/dev/console
+else
+ CONSOLE=/dev/null
+fi
+
+# Set verbose flag and delay factor
+PROG=$1
+VERBOSE=0
+DELAY_FACTOR=1
+SCHED_DEBUG=
+while [[ "$1" = -* ]]
+do
+ case "$1" in
+ -v) ((VERBOSE++))
+ # Enable sched/verbose can slow thing down
+ [[ $DELAY_FACTOR -eq 1 ]] &&
+ DELAY_FACTOR=2
+ ;;
+ -d) DELAY_FACTOR=$2
+ shift
+ ;;
+ *) echo "Usage: $PROG [-v] [-d <delay-factor>"
+ exit
+ ;;
+ esac
+ shift
+done
+
+# Set sched verbose flag if available when "-v" option is specified
+if [[ $VERBOSE -gt 0 && -d /sys/kernel/debug/sched ]]
+then
+ # Used to restore the original setting during cleanup
+ SCHED_DEBUG=$(cat /sys/kernel/debug/sched/verbose)
+ echo Y > /sys/kernel/debug/sched/verbose
+fi
+
+cd $CGROUP2
+echo +cpuset > cgroup.subtree_control
+
+#
+# If cpuset has been set up and used in child cgroups, we may not be able to
+# create partition under root cgroup because of the CPU exclusivity rule.
+# So we are going to skip the test if this is the case.
+#
+[[ -d test ]] || mkdir test
+echo 0-6 > test/cpuset.cpus
+echo root > test/cpuset.cpus.partition
+cat test/cpuset.cpus.partition | grep -q invalid
+RESULT=$?
+echo member > test/cpuset.cpus.partition
+echo "" > test/cpuset.cpus
+[[ $RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!"
+
+#
+# If isolated CPUs have been reserved at boot time (as shown in
+# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-8
+# that will be used by this script for testing purpose. If not, some of
+# the tests may fail incorrectly. Wait a bit and retry again just in case
+# these isolated CPUs are leftover from previous run and have just been
+# cleaned up earlier in this script.
+#
+# These pre-isolated CPUs should stay in an isolated state throughout the
+# testing process for now.
+#
+BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated)
+[[ -n "$BOOT_ISOLCPUS" ]] && {
+ sleep 0.5
+ BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated)
+}
+if [[ -n "$BOOT_ISOLCPUS" ]]
+then
+ [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 8 ]] &&
+ skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested"
+ echo "Pre-isolated CPUs: $BOOT_ISOLCPUS"
+fi
+
+cleanup()
+{
+ online_cpus
+ cd $CGROUP2
+ rmdir A1/A2/A3 A1/A2 A1 B1 test/A1 test/B1 test > /dev/null 2>&1
+ rmdir rtest/p1/c11 rtest/p1/c12 rtest/p2/c21 \
+ rtest/p2/c22 rtest/p1 rtest/p2 rtest > /dev/null 2>&1
+ [[ -n "$SCHED_DEBUG" ]] &&
+ echo "$SCHED_DEBUG" > /sys/kernel/debug/sched/verbose
+}
+
+# Pause in ms
+pause()
+{
+ DELAY=$1
+ LOOP=0
+ while [[ $LOOP -lt $DELAY_FACTOR ]]
+ do
+ sleep $DELAY
+ ((LOOP++))
+ done
+ return 0
+}
+
+console_msg()
+{
+ MSG=$1
+ echo "$MSG"
+ echo "" > $CONSOLE
+ echo "$MSG" > $CONSOLE
+ pause 0.01
+}
+
+test_partition()
+{
+ EXPECTED_VAL=$1
+ echo $EXPECTED_VAL > cpuset.cpus.partition
+ [[ $? -eq 0 ]] || exit 1
+ ACTUAL_VAL=$(cat cpuset.cpus.partition)
+ [[ $ACTUAL_VAL != $EXPECTED_VAL ]] && {
+ echo "cpuset.cpus.partition: expect $EXPECTED_VAL, found $ACTUAL_VAL"
+ echo "Test FAILED"
+ exit 1
+ }
+}
+
+test_effective_cpus()
+{
+ EXPECTED_VAL=$1
+ ACTUAL_VAL=$(cat cpuset.cpus.effective)
+ [[ "$ACTUAL_VAL" != "$EXPECTED_VAL" ]] && {
+ echo "cpuset.cpus.effective: expect '$EXPECTED_VAL', found '$ACTUAL_VAL'"
+ echo "Test FAILED"
+ exit 1
+ }
+}
+
+# Adding current process to cgroup.procs as a test
+test_add_proc()
+{
+ OUTSTR="$1"
+ ERRMSG=$((echo $$ > cgroup.procs) |& cat)
+ echo $ERRMSG | grep -q "$OUTSTR"
+ [[ $? -ne 0 ]] && {
+ echo "cgroup.procs: expect '$OUTSTR', got '$ERRMSG'"
+ echo "Test FAILED"
+ exit 1
+ }
+ echo $$ > $CGROUP2/cgroup.procs # Move out the task
+}
+
+#
+# Cpuset controller state transition test matrix.
+#
+# Cgroup test hierarchy
+#
+# root
+# |
+# +------+------+
+# | |
+# A1 B1
+# |
+# A2
+# |
+# A3
+#
+# P<v> = set cpus.partition (0:member, 1:root, 2:isolated)
+# C<l> = add cpu-list to cpuset.cpus
+# X<l> = add cpu-list to cpuset.cpus.exclusive
+# S<p> = use prefix in subtree_control
+# T = put a task into cgroup
+# CX<l> = add cpu-list to both cpuset.cpus and cpuset.cpus.exclusive
+# O<c>=<v> = Write <v> to CPU online file of <c>
+#
+# ECPUs - effective CPUs of cpusets
+# Pstate - partition root state
+# ISOLCPUS - isolated CPUs (<icpus>[,<icpus2>])
+#
+# Note that if there are 2 fields in ISOLCPUS, the first one is for
+# sched-debug matching which includes offline CPUs and single-CPU partitions
+# while the second one is for matching cpuset.cpus.isolated.
+#
+SETUP_A123_PARTITIONS="C1-3:P1:S+ C2-3:P1:S+ C3:P1"
+TEST_MATRIX=(
+ # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+ # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
+ " C0-1 . . C2-3 S+ C4-5 . . 0 A2:0-1"
+ " C0-1 . . C2-3 P1 . . . 0 "
+ " C0-1 . . C2-3 P1:S+ C0-1:P1 . . 0 "
+ " C0-1 . . C2-3 P1:S+ C1:P1 . . 0 "
+ " C0-1:S+ . . C2-3 . . . P1 0 "
+ " C0-1:P1 . . C2-3 S+ C1 . . 0 "
+ " C0-1:P1 . . C2-3 S+ C1:P1 . . 0 "
+ " C0-1:P1 . . C2-3 S+ C1:P1 . P1 0 "
+ " C0-1:P1 . . C2-3 C4-5 . . . 0 A1:4-5"
+ " C0-1:P1 . . C2-3 S+:C4-5 . . . 0 A1:4-5"
+ " C0-1 . . C2-3:P1 . . . C2 0 "
+ " C0-1 . . C2-3:P1 . . . C4-5 0 B1:4-5"
+ "C0-3:P1:S+ C2-3:P1 . . . . . . 0 A1:0-1|A2:2-3|XA2:2-3"
+ "C0-3:P1:S+ C2-3:P1 . . C1-3 . . . 0 A1:1|A2:2-3|XA2:2-3"
+ "C2-3:P1:S+ C3:P1 . . C3 . . . 0 A1:|A2:3|XA2:3 A1:P1|A2:P1"
+ "C2-3:P1:S+ C3:P1 . . C3 P0 . . 0 A1:3|A2:3 A1:P1|A2:P0"
+ "C2-3:P1:S+ C2:P1 . . C2-4 . . . 0 A1:3-4|A2:2"
+ "C2-3:P1:S+ C3:P1 . . C3 . . C0-2 0 A1:|B1:0-2 A1:P1|A2:P1"
+ "$SETUP_A123_PARTITIONS . C2-3 . . . 0 A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
+
+ # CPU offlining cases:
+ " C0-1 . . C2-3 S+ C4-5 . O2=0 0 A1:0-1|B1:3"
+ "C0-3:P1:S+ C2-3:P1 . . O2=0 . . . 0 A1:0-1|A2:3"
+ "C0-3:P1:S+ C2-3:P1 . . O2=0 O2=1 . . 0 A1:0-1|A2:2-3"
+ "C0-3:P1:S+ C2-3:P1 . . O1=0 . . . 0 A1:0|A2:2-3"
+ "C0-3:P1:S+ C2-3:P1 . . O1=0 O1=1 . . 0 A1:0-1|A2:2-3"
+ "C2-3:P1:S+ C3:P1 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P1"
+ "C2-3:P1:S+ C3:P2 . . O3=0 O3=1 . . 0 A1:2|A2:3 A1:P1|A2:P2"
+ "C2-3:P1:S+ C3:P1 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P1"
+ "C2-3:P1:S+ C3:P2 . . O2=0 O2=1 . . 0 A1:2|A2:3 A1:P1|A2:P2"
+ "C2-3:P1:S+ C3:P1 . . O2=0 . . . 0 A1:|A2:3 A1:P1|A2:P1"
+ "C2-3:P1:S+ C3:P1 . . O3=0 . . . 0 A1:2|A2: A1:P1|A2:P1"
+ "C2-3:P1:S+ C3:P1 . . T:O2=0 . . . 0 A1:3|A2:3 A1:P1|A2:P-1"
+ "C2-3:P1:S+ C3:P1 . . . T:O3=0 . . 0 A1:2|A2:2 A1:P1|A2:P-1"
+ "$SETUP_A123_PARTITIONS . O1=0 . . . 0 A1:|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
+ "$SETUP_A123_PARTITIONS . O2=0 . . . 0 A1:1|A2:|A3:3 A1:P1|A2:P1|A3:P1"
+ "$SETUP_A123_PARTITIONS . O3=0 . . . 0 A1:1|A2:2|A3: A1:P1|A2:P1|A3:P1"
+ "$SETUP_A123_PARTITIONS . T:O1=0 . . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P-1|A3:P-1"
+ "$SETUP_A123_PARTITIONS . . T:O2=0 . . 0 A1:1|A2:3|A3:3 A1:P1|A2:P1|A3:P-1"
+ "$SETUP_A123_PARTITIONS . . . T:O3=0 . 0 A1:1|A2:2|A3:2 A1:P1|A2:P1|A3:P-1"
+ "$SETUP_A123_PARTITIONS . T:O1=0 O1=1 . . 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
+ "$SETUP_A123_PARTITIONS . . T:O2=0 O2=1 . 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
+ "$SETUP_A123_PARTITIONS . . . T:O3=0 O3=1 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
+ "$SETUP_A123_PARTITIONS . T:O1=0 O2=0 O1=1 . 0 A1:1|A2:|A3:3 A1:P1|A2:P1|A3:P1"
+ "$SETUP_A123_PARTITIONS . T:O1=0 O2=0 O2=1 . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P-1|A3:P-1"
+
+ # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+ # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
+ #
+ # Remote partition and cpuset.cpus.exclusive tests
+ #
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 . . . 0 A1:0-3|A2:1-3|A3:2-3|XA1:2-3"
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3:P2 . . 0 A1:0-1|A2:2-3|A3:2-3 A1:P0|A2:P2 2-3"
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 X3:P2 . . 0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3"
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3"
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:C3 . 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3"
+ " C0-3:S+ C1-3:S+ C2-3 C2-3 . . . P2 0 A1:0-3|A2:1-3|A3:2-3|B1:2-3 A1:P0|A3:P0|B1:P-2"
+ " C0-3:S+ C1-3:S+ C2-3 C4-5 . . . P2 0 B1:4-5 B1:P2 4-5"
+ " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4"
+ " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2:C1-3 P2 0 A3:2-3|B1:4 A3:P2|B1:P2 2-4"
+ " C0-3:S+ C1-3:S+ C2-3 C4 X1-3 X1-3:P2 P2 . 0 A2:1|A3:2-3 A2:P2|A3:P2 1-3"
+ " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3 X2-3:P2 P2:C4-5 0 A3:2-3|B1:4-5 A3:P2|B1:P2 2-5"
+ " C4:X0-3:S+ X1-3:S+ X2-3 . . P2 . . 0 A1:4|A2:1-3|A3:1-3 A2:P2 1-3"
+ " C4:X0-3:S+ X1-3:S+ X2-3 . . . P2 . 0 A1:4|A2:4|A3:2-3 A3:P2 2-3"
+
+ # Nested remote/local partition tests
+ " C0-3:S+ C1-3:S+ C2-3 C4-5 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4-5 \
+ A1:P0|A2:P1|A3:P2|B1:P1 2-3"
+ " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:|A3:2-3|B1:4 \
+ A1:P0|A2:P1|A3:P2|B1:P1 2-4|2-3"
+ " C0-3:S+ C1-3:S+ C2-3 C4 X2-3 X2-3:P1 . P1 0 A1:0-1|A2:2-3|A3:2-3|B1:4 \
+ A1:P0|A2:P1|A3:P0|B1:P1"
+ " C0-3:S+ C1-3:S+ C3 C4 X2-3 X2-3:P1 P2 P1 0 A1:0-1|A2:2|A3:3|B1:4 \
+ A1:P0|A2:P1|A3:P2|B1:P1 2-4|3"
+ " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X4:P1 . 0 A1:0-1|A2:2-3|A3:4 \
+ A1:P0|A2:P2|A3:P1 2-4|2-3"
+ " C0-4:S+ C1-4:S+ C2-4 . X2-4 X2-4:P2 X3-4:P1 . 0 A1:0-1|A2:2|A3:3-4 \
+ A1:P0|A2:P2|A3:P1 2"
+ " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
+ . . X5 . . 0 A1:0-4|A2:1-4|A3:2-4 \
+ A1:P0|A2:P-2|A3:P-1 ."
+ " C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
+ . . . X1 . 0 A1:0-1|A2:2-4|A3:2-4 \
+ A1:P0|A2:P2|A3:P-1 2-4"
+
+ # Remote partition offline tests
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 . 0 A1:0-1|A2:1|A3:3 A1:P0|A3:P2 2-3"
+ " C0-3:S+ C1-3:S+ C2-3 . X2-3 X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3"
+ " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 P2:O3=0 . 0 A1:0-2|A2:1-2|A3: A1:P0|A3:P2 3"
+ " C0-3:S+ C1-3:S+ C3 . X2-3 X2-3 T:P2:O3=0 . 0 A1:0-2|A2:1-2|A3:1-2 A1:P0|A3:P-2 3|"
+
+ # An invalidated remote partition cannot self-recover from hotplug
+ " C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3|A2:1-3|A3:2 A1:P0|A3:P-2 ."
+
+ # cpus.exclusive.effective clearing test
+ " C0-3:S+ C1-3:S+ C2 . X2-3:X . . . 0 A1:0-3|A2:1-3|A3:2|XA1:"
+
+ # Invalid to valid remote partition transition test
+ " C0-3:S+ C1-3 . . . X3:P2 . . 0 A1:0-3|A2:1-3|XA2: A2:P-2 ."
+ " C0-3:S+ C1-3:X3:P2
+ . . X2-3 P2 . . 0 A1:0-2|A2:3|XA2:3 A2:P2 3"
+
+ # Invalid to valid local partition direct transition tests
+ " C1-3:S+:P2 X4:P2 . . . . . . 0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3"
+ " C1-3:S+:P2 X4:P2 . . . X3:P2 . . 0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3"
+ " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4|B1:4-6 A1:P-2|B1:P0"
+ " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3"
+
+ # Local partition invalidation tests
+ " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
+ . . . . . 0 A1:1|A2:2|A3:3 A1:P2|A2:P2|A3:P2 1-3"
+ " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
+ . . X4 . . 0 A1:1-3|A2:1-3|A3:2-3|XA2:|XA3: A1:P2|A2:P-2|A3:P-2 1-3"
+ " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
+ . . C4:X . . 0 A1:1-3|A2:1-3|A3:2-3|XA2:|XA3: A1:P2|A2:P-2|A3:P-2 1-3"
+ # Local partition CPU change tests
+ " C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2|A2:3-5 A1:P2|A2:P1 0-2"
+ " C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3|A2:4-5 A1:P2|A2:P1 1-3"
+
+ # cpus_allowed/exclusive_cpus update tests
+ " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
+ . X:C4 . P2 . 0 A1:4|A2:4|XA2:|XA3:|A3:4 \
+ A1:P0|A3:P-2 ."
+ " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
+ . X1 . P2 . 0 A1:0-3|A2:1-3|XA1:1|XA2:|XA3:|A3:2-3 \
+ A1:P0|A3:P-2 ."
+ " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
+ . . X3 P2 . 0 A1:0-2|A2:1-2|XA2:3|XA3:3|A3:3 \
+ A1:P0|A3:P2 3"
+ " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
+ . . X3 . . 0 A1:0-2|A2:1-2|XA2:3|XA3:3|A3:3|XA3:3 \
+ A1:P0|A3:P2 3"
+ " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3:P2 \
+ . X4 . . . 0 A1:0-3|A2:1-3|A3:2-3|XA1:4|XA2:|XA3 \
+ A1:P0|A3:P-2"
+
+ # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+ # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
+ #
+ # Incorrect change to cpuset.cpus[.exclusive] invalidates partition root
+ #
+ # Adding CPUs to partition root that are not in parent's
+ # cpuset.cpus is allowed, but those extra CPUs are ignored.
+ "C2-3:P1:S+ C3:P1 . . . C2-4 . . 0 A1:|A2:2-3 A1:P1|A2:P1"
+
+ # Taking away all CPUs from parent or itself if there are tasks
+ # will make the partition invalid.
+ "C2-3:P1:S+ C3:P1 . . T C2-3 . . 0 A1:2-3|A2:2-3 A1:P1|A2:P-1"
+ " C3:P1:S+ C3 . . T P1 . . 0 A1:3|A2:3 A1:P1|A2:P-1"
+ "$SETUP_A123_PARTITIONS . T:C2-3 . . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P-1|A3:P-1"
+ "$SETUP_A123_PARTITIONS . T:C2-3:C1-3 . . . 0 A1:1|A2:2|A3:3 A1:P1|A2:P1|A3:P1"
+
+ # Changing a partition root to member makes child partitions invalid
+ "C2-3:P1:S+ C3:P1 . . P0 . . . 0 A1:2-3|A2:3 A1:P0|A2:P-1"
+ "$SETUP_A123_PARTITIONS . C2-3 P0 . . 0 A1:2-3|A2:2-3|A3:3 A1:P1|A2:P0|A3:P-1"
+
+ # cpuset.cpus can contains cpus not in parent's cpuset.cpus as long
+ # as they overlap.
+ "C2-3:P1:S+ . . . . C3-4:P1 . . 0 A1:2|A2:3 A1:P1|A2:P1"
+
+ # Deletion of CPUs distributed to child cgroup is allowed.
+ "C0-1:P1:S+ C1 . C2-3 C4-5 . . . 0 A1:4-5|A2:4-5"
+
+ # To become a valid partition root, cpuset.cpus must overlap parent's
+ # cpuset.cpus.
+ " C0-1:P1 . . C2-3 S+ C4-5:P1 . . 0 A1:0-1|A2:0-1 A1:P1|A2:P-1"
+
+ # Enabling partition with child cpusets is allowed
+ " C0-1:S+ C1 . C2-3 P1 . . . 0 A1:0-1|A2:1 A1:P1"
+
+ # A partition root with non-partition root parent is invalid| but it
+ # can be made valid if its parent becomes a partition root too.
+ " C0-1:S+ C1 . C2-3 . P2 . . 0 A1:0-1|A2:1 A1:P0|A2:P-2"
+ " C0-1:S+ C1:P2 . C2-3 P1 . . . 0 A1:0|A2:1 A1:P1|A2:P2 0-1|1"
+
+ # A non-exclusive cpuset.cpus change will invalidate partition and its siblings
+ " C0-1:P1 . . C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P0"
+ " C0-1:P1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P-1|B1:P-1"
+ " C0-1 . . P1:C2-3 C0-2 . . . 0 A1:0-2|B1:2-3 A1:P0|B1:P-1"
+
+ # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it
+ " C0-3 . . C4-5 X5 . . . 0 A1:0-3|B1:4-5"
+
+ # Child partition root that try to take all CPUs from parent partition
+ # with tasks will remain invalid.
+ " C1-4:P1:S+ P1 . . . . . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1"
+ " C1-4:P1:S+ P1 . . . C1-4 . . 0 A1|A2:1-4 A1:P1|A2:P1"
+ " C1-4:P1:S+ P1 . . T C1-4 . . 0 A1:1-4|A2:1-4 A1:P1|A2:P-1"
+
+ # Clearing of cpuset.cpus with a preset cpuset.cpus.exclusive shouldn't
+ # affect cpuset.cpus.exclusive.effective.
+ " C1-4:X3:S+ C1:X3 . . . C . . 0 A2:1-4|XA2:3"
+
+ # cpuset.cpus can contain CPUs that overlap a sibling cpuset with cpus.exclusive
+ # but creating a local partition out of it is not allowed. Similarly and change
+ # in cpuset.cpus of a local partition that overlaps sibling exclusive CPUs will
+ # invalidate it.
+ " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1 0 A1:1|A2:2-4|B1:5-6|XB1:5-6 \
+ A1:P0|A2:P2:B1:P1 2-4"
+ " CX1-4:S+ CX2-4:P2 . C3-6 . . . P1 0 A1:1|A2:2-4|B1:5-6 \
+ A1:P0|A2:P2:B1:P-1 2-4"
+ " CX1-4:S+ CX2-4:P2 . C5-6 . . . P1:C3-6 0 A1:1|A2:2-4|B1:5-6 \
+ A1:P0|A2:P2:B1:P-1 2-4"
+
+ # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
+ # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
+ # Failure cases:
+
+ # A task cannot be added to a partition with no cpu
+ "C2-3:P1:S+ C3:P1 . . O2=0:T . . . 1 A1:|A2:3 A1:P1|A2:P1"
+
+ # Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected
+ " C0-3 . . C4-5 X0-3 . . X3-5 1 A1:0-3|B1:4-5"
+
+ # cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive
+ " C0-3 . . C4-5 X3-5 . . . 1 A1:0-3|B1:4-5"
+)
+
+#
+# Cpuset controller remote partition test matrix.
+#
+# Cgroup test hierarchy
+#
+# root
+# |
+# rtest (cpuset.cpus.exclusive=1-7)
+# |
+# +------+------+
+# | |
+# p1 p2
+# +--+--+ +--+--+
+# | | | |
+# c11 c12 c21 c22
+#
+# REMOTE_TEST_MATRIX uses the same notational convention as TEST_MATRIX.
+# Only CPUs 1-7 should be used.
+#
+REMOTE_TEST_MATRIX=(
+ # old-p1 old-p2 old-c11 old-c12 old-c21 old-c22
+ # new-p1 new-p2 new-c11 new-c12 new-c21 new-c22 ECPUs Pstate ISOLCPUS
+ # ------ ------ ------- ------- ------- ------- ----- ------ --------
+ " X1-3:S+ X4-6:S+ X1-2 X3 X4-5 X6 \
+ . . P2 P2 P2 P2 c11:1-2|c12:3|c21:4-5|c22:6 \
+ c11:P2|c12:P2|c21:P2|c22:P2 1-6"
+ " CX1-4:S+ . X1-2:P2 C3 . . \
+ . . . C3-4 . . p1:3-4|c11:1-2|c12:3-4 \
+ p1:P0|c11:P2|c12:P0 1-2"
+ " CX1-4:S+ . X1-2:P2 . . . \
+ X2-4 . . . . . p1:1,3-4|c11:2 \
+ p1:P0|c11:P2 2"
+ " CX1-5:S+ . X1-2:P2 X3-5:P1 . . \
+ X2-4 . . . . . p1:1,5|c11:2|c12:3-4 \
+ p1:P0|c11:P2|c12:P1 2"
+ " CX1-4:S+ . X1-2:P2 X3-4:P1 . . \
+ . . X2 . . . p1:1|c11:2|c12:3-4 \
+ p1:P0|c11:P2|c12:P1 2"
+ # p1 as member, will get its effective CPUs from its parent rtest
+ " CX1-4:S+ . X1-2:P2 X3-4:P1 . . \
+ . . X1 CX2-4 . . p1:5-7|c11:1|c12:2-4 \
+ p1:P0|c11:P2|c12:P1 1"
+ " CX1-4:S+ X5-6:P1:S+ . . . . \
+ . . X1-2:P2 X4-5:P1 . X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \
+ p1:P0|p2:P1|c11:P2|c12:P1|c22:P2 \
+ 1-2,4-6|1-2,5-6"
+)
+
+#
+# Write to the cpu online file
+# $1 - <c>=<v> where <c> = cpu number, <v> value to be written
+#
+write_cpu_online()
+{
+ CPU=${1%=*}
+ VAL=${1#*=}
+ CPUFILE=//sys/devices/system/cpu/cpu${CPU}/online
+ if [[ $VAL -eq 0 ]]
+ then
+ OFFLINE_CPUS="$OFFLINE_CPUS $CPU"
+ else
+ [[ -n "$OFFLINE_CPUS" ]] && {
+ OFFLINE_CPUS=$(echo $CPU $CPU $OFFLINE_CPUS | fmt -1 |\
+ sort | uniq -u)
+ }
+ fi
+ echo $VAL > $CPUFILE
+ pause 0.05
+}
+
+#
+# Set controller state
+# $1 - cgroup directory
+# $2 - state
+# $3 - showerr
+#
+# The presence of ":" in state means transition from one to the next.
+#
+set_ctrl_state()
+{
+ TMPMSG=/tmp/.msg_$$
+ CGRP=$1
+ STATE=$2
+ SHOWERR=${3}
+ CTRL=${CTRL:=$CONTROLLER}
+ HASERR=0
+ REDIRECT="2> $TMPMSG"
+ [[ -z "$STATE" || "$STATE" = '.' ]] && return 0
+ [[ $VERBOSE -gt 0 ]] && SHOWERR=1
+
+ rm -f $TMPMSG
+ for CMD in $(echo $STATE | sed -e "s/:/ /g")
+ do
+ TFILE=$CGRP/cgroup.procs
+ SFILE=$CGRP/cgroup.subtree_control
+ PFILE=$CGRP/cpuset.cpus.partition
+ CFILE=$CGRP/cpuset.cpus
+ XFILE=$CGRP/cpuset.cpus.exclusive
+ case $CMD in
+ S*) PREFIX=${CMD#?}
+ COMM="echo ${PREFIX}${CTRL} > $SFILE"
+ eval $COMM $REDIRECT
+ ;;
+ X*)
+ CPUS=${CMD#?}
+ COMM="echo $CPUS > $XFILE"
+ eval $COMM $REDIRECT
+ ;;
+ CX*)
+ CPUS=${CMD#??}
+ COMM="echo $CPUS > $CFILE; echo $CPUS > $XFILE"
+ eval $COMM $REDIRECT
+ ;;
+ C*) CPUS=${CMD#?}
+ COMM="echo $CPUS > $CFILE"
+ eval $COMM $REDIRECT
+ ;;
+ P*) VAL=${CMD#?}
+ case $VAL in
+ 0) VAL=member
+ ;;
+ 1) VAL=root
+ ;;
+ 2) VAL=isolated
+ ;;
+ *)
+ echo "Invalid partition state - $VAL"
+ exit 1
+ ;;
+ esac
+ COMM="echo $VAL > $PFILE"
+ eval $COMM $REDIRECT
+ ;;
+ O*) VAL=${CMD#?}
+ write_cpu_online $VAL
+ ;;
+ T*) COMM="echo 0 > $TFILE"
+ eval $COMM $REDIRECT
+ ;;
+ *) echo "Unknown command: $CMD"
+ exit 1
+ ;;
+ esac
+ RET=$?
+ [[ $RET -ne 0 ]] && {
+ [[ -n "$SHOWERR" ]] && {
+ echo "$COMM"
+ cat $TMPMSG
+ }
+ HASERR=1
+ }
+ pause 0.01
+ rm -f $TMPMSG
+ done
+ return $HASERR
+}
+
+set_ctrl_state_noerr()
+{
+ CGRP=$1
+ STATE=$2
+ [[ -d $CGRP ]] || mkdir $CGRP
+ set_ctrl_state $CGRP $STATE 1
+ [[ $? -ne 0 ]] && {
+ echo "ERROR: Failed to set $2 to cgroup $1!"
+ exit 1
+ }
+}
+
+online_cpus()
+{
+ [[ -n "OFFLINE_CPUS" ]] && {
+ for C in $OFFLINE_CPUS
+ do
+ write_cpu_online ${C}=1
+ done
+ }
+}
+
+#
+# Remove all the test cgroup directories
+#
+reset_cgroup_states()
+{
+ echo 0 > $CGROUP2/cgroup.procs
+ online_cpus
+ rmdir $RESET_LIST > /dev/null 2>&1
+}
+
+dump_states()
+{
+ for DIR in $CGROUP_LIST
+ do
+ CPUS=$DIR/cpuset.cpus
+ ECPUS=$DIR/cpuset.cpus.effective
+ XCPUS=$DIR/cpuset.cpus.exclusive
+ XECPUS=$DIR/cpuset.cpus.exclusive.effective
+ PRS=$DIR/cpuset.cpus.partition
+ PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions
+ ISCPUS=$DIR/cpuset.cpus.isolated
+ [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)"
+ [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)"
+ [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)"
+ [[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)"
+ [[ -e $PRS ]] && echo "$PRS: $(cat $PRS)"
+ [[ -e $PCPUS ]] && echo "$PCPUS: $(cat $PCPUS)"
+ [[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)"
+ done
+}
+
+#
+# Set the actual cgroup directory into $CGRP_DIR
+# $1 - cgroup name
+#
+set_cgroup_dir()
+{
+ CGRP_DIR=$1
+ [[ $CGRP_DIR = A2 ]] && CGRP_DIR=A1/A2
+ [[ $CGRP_DIR = A3 ]] && CGRP_DIR=A1/A2/A3
+ [[ $CGRP_DIR = c11 ]] && CGRP_DIR=p1/c11
+ [[ $CGRP_DIR = c12 ]] && CGRP_DIR=p1/c12
+ [[ $CGRP_DIR = c21 ]] && CGRP_DIR=p2/c21
+ [[ $CGRP_DIR = c22 ]] && CGRP_DIR=p2/c22
+}
+
+#
+# Check effective cpus
+# $1 - check string, format: <cgroup>:<cpu-list>[|<cgroup>:<cpu-list>]*
+#
+check_effective_cpus()
+{
+ CHK_STR=$1
+ for CHK in $(echo $CHK_STR | sed -e "s/|/ /g")
+ do
+ set -- $(echo $CHK | sed -e "s/:/ /g")
+ CGRP=$1
+ EXPECTED_CPUS=$2
+ ACTUAL_CPUS=
+ if [[ $CGRP = X* ]]
+ then
+ CGRP=${CGRP#X}
+ FILE=cpuset.cpus.exclusive.effective
+ else
+ FILE=cpuset.cpus.effective
+ fi
+ set_cgroup_dir $CGRP
+ [[ -e $CGRP_DIR/$FILE ]] || return 1
+ ACTUAL_CPUS=$(cat $CGRP_DIR/$FILE)
+ [[ $EXPECTED_CPUS = $ACTUAL_CPUS ]] || return 1
+ done
+}
+
+#
+# Check cgroup states
+# $1 - check string, format: <cgroup>:<state>[|<cgroup>:<state>]*
+#
+check_cgroup_states()
+{
+ CHK_STR=$1
+ for CHK in $(echo $CHK_STR | sed -e "s/|/ /g")
+ do
+ set -- $(echo $CHK | sed -e "s/:/ /g")
+ CGRP=$1
+ EXPECTED_STATE=$2
+ FILE=
+ EVAL=$(expr substr $EXPECTED_STATE 2 2)
+
+ set_cgroup_dir $CGRP
+ case $EXPECTED_STATE in
+ P*) FILE=$CGRP_DIR/cpuset.cpus.partition
+ ;;
+ *) echo "Unknown state: $EXPECTED_STATE!"
+ exit 1
+ ;;
+ esac
+ ACTUAL_STATE=$(cat $FILE)
+
+ case "$ACTUAL_STATE" in
+ member) VAL=0
+ ;;
+ root) VAL=1
+ ;;
+ isolated)
+ VAL=2
+ ;;
+ "root invalid"*)
+ VAL=-1
+ ;;
+ "isolated invalid"*)
+ VAL=-2
+ ;;
+ esac
+ [[ $EVAL != $VAL ]] && return 1
+
+ #
+ # For root partition, dump sched-domains info to console if
+ # verbose mode set for manual comparison with sched debug info.
+ #
+ [[ $VAL -eq 1 && $VERBOSE -gt 0 ]] && {
+ DOMS=$(cat $CGRP_DIR/cpuset.cpus.effective)
+ [[ -n "$DOMS" ]] &&
+ echo " [$CGRP_DIR] sched-domain: $DOMS" > $CONSOLE
+ }
+ done
+ return 0
+}
+
+#
+# Get isolated (including offline) CPUs by looking at
+# /sys/kernel/debug/sched/domains and cpuset.cpus.isolated control file,
+# if available, and compare that with the expected value.
+#
+# Note that isolated CPUs from the sched/domains context include offline
+# CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may
+# not be included in the cpuset.cpus.isolated control file which contains
+# only CPUs in isolated partitions as well as those that are isolated at
+# boot time.
+#
+# $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>}
+# <isolcpus1> - expected sched/domains value
+# <isolcpus2> - cpuset.cpus.isolated value = <isolcpus1> if not defined
+#
+check_isolcpus()
+{
+ EXPECTED_ISOLCPUS=$1
+ ISCPUS=${CGROUP2}/cpuset.cpus.isolated
+ ISOLCPUS=$(cat $ISCPUS)
+ LASTISOLCPU=
+ SCHED_DOMAINS=/sys/kernel/debug/sched/domains
+ if [[ $EXPECTED_ISOLCPUS = . ]]
+ then
+ EXPECTED_ISOLCPUS=
+ EXPECTED_SDOMAIN=
+ elif [[ $(expr $EXPECTED_ISOLCPUS : ".*|.*") > 0 ]]
+ then
+ set -- $(echo $EXPECTED_ISOLCPUS | sed -e "s/|/ /g")
+ EXPECTED_ISOLCPUS=$2
+ EXPECTED_SDOMAIN=$1
+ else
+ EXPECTED_SDOMAIN=$EXPECTED_ISOLCPUS
+ fi
+
+ #
+ # Appending pre-isolated CPUs
+ # Even though CPU #8 isn't used for testing, it can't be pre-isolated
+ # to make appending those CPUs easier.
+ #
+ [[ -n "$BOOT_ISOLCPUS" ]] && {
+ EXPECTED_ISOLCPUS=${EXPECTED_ISOLCPUS:+${EXPECTED_ISOLCPUS},}${BOOT_ISOLCPUS}
+ EXPECTED_SDOMAIN=${EXPECTED_SDOMAIN:+${EXPECTED_SDOMAIN},}${BOOT_ISOLCPUS}
+ }
+
+ #
+ # Check cpuset.cpus.isolated cpumask
+ #
+ [[ "$EXPECTED_ISOLCPUS" != "$ISOLCPUS" ]] && {
+ # Take a 50ms pause and try again
+ pause 0.05
+ ISOLCPUS=$(cat $ISCPUS)
+ }
+ [[ "$EXPECTED_ISOLCPUS" != "$ISOLCPUS" ]] && return 1
+ ISOLCPUS=
+ EXPECTED_ISOLCPUS=$EXPECTED_SDOMAIN
+
+ #
+ # Use the sched domain in debugfs to check isolated CPUs, if available
+ #
+ [[ -d $SCHED_DOMAINS ]] || return 0
+
+ for ((CPU=0; CPU < $NR_CPUS; CPU++))
+ do
+ [[ -n "$(ls ${SCHED_DOMAINS}/cpu$CPU)" ]] && continue
+
+ if [[ -z "$LASTISOLCPU" ]]
+ then
+ ISOLCPUS=$CPU
+ LASTISOLCPU=$CPU
+ elif [[ "$LASTISOLCPU" -eq $((CPU - 1)) ]]
+ then
+ echo $ISOLCPUS | grep -q "\<$LASTISOLCPU\$"
+ if [[ $? -eq 0 ]]
+ then
+ ISOLCPUS=${ISOLCPUS}-
+ fi
+ LASTISOLCPU=$CPU
+ else
+ if [[ $ISOLCPUS = *- ]]
+ then
+ ISOLCPUS=${ISOLCPUS}$LASTISOLCPU
+ fi
+ ISOLCPUS=${ISOLCPUS},$CPU
+ LASTISOLCPU=$CPU
+ fi
+ done
+ [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU
+
+ [[ "$EXPECTED_SDOMAIN" = "$ISOLCPUS" ]]
+}
+
+test_fail()
+{
+ TESTNUM=$1
+ TESTTYPE=$2
+ ADDINFO=$3
+ echo "Test $TEST[$TESTNUM] failed $TESTTYPE check!"
+ [[ -n "$ADDINFO" ]] && echo "*** $ADDINFO ***"
+ eval echo \${$TEST[$I]}
+ echo
+ dump_states
+ exit 1
+}
+
+#
+# Check to see if there are unexpected isolated CPUs left beyond the boot
+# time isolated ones.
+#
+null_isolcpus_check()
+{
+ [[ $VERBOSE -gt 0 ]] || return 0
+ # Retry a few times before printing error
+ RETRY=0
+ while [[ $RETRY -lt 8 ]]
+ do
+ pause 0.02
+ check_isolcpus "."
+ [[ $? -eq 0 ]] && return 0
+ ((RETRY++))
+ done
+ echo "Unexpected isolated CPUs: $ISOLCPUS"
+ dump_states
+ exit 1
+}
+
+#
+# Check state transition test result
+# $1 - Test number
+# $2 - Expected effective CPU values
+# $3 - Expected partition states
+# $4 - Expected isolated CPUs
+#
+check_test_results()
+{
+ _NR=$1
+ _ECPUS="$2"
+ _PSTATES="$3"
+ _ISOLCPUS="$4"
+
+ [[ -n "$_ECPUS" && "$_ECPUS" != . ]] && {
+ check_effective_cpus $_ECPUS
+ [[ $? -ne 0 ]] && test_fail $_NR "effective CPU" \
+ "Cgroup $CGRP: expected $EXPECTED_CPUS, got $ACTUAL_CPUS"
+ }
+
+ [[ -n "$_PSTATES" && "$_PSTATES" != . ]] && {
+ check_cgroup_states $_PSTATES
+ [[ $? -ne 0 ]] && test_fail $_NR states \
+ "Cgroup $CGRP: expected $EXPECTED_STATE, got $ACTUAL_STATE"
+ }
+
+ # Compare the expected isolated CPUs with the actual ones,
+ # if available
+ [[ -n "$_ISOLCPUS" ]] && {
+ check_isolcpus $_ISOLCPUS
+ [[ $? -ne 0 ]] && {
+ [[ -n "$BOOT_ISOLCPUS" ]] && _ISOLCPUS=${_ISOLCPUS},${BOOT_ISOLCPUS}
+ test_fail $_NR "isolated CPU" \
+ "Expect $_ISOLCPUS, get $ISOLCPUS instead"
+ }
+ }
+ reset_cgroup_states
+ #
+ # Check to see if effective cpu list changes
+ #
+ _NEWLIST=$(cat $CGROUP2/cpuset.cpus.effective)
+ RETRY=0
+ while [[ $_NEWLIST != $CPULIST && $RETRY -lt 8 ]]
+ do
+ # Wait a bit longer & recheck a few times
+ pause 0.02
+ ((RETRY++))
+ _NEWLIST=$(cat $CGROUP2/cpuset.cpus.effective)
+ done
+ [[ $_NEWLIST != $CPULIST ]] && {
+ echo "Effective cpus changed to $_NEWLIST after test $_NR!"
+ exit 1
+ }
+ null_isolcpus_check
+ [[ $VERBOSE -gt 0 ]] && echo "Test $I done."
+}
+
+#
+# Run cpuset state transition test
+# $1 - test matrix name
+#
+# This test is somewhat fragile as delays (sleep x) are added in various
+# places to make sure state changes are fully propagated before the next
+# action. These delays may need to be adjusted if running in a slower machine.
+#
+run_state_test()
+{
+ TEST=$1
+ CONTROLLER=cpuset
+ CGROUP_LIST=". A1 A1/A2 A1/A2/A3 B1"
+ RESET_LIST="A1/A2/A3 A1/A2 A1 B1"
+ I=0
+ eval CNT="\${#$TEST[@]}"
+
+ reset_cgroup_states
+ console_msg "Running state transition test ..."
+
+ while [[ $I -lt $CNT ]]
+ do
+ echo "Running test $I ..." > $CONSOLE
+ [[ $VERBOSE -gt 1 ]] && {
+ echo ""
+ eval echo \${$TEST[$I]}
+ }
+ eval set -- "\${$TEST[$I]}"
+ OLD_A1=$1
+ OLD_A2=$2
+ OLD_A3=$3
+ OLD_B1=$4
+ NEW_A1=$5
+ NEW_A2=$6
+ NEW_A3=$7
+ NEW_B1=$8
+ RESULT=$9
+ ECPUS=${10}
+ STATES=${11}
+ ICPUS=${12}
+
+ set_ctrl_state_noerr A1 $OLD_A1
+ set_ctrl_state_noerr A1/A2 $OLD_A2
+ set_ctrl_state_noerr A1/A2/A3 $OLD_A3
+ set_ctrl_state_noerr B1 $OLD_B1
+
+ RETVAL=0
+ set_ctrl_state A1 $NEW_A1; ((RETVAL += $?))
+ set_ctrl_state A1/A2 $NEW_A2; ((RETVAL += $?))
+ set_ctrl_state A1/A2/A3 $NEW_A3; ((RETVAL += $?))
+ set_ctrl_state B1 $NEW_B1; ((RETVAL += $?))
+
+ [[ $RETVAL -ne $RESULT ]] && test_fail $I result
+
+ check_test_results $I "$ECPUS" "$STATES" "$ICPUS"
+ ((I++))
+ done
+ echo "All $I tests of $TEST PASSED."
+}
+
+#
+# Run cpuset remote partition state transition test
+# $1 - test matrix name
+#
+run_remote_state_test()
+{
+ TEST=$1
+ CONTROLLER=cpuset
+ [[ -d rtest ]] || mkdir rtest
+ cd rtest
+ echo +cpuset > cgroup.subtree_control
+ echo "1-7" > cpuset.cpus
+ echo "1-7" > cpuset.cpus.exclusive
+ CGROUP_LIST=".. . p1 p2 p1/c11 p1/c12 p2/c21 p2/c22"
+ RESET_LIST="p1/c11 p1/c12 p2/c21 p2/c22 p1 p2"
+ I=0
+ eval CNT="\${#$TEST[@]}"
+
+ reset_cgroup_states
+ console_msg "Running remote partition state transition test ..."
+
+ while [[ $I -lt $CNT ]]
+ do
+ echo "Running test $I ..." > $CONSOLE
+ [[ $VERBOSE -gt 1 ]] && {
+ echo ""
+ eval echo \${$TEST[$I]}
+ }
+ eval set -- "\${$TEST[$I]}"
+ OLD_p1=$1
+ OLD_p2=$2
+ OLD_c11=$3
+ OLD_c12=$4
+ OLD_c21=$5
+ OLD_c22=$6
+ NEW_p1=$7
+ NEW_p2=$8
+ NEW_c11=$9
+ NEW_c12=${10}
+ NEW_c21=${11}
+ NEW_c22=${12}
+ ECPUS=${13}
+ STATES=${14}
+ ICPUS=${15}
+
+ set_ctrl_state_noerr p1 $OLD_p1
+ set_ctrl_state_noerr p2 $OLD_p2
+ set_ctrl_state_noerr p1/c11 $OLD_c11
+ set_ctrl_state_noerr p1/c12 $OLD_c12
+ set_ctrl_state_noerr p2/c21 $OLD_c21
+ set_ctrl_state_noerr p2/c22 $OLD_c22
+
+ RETVAL=0
+ set_ctrl_state p1 $NEW_p1 ; ((RETVAL += $?))
+ set_ctrl_state p2 $NEW_p2 ; ((RETVAL += $?))
+ set_ctrl_state p1/c11 $NEW_c11; ((RETVAL += $?))
+ set_ctrl_state p1/c12 $NEW_c12; ((RETVAL += $?))
+ set_ctrl_state p2/c21 $NEW_c21; ((RETVAL += $?))
+ set_ctrl_state p2/c22 $NEW_c22; ((RETVAL += $?))
+
+ [[ $RETVAL -ne 0 ]] && test_fail $I result
+
+ check_test_results $I "$ECPUS" "$STATES" "$ICPUS"
+ ((I++))
+ done
+ cd ..
+ rmdir rtest
+ echo "All $I tests of $TEST PASSED."
+}
+
+#
+# Testing the new "isolated" partition root type
+#
+test_isolated()
+{
+ cd $CGROUP2/test
+ echo 2-3 > cpuset.cpus
+ TYPE=$(cat cpuset.cpus.partition)
+ [[ $TYPE = member ]] || echo member > cpuset.cpus.partition
+
+ console_msg "Change from member to root"
+ test_partition root
+
+ console_msg "Change from root to isolated"
+ test_partition isolated
+
+ console_msg "Change from isolated to member"
+ test_partition member
+
+ console_msg "Change from member to isolated"
+ test_partition isolated
+
+ console_msg "Change from isolated to root"
+ test_partition root
+
+ console_msg "Change from root to member"
+ test_partition member
+
+ #
+ # Testing partition root with no cpu
+ #
+ console_msg "Distribute all cpus to child partition"
+ echo +cpuset > cgroup.subtree_control
+ test_partition root
+
+ mkdir A1
+ cd A1
+ echo 2-3 > cpuset.cpus
+ test_partition root
+ test_effective_cpus 2-3
+ cd ..
+ test_effective_cpus ""
+
+ console_msg "Moving task to partition test"
+ test_add_proc "No space left"
+ cd A1
+ test_add_proc ""
+ cd ..
+
+ console_msg "Shrink and expand child partition"
+ cd A1
+ echo 2 > cpuset.cpus
+ cd ..
+ test_effective_cpus 3
+ cd A1
+ echo 2-3 > cpuset.cpus
+ cd ..
+ test_effective_cpus ""
+
+ # Cleaning up
+ console_msg "Cleaning up"
+ echo $$ > $CGROUP2/cgroup.procs
+ [[ -d A1 ]] && rmdir A1
+ null_isolcpus_check
+ pause 0.05
+}
+
+#
+# Wait for inotify event for the given file and read it
+# $1: cgroup file to wait for
+# $2: file to store the read result
+#
+wait_inotify()
+{
+ CGROUP_FILE=$1
+ OUTPUT_FILE=$2
+
+ $WAIT_INOTIFY $CGROUP_FILE
+ cat $CGROUP_FILE > $OUTPUT_FILE
+}
+
+#
+# Test if inotify events are properly generated when going into and out of
+# invalid partition state.
+#
+test_inotify()
+{
+ ERR=0
+ PRS=/tmp/.prs_$$
+ cd $CGROUP2/test
+ [[ -f $WAIT_INOTIFY ]] || {
+ echo "wait_inotify not found, inotify test SKIPPED."
+ return
+ }
+
+ pause 0.01
+ echo 1 > cpuset.cpus
+ echo 0 > cgroup.procs
+ echo root > cpuset.cpus.partition
+ pause 0.01
+ rm -f $PRS
+ wait_inotify $PWD/cpuset.cpus.partition $PRS &
+ pause 0.01
+ set_ctrl_state . "O1=0"
+ pause 0.01
+ check_cgroup_states ".:P-1"
+ if [[ $? -ne 0 ]]
+ then
+ echo "FAILED: Inotify test - partition not invalid"
+ ERR=1
+ elif [[ ! -f $PRS ]]
+ then
+ echo "FAILED: Inotify test - event not generated"
+ ERR=1
+ kill %1
+ elif [[ $(cat $PRS) != "root invalid"* ]]
+ then
+ echo "FAILED: Inotify test - incorrect state"
+ cat $PRS
+ ERR=1
+ fi
+ online_cpus
+ echo member > cpuset.cpus.partition
+ echo 0 > ../cgroup.procs
+ if [[ $ERR -ne 0 ]]
+ then
+ exit 1
+ else
+ echo "Inotify test PASSED"
+ fi
+ echo member > cpuset.cpus.partition
+ echo "" > cpuset.cpus
+}
+
+trap cleanup 0 2 3 6
+run_state_test TEST_MATRIX
+run_remote_state_test REMOTE_TEST_MATRIX
+test_isolated
+test_inotify
+echo "All tests PASSED."
diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh
new file mode 100755
index 000000000000..42a6628fb8bc
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Basc test for cpuset v1 interfaces write/read
+#
+
+skip_test() {
+ echo "$1"
+ echo "Test SKIPPED"
+ exit 4 # ksft_skip
+}
+
+write_test() {
+ dir=$1
+ interface=$2
+ value=$3
+ original=$(cat $dir/$interface)
+ echo "testing $interface $value"
+ echo $value > $dir/$interface
+ new=$(cat $dir/$interface)
+ [[ $value -ne $(cat $dir/$interface) ]] && {
+ echo "$interface write $value failed: new:$new"
+ exit 1
+ }
+}
+
+[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
+
+# Find cpuset v1 mount point
+CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}')
+[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!"
+
+#
+# Create a test cpuset, read write test
+#
+TDIR=test$$
+[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR
+
+ITF_MATRIX=(
+ #interface value expect root_only
+ 'cpuset.cpus 0-1 0-1 0'
+ 'cpuset.mem_exclusive 1 1 0'
+ 'cpuset.mem_exclusive 0 0 0'
+ 'cpuset.mem_hardwall 1 1 0'
+ 'cpuset.mem_hardwall 0 0 0'
+ 'cpuset.memory_migrate 1 1 0'
+ 'cpuset.memory_migrate 0 0 0'
+ 'cpuset.memory_spread_page 1 1 0'
+ 'cpuset.memory_spread_page 0 0 0'
+ 'cpuset.memory_spread_slab 1 1 0'
+ 'cpuset.memory_spread_slab 0 0 0'
+ 'cpuset.mems 0 0 0'
+ 'cpuset.sched_load_balance 1 1 0'
+ 'cpuset.sched_load_balance 0 0 0'
+ 'cpuset.sched_relax_domain_level 2 2 0'
+ 'cpuset.memory_pressure_enabled 1 1 1'
+ 'cpuset.memory_pressure_enabled 0 0 1'
+)
+
+run_test()
+{
+ cnt="${ITF_MATRIX[@]}"
+ for i in "${ITF_MATRIX[@]}" ; do
+ args=($i)
+ root_only=${args[3]}
+ [[ $root_only -eq 1 ]] && {
+ write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}"
+ continue
+ }
+ write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}"
+ done
+}
+
+run_test
+rmdir $CPUSET/$TDIR
+echo "Test PASSED"
+exit 0
diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh
new file mode 100755
index 000000000000..7406c24be1ac
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test the special cpuset v1 hotplug case where a cpuset become empty of
+# CPUs will force migration of tasks out to an ancestor.
+#
+
+skip_test() {
+ echo "$1"
+ echo "Test SKIPPED"
+ exit 4 # ksft_skip
+}
+
+[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
+
+# Find cpuset v1 mount point
+CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk -e '{print $3}')
+[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!"
+
+#
+# Create a test cpuset, put a CPU and a task there and offline that CPU
+#
+TDIR=test$$
+[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR
+echo 1 > $CPUSET/$TDIR/cpuset.cpus
+echo 0 > $CPUSET/$TDIR/cpuset.mems
+sleep 10&
+TASK=$!
+echo $TASK > $CPUSET/$TDIR/tasks
+NEWCS=$(cat /proc/$TASK/cpuset)
+[[ $NEWCS != "/$TDIR" ]] && {
+ echo "Unexpected cpuset $NEWCS, test FAILED!"
+ exit 1
+}
+
+echo 0 > /sys/devices/system/cpu/cpu1/online
+sleep 0.5
+echo 1 > /sys/devices/system/cpu/cpu1/online
+NEWCS=$(cat /proc/$TASK/cpuset)
+rmdir $CPUSET/$TDIR
+[[ $NEWCS != "/" ]] && {
+ echo "cpuset $NEWCS, test FAILED!"
+ exit 1
+}
+echo "Test PASSED"
+exit 0
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c
index 23d8fa4a3e4e..8730645d363a 100644
--- a/tools/testing/selftests/cgroup/test_freezer.c
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -7,9 +7,7 @@
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
-#include <poll.h>
#include <stdlib.h>
-#include <sys/inotify.h>
#include <string.h>
#include <sys/wait.h>
@@ -55,61 +53,6 @@ static int cg_freeze_nowait(const char *cgroup, bool freeze)
}
/*
- * Prepare for waiting on cgroup.events file.
- */
-static int cg_prepare_for_wait(const char *cgroup)
-{
- int fd, ret = -1;
-
- fd = inotify_init1(0);
- if (fd == -1) {
- debug("Error: inotify_init1() failed\n");
- return fd;
- }
-
- ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"),
- IN_MODIFY);
- if (ret == -1) {
- debug("Error: inotify_add_watch() failed\n");
- close(fd);
- fd = -1;
- }
-
- return fd;
-}
-
-/*
- * Wait for an event. If there are no events for 10 seconds,
- * treat this an error.
- */
-static int cg_wait_for(int fd)
-{
- int ret = -1;
- struct pollfd fds = {
- .fd = fd,
- .events = POLLIN,
- };
-
- while (true) {
- ret = poll(&fds, 1, 10000);
-
- if (ret == -1) {
- if (errno == EINTR)
- continue;
- debug("Error: poll() failed\n");
- break;
- }
-
- if (ret > 0 && fds.revents & POLLIN) {
- ret = 0;
- break;
- }
- }
-
- return ret;
-}
-
-/*
* Attach a task to the given cgroup and wait for a cgroup frozen event.
* All transient events (e.g. populated) are ignored.
*/
@@ -797,7 +740,7 @@ static int test_cgfreezer_ptraced(const char *root)
/*
* cg_check_frozen(cgroup, true) will fail here,
- * because the task in in the TRACEd state.
+ * because the task is in the TRACEd state.
*/
if (cg_freeze_wait(cgroup, false))
goto cleanup;
@@ -884,7 +827,7 @@ int main(int argc, char *argv[])
char root[PATH_MAX];
int i, ret = EXIT_SUCCESS;
- if (cg_find_unified_root(root, sizeof(root)))
+ if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
for (i = 0; i < ARRAY_SIZE(tests); i++) {
switch (tests[i].fn(root)) {
diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c
new file mode 100644
index 000000000000..856f9508ea56
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+#define ADDR ((void *)(0x0UL))
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+/* mapping 8 MBs == 4 hugepages */
+#define LENGTH (8UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* borrowed from mm/hmm-tests.c */
+static long get_hugepage_size(void)
+{
+ int fd;
+ char buf[2048];
+ int len;
+ char *p, *q, *path = "/proc/meminfo", *tag = "Hugepagesize:";
+ long val;
+
+ fd = open(path, O_RDONLY);
+ if (fd < 0) {
+ /* Error opening the file */
+ return -1;
+ }
+
+ len = read(fd, buf, sizeof(buf));
+ close(fd);
+ if (len < 0) {
+ /* Error in reading the file */
+ return -1;
+ }
+ if (len == sizeof(buf)) {
+ /* Error file is too large */
+ return -1;
+ }
+ buf[len] = '\0';
+
+ /* Search for a tag if provided */
+ if (tag) {
+ p = strstr(buf, tag);
+ if (!p)
+ return -1; /* looks like the line we want isn't there */
+ p += strlen(tag);
+ } else
+ p = buf;
+
+ val = strtol(p, &q, 0);
+ if (*q != ' ') {
+ /* Error parsing the file */
+ return -1;
+ }
+
+ return val;
+}
+
+static int set_file(const char *path, long value)
+{
+ FILE *file;
+ int ret;
+
+ file = fopen(path, "w");
+ if (!file)
+ return -1;
+ ret = fprintf(file, "%ld\n", value);
+ fclose(file);
+ return ret;
+}
+
+static int set_nr_hugepages(long value)
+{
+ return set_file("/proc/sys/vm/nr_hugepages", value);
+}
+
+static unsigned int check_first(char *addr)
+{
+ return *(unsigned int *)addr;
+}
+
+static void write_data(char *addr)
+{
+ unsigned long i;
+
+ for (i = 0; i < LENGTH; i++)
+ *(addr + i) = (char)i;
+}
+
+static int hugetlb_test_program(const char *cgroup, void *arg)
+{
+ char *test_group = (char *)arg;
+ void *addr;
+ long old_current, expected_current, current;
+ int ret = EXIT_FAILURE;
+
+ old_current = cg_read_long(test_group, "memory.current");
+ set_nr_hugepages(20);
+ current = cg_read_long(test_group, "memory.current");
+ if (current - old_current >= MB(2)) {
+ ksft_print_msg(
+ "setting nr_hugepages should not increase hugepage usage.\n");
+ ksft_print_msg("before: %ld, after: %ld\n", old_current, current);
+ return EXIT_FAILURE;
+ }
+
+ addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0);
+ if (addr == MAP_FAILED) {
+ ksft_print_msg("fail to mmap.\n");
+ return EXIT_FAILURE;
+ }
+ current = cg_read_long(test_group, "memory.current");
+ if (current - old_current >= MB(2)) {
+ ksft_print_msg("mmap should not increase hugepage usage.\n");
+ ksft_print_msg("before: %ld, after: %ld\n", old_current, current);
+ goto out_failed_munmap;
+ }
+ old_current = current;
+
+ /* read the first page */
+ check_first(addr);
+ expected_current = old_current + MB(2);
+ current = cg_read_long(test_group, "memory.current");
+ if (!values_close(expected_current, current, 5)) {
+ ksft_print_msg("memory usage should increase by around 2MB.\n");
+ ksft_print_msg(
+ "expected memory: %ld, actual memory: %ld\n",
+ expected_current, current);
+ goto out_failed_munmap;
+ }
+
+ /* write to the whole range */
+ write_data(addr);
+ current = cg_read_long(test_group, "memory.current");
+ expected_current = old_current + MB(8);
+ if (!values_close(expected_current, current, 5)) {
+ ksft_print_msg("memory usage should increase by around 8MB.\n");
+ ksft_print_msg(
+ "expected memory: %ld, actual memory: %ld\n",
+ expected_current, current);
+ goto out_failed_munmap;
+ }
+
+ /* unmap the whole range */
+ munmap(addr, LENGTH);
+ current = cg_read_long(test_group, "memory.current");
+ expected_current = old_current;
+ if (!values_close(expected_current, current, 5)) {
+ ksft_print_msg("memory usage should go back down.\n");
+ ksft_print_msg(
+ "expected memory: %ld, actual memory: %ld\n",
+ expected_current, current);
+ return ret;
+ }
+
+ ret = EXIT_SUCCESS;
+ return ret;
+
+out_failed_munmap:
+ munmap(addr, LENGTH);
+ return ret;
+}
+
+static int test_hugetlb_memcg(char *root)
+{
+ int ret = KSFT_FAIL;
+ char *test_group;
+
+ test_group = cg_name(root, "hugetlb_memcg_test");
+ if (!test_group || cg_create(test_group)) {
+ ksft_print_msg("fail to create cgroup.\n");
+ goto out;
+ }
+
+ if (cg_write(test_group, "memory.max", "100M")) {
+ ksft_print_msg("fail to set cgroup memory limit.\n");
+ goto out;
+ }
+
+ /* disable swap */
+ if (cg_write(test_group, "memory.swap.max", "0")) {
+ ksft_print_msg("fail to disable swap.\n");
+ goto out;
+ }
+
+ if (!cg_run(test_group, hugetlb_test_program, (void *)test_group))
+ ret = KSFT_PASS;
+out:
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+ int ret = EXIT_SUCCESS, has_memory_hugetlb_acc;
+
+ has_memory_hugetlb_acc = proc_mount_contains("memory_hugetlb_accounting");
+ if (has_memory_hugetlb_acc < 0)
+ ksft_exit_skip("Failed to query cgroup mount option\n");
+ else if (!has_memory_hugetlb_acc)
+ ksft_exit_skip("memory hugetlb accounting is disabled\n");
+
+ /* Unit is kB! */
+ if (get_hugepage_size() != 2048) {
+ ksft_print_msg("test_hugetlb_memcg requires 2MB hugepages\n");
+ ksft_test_result_skip("test_hugetlb_memcg\n");
+ return ret;
+ }
+
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ switch (test_hugetlb_memcg(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("test_hugetlb_memcg\n");
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("test_hugetlb_memcg\n");
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("test_hugetlb_memcg\n");
+ break;
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c
new file mode 100644
index 000000000000..0e5bb6c7307a
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_kill.c
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <errno.h>
+#include <linux/limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "../pidfd/pidfd.h"
+#include "cgroup_util.h"
+
+/*
+ * Kill the given cgroup and wait for the inotify signal.
+ * If there are no events in 10 seconds, treat this as an error.
+ * Then check that the cgroup is in the desired state.
+ */
+static int cg_kill_wait(const char *cgroup)
+{
+ int fd, ret = -1;
+
+ fd = cg_prepare_for_wait(cgroup);
+ if (fd < 0)
+ return fd;
+
+ ret = cg_write(cgroup, "cgroup.kill", "1");
+ if (ret)
+ goto out;
+
+ ret = cg_wait_for(fd);
+ if (ret)
+ goto out;
+
+out:
+ close(fd);
+ return ret;
+}
+
+/*
+ * A simple process running in a sleep loop until being
+ * re-parented.
+ */
+static int child_fn(const char *cgroup, void *arg)
+{
+ int ppid = getppid();
+
+ while (getppid() == ppid)
+ usleep(1000);
+
+ return getppid() == ppid;
+}
+
+static int test_cgkill_simple(const char *root)
+{
+ pid_t pids[100];
+ int ret = KSFT_FAIL;
+ char *cgroup = NULL;
+ int i;
+
+ cgroup = cg_name(root, "cg_test_simple");
+ if (!cgroup)
+ goto cleanup;
+
+ if (cg_create(cgroup))
+ goto cleanup;
+
+ for (i = 0; i < 100; i++)
+ pids[i] = cg_run_nowait(cgroup, child_fn, NULL);
+
+ if (cg_wait_for_proc_count(cgroup, 100))
+ goto cleanup;
+
+ if (cg_read_strcmp(cgroup, "cgroup.events", "populated 1\n"))
+ goto cleanup;
+
+ if (cg_kill_wait(cgroup))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ for (i = 0; i < 100; i++)
+ wait_for_pid(pids[i]);
+
+ if (ret == KSFT_PASS &&
+ cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+ ret = KSFT_FAIL;
+
+ if (cgroup)
+ cg_destroy(cgroup);
+ free(cgroup);
+ return ret;
+}
+
+/*
+ * The test creates the following hierarchy:
+ * A
+ * / / \ \
+ * B E I K
+ * /\ |
+ * C D F
+ * |
+ * G
+ * |
+ * H
+ *
+ * with a process in C, H and 3 processes in K.
+ * Then it tries to kill the whole tree.
+ */
+static int test_cgkill_tree(const char *root)
+{
+ pid_t pids[5];
+ char *cgroup[10] = {0};
+ int ret = KSFT_FAIL;
+ int i;
+
+ cgroup[0] = cg_name(root, "cg_test_tree_A");
+ if (!cgroup[0])
+ goto cleanup;
+
+ cgroup[1] = cg_name(cgroup[0], "B");
+ if (!cgroup[1])
+ goto cleanup;
+
+ cgroup[2] = cg_name(cgroup[1], "C");
+ if (!cgroup[2])
+ goto cleanup;
+
+ cgroup[3] = cg_name(cgroup[1], "D");
+ if (!cgroup[3])
+ goto cleanup;
+
+ cgroup[4] = cg_name(cgroup[0], "E");
+ if (!cgroup[4])
+ goto cleanup;
+
+ cgroup[5] = cg_name(cgroup[4], "F");
+ if (!cgroup[5])
+ goto cleanup;
+
+ cgroup[6] = cg_name(cgroup[5], "G");
+ if (!cgroup[6])
+ goto cleanup;
+
+ cgroup[7] = cg_name(cgroup[6], "H");
+ if (!cgroup[7])
+ goto cleanup;
+
+ cgroup[8] = cg_name(cgroup[0], "I");
+ if (!cgroup[8])
+ goto cleanup;
+
+ cgroup[9] = cg_name(cgroup[0], "K");
+ if (!cgroup[9])
+ goto cleanup;
+
+ for (i = 0; i < 10; i++)
+ if (cg_create(cgroup[i]))
+ goto cleanup;
+
+ pids[0] = cg_run_nowait(cgroup[2], child_fn, NULL);
+ pids[1] = cg_run_nowait(cgroup[7], child_fn, NULL);
+ pids[2] = cg_run_nowait(cgroup[9], child_fn, NULL);
+ pids[3] = cg_run_nowait(cgroup[9], child_fn, NULL);
+ pids[4] = cg_run_nowait(cgroup[9], child_fn, NULL);
+
+ /*
+ * Wait until all child processes will enter
+ * corresponding cgroups.
+ */
+
+ if (cg_wait_for_proc_count(cgroup[2], 1) ||
+ cg_wait_for_proc_count(cgroup[7], 1) ||
+ cg_wait_for_proc_count(cgroup[9], 3))
+ goto cleanup;
+
+ /*
+ * Kill A and check that we get an empty notification.
+ */
+ if (cg_kill_wait(cgroup[0]))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ for (i = 0; i < 5; i++)
+ wait_for_pid(pids[i]);
+
+ if (ret == KSFT_PASS &&
+ cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n"))
+ ret = KSFT_FAIL;
+
+ for (i = 9; i >= 0 && cgroup[i]; i--) {
+ cg_destroy(cgroup[i]);
+ free(cgroup[i]);
+ }
+
+ return ret;
+}
+
+static int forkbomb_fn(const char *cgroup, void *arg)
+{
+ int ppid;
+
+ fork();
+ fork();
+
+ ppid = getppid();
+
+ while (getppid() == ppid)
+ usleep(1000);
+
+ return getppid() == ppid;
+}
+
+/*
+ * The test runs a fork bomb in a cgroup and tries to kill it.
+ */
+static int test_cgkill_forkbomb(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cgroup = NULL;
+ pid_t pid = -ESRCH;
+
+ cgroup = cg_name(root, "cg_forkbomb_test");
+ if (!cgroup)
+ goto cleanup;
+
+ if (cg_create(cgroup))
+ goto cleanup;
+
+ pid = cg_run_nowait(cgroup, forkbomb_fn, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ usleep(100000);
+
+ if (cg_kill_wait(cgroup))
+ goto cleanup;
+
+ if (cg_wait_for_proc_count(cgroup, 0))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (pid > 0)
+ wait_for_pid(pid);
+
+ if (ret == KSFT_PASS &&
+ cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
+ ret = KSFT_FAIL;
+
+ if (cgroup)
+ cg_destroy(cgroup);
+ free(cgroup);
+ return ret;
+}
+
+#define T(x) { x, #x }
+struct cgkill_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_cgkill_simple),
+ T(test_cgkill_tree),
+ T(test_cgkill_forkbomb),
+};
+#undef T
+
+int main(int argc, char *argv[])
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
new file mode 100644
index 000000000000..96693d8772be
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <sys/sysinfo.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+
+/*
+ * Memory cgroup charging is performed using percpu batches 64 pages
+ * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So
+ * the maximum discrepancy between charge and vmstat entries is number
+ * of cpus multiplied by 64 pages.
+ */
+#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs())
+
+
+static int alloc_dcache(const char *cgroup, void *arg)
+{
+ unsigned long i;
+ struct stat st;
+ char buf[128];
+
+ for (i = 0; i < (unsigned long)arg; i++) {
+ snprintf(buf, sizeof(buf),
+ "/something-non-existent-with-a-long-name-%64lu-%d",
+ i, getpid());
+ stat(buf, &st);
+ }
+
+ return 0;
+}
+
+/*
+ * This test allocates 100000 of negative dentries with long names.
+ * Then it checks that "slab" in memory.stat is larger than 1M.
+ * Then it sets memory.high to 1M and checks that at least 1/2
+ * of slab memory has been reclaimed.
+ */
+static int test_kmem_basic(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg = NULL;
+ long slab0, slab1, current;
+
+ cg = cg_name(root, "kmem_basic_test");
+ if (!cg)
+ goto cleanup;
+
+ if (cg_create(cg))
+ goto cleanup;
+
+ if (cg_run(cg, alloc_dcache, (void *)100000))
+ goto cleanup;
+
+ slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
+ if (slab0 < (1 << 20))
+ goto cleanup;
+
+ cg_write(cg, "memory.high", "1M");
+
+ /* wait for RCU freeing */
+ sleep(1);
+
+ slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
+ if (slab1 < 0)
+ goto cleanup;
+
+ current = cg_read_long(cg, "memory.current");
+ if (current < 0)
+ goto cleanup;
+
+ if (slab1 < slab0 / 2 && current < slab0 / 2)
+ ret = KSFT_PASS;
+cleanup:
+ cg_destroy(cg);
+ free(cg);
+
+ return ret;
+}
+
+static void *alloc_kmem_fn(void *arg)
+{
+ alloc_dcache(NULL, (void *)100);
+ return NULL;
+}
+
+static int alloc_kmem_smp(const char *cgroup, void *arg)
+{
+ int nr_threads = 2 * get_nprocs();
+ pthread_t *tinfo;
+ unsigned long i;
+ int ret = -1;
+
+ tinfo = calloc(nr_threads, sizeof(pthread_t));
+ if (tinfo == NULL)
+ return -1;
+
+ for (i = 0; i < nr_threads; i++) {
+ if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
+ (void *)i)) {
+ free(tinfo);
+ return -1;
+ }
+ }
+
+ for (i = 0; i < nr_threads; i++) {
+ ret = pthread_join(tinfo[i], NULL);
+ if (ret)
+ break;
+ }
+
+ free(tinfo);
+ return ret;
+}
+
+static int cg_run_in_subcgroups(const char *parent,
+ int (*fn)(const char *cgroup, void *arg),
+ void *arg, int times)
+{
+ char *child;
+ int i;
+
+ for (i = 0; i < times; i++) {
+ child = cg_name_indexed(parent, "child", i);
+ if (!child)
+ return -1;
+
+ if (cg_create(child)) {
+ cg_destroy(child);
+ free(child);
+ return -1;
+ }
+
+ if (cg_run(child, fn, NULL)) {
+ cg_destroy(child);
+ free(child);
+ return -1;
+ }
+
+ cg_destroy(child);
+ free(child);
+ }
+
+ return 0;
+}
+
+/*
+ * The test creates and destroys a large number of cgroups. In each cgroup it
+ * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
+ * threads. Then it checks the sanity of numbers on the parent level:
+ * the total size of the cgroups should be roughly equal to
+ * anon + file + kernel + sock.
+ */
+static int test_kmem_memcg_deletion(const char *root)
+{
+ long current, anon, file, kernel, sock, sum;
+ int ret = KSFT_FAIL;
+ char *parent;
+
+ parent = cg_name(root, "kmem_memcg_deletion_test");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
+ goto cleanup;
+
+ current = cg_read_long(parent, "memory.current");
+ anon = cg_read_key_long(parent, "memory.stat", "anon ");
+ file = cg_read_key_long(parent, "memory.stat", "file ");
+ kernel = cg_read_key_long(parent, "memory.stat", "kernel ");
+ sock = cg_read_key_long(parent, "memory.stat", "sock ");
+ if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0)
+ goto cleanup;
+
+ sum = anon + file + kernel + sock;
+ if (labs(sum - current) < MAX_VMSTAT_ERROR) {
+ ret = KSFT_PASS;
+ } else {
+ printf("memory.current = %ld\n", current);
+ printf("anon + file + kernel + sock = %ld\n", sum);
+ printf("anon = %ld\n", anon);
+ printf("file = %ld\n", file);
+ printf("kernel = %ld\n", kernel);
+ printf("sock = %ld\n", sock);
+ }
+
+cleanup:
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+/*
+ * The test reads the entire /proc/kpagecgroup. If the operation went
+ * successfully (and the kernel didn't panic), the test is treated as passed.
+ */
+static int test_kmem_proc_kpagecgroup(const char *root)
+{
+ unsigned long buf[128];
+ int ret = KSFT_FAIL;
+ ssize_t len;
+ int fd;
+
+ fd = open("/proc/kpagecgroup", O_RDONLY);
+ if (fd < 0)
+ return ret;
+
+ do {
+ len = read(fd, buf, sizeof(buf));
+ } while (len > 0);
+
+ if (len == 0)
+ ret = KSFT_PASS;
+
+ close(fd);
+ return ret;
+}
+
+static void *pthread_wait_fn(void *arg)
+{
+ sleep(100);
+ return NULL;
+}
+
+static int spawn_1000_threads(const char *cgroup, void *arg)
+{
+ int nr_threads = 1000;
+ pthread_t *tinfo;
+ unsigned long i;
+ long stack;
+ int ret = -1;
+
+ tinfo = calloc(nr_threads, sizeof(pthread_t));
+ if (tinfo == NULL)
+ return -1;
+
+ for (i = 0; i < nr_threads; i++) {
+ if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
+ (void *)i)) {
+ free(tinfo);
+ return(-1);
+ }
+ }
+
+ stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
+ if (stack >= 4096 * 1000)
+ ret = 0;
+
+ free(tinfo);
+ return ret;
+}
+
+/*
+ * The test spawns a process, which spawns 1000 threads. Then it checks
+ * that memory.stat's kernel_stack is at least 1000 pages large.
+ */
+static int test_kmem_kernel_stacks(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg = NULL;
+
+ cg = cg_name(root, "kmem_kernel_stacks_test");
+ if (!cg)
+ goto cleanup;
+
+ if (cg_create(cg))
+ goto cleanup;
+
+ if (cg_run(cg, spawn_1000_threads, NULL))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+cleanup:
+ cg_destroy(cg);
+ free(cg);
+
+ return ret;
+}
+
+/*
+ * This test sequentionally creates 30 child cgroups, allocates some
+ * kernel memory in each of them, and deletes them. Then it checks
+ * that the number of dying cgroups on the parent level is 0.
+ */
+static int test_kmem_dead_cgroups(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent;
+ long dead;
+ int i;
+
+ parent = cg_name(root, "kmem_dead_cgroups_test");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
+ goto cleanup;
+
+ for (i = 0; i < 5; i++) {
+ dead = cg_read_key_long(parent, "cgroup.stat",
+ "nr_dying_descendants ");
+ if (dead == 0) {
+ ret = KSFT_PASS;
+ break;
+ }
+ /*
+ * Reclaiming cgroups might take some time,
+ * let's wait a bit and repeat.
+ */
+ sleep(1);
+ }
+
+cleanup:
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+/*
+ * This test creates a sub-tree with 1000 memory cgroups.
+ * Then it checks that the memory.current on the parent level
+ * is greater than 0 and approximates matches the percpu value
+ * from memory.stat.
+ */
+static int test_percpu_basic(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent, *child;
+ long current, percpu;
+ int i;
+
+ parent = cg_name(root, "percpu_basic_test");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ for (i = 0; i < 1000; i++) {
+ child = cg_name_indexed(parent, "child", i);
+ if (!child)
+ return -1;
+
+ if (cg_create(child))
+ goto cleanup_children;
+
+ free(child);
+ }
+
+ current = cg_read_long(parent, "memory.current");
+ percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
+
+ if (current > 0 && percpu > 0 && labs(current - percpu) <
+ MAX_VMSTAT_ERROR)
+ ret = KSFT_PASS;
+ else
+ printf("memory.current %ld\npercpu %ld\n",
+ current, percpu);
+
+cleanup_children:
+ for (i = 0; i < 1000; i++) {
+ child = cg_name_indexed(parent, "child", i);
+ cg_destroy(child);
+ free(child);
+ }
+
+cleanup:
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+#define T(x) { x, #x }
+struct kmem_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_kmem_basic),
+ T(test_kmem_memcg_deletion),
+ T(test_kmem_proc_kpagecgroup),
+ T(test_kmem_kernel_stacks),
+ T(test_kmem_dead_cgroups),
+ T(test_percpu_basic),
+};
+#undef T
+
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ /*
+ * Check that memory controller is available:
+ * memory is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+ ksft_exit_skip("memory controller isn't available\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+ if (cg_write(root, "cgroup.subtree_control", "+memory"))
+ ksft_exit_skip("Failed to set memory controller\n");
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index c19a97dd02d4..a680f773f2d5 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -16,10 +16,92 @@
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
+#include <sys/mman.h>
#include "../kselftest.h"
#include "cgroup_util.h"
+static bool has_localevents;
+static bool has_recursiveprot;
+
+int get_temp_fd(void)
+{
+ return open(".", O_TMPFILE | O_RDWR | O_EXCL);
+}
+
+int alloc_pagecache(int fd, size_t size)
+{
+ char buf[PAGE_SIZE];
+ struct stat st;
+ int i;
+
+ if (fstat(fd, &st))
+ goto cleanup;
+
+ size += st.st_size;
+
+ if (ftruncate(fd, size))
+ goto cleanup;
+
+ for (i = 0; i < size; i += sizeof(buf))
+ read(fd, buf, sizeof(buf));
+
+ return 0;
+
+cleanup:
+ return -1;
+}
+
+int alloc_anon(const char *cgroup, void *arg)
+{
+ size_t size = (unsigned long)arg;
+ char *buf, *ptr;
+
+ buf = malloc(size);
+ for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ *ptr = 0;
+
+ free(buf);
+ return 0;
+}
+
+int is_swap_enabled(void)
+{
+ char buf[PAGE_SIZE];
+ const char delim[] = "\n";
+ int cnt = 0;
+ char *line;
+
+ if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
+ return -1;
+
+ for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
+ cnt++;
+
+ return cnt > 1;
+}
+
+int set_oom_adj_score(int pid, int score)
+{
+ char path[PATH_MAX];
+ int fd, len;
+
+ sprintf(path, "/proc/%d/oom_score_adj", pid);
+
+ fd = open(path, O_WRONLY | O_APPEND);
+ if (fd < 0)
+ return fd;
+
+ len = dprintf(fd, "%d", score);
+ if (len < 0) {
+ close(fd);
+ return len;
+ }
+
+ close(fd);
+ return 0;
+}
+
/*
* This test creates two nested cgroups with and without enabling
* the memory controller.
@@ -94,6 +176,11 @@ static int alloc_anon_50M_check(const char *cgroup, void *arg)
int ret = -1;
buf = malloc(size);
+ if (buf == NULL) {
+ fprintf(stderr, "malloc() failed\n");
+ return -1;
+ }
+
for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
*ptr = 0;
@@ -152,13 +239,16 @@ cleanup:
/*
* This test create a memory cgroup, allocates
* some anonymous memory and some pagecache
- * and check memory.current and some memory.stat values.
+ * and checks memory.current, memory.peak, and some memory.stat values.
*/
-static int test_memcg_current(const char *root)
+static int test_memcg_current_peak(const char *root)
{
int ret = KSFT_FAIL;
- long current;
+ long current, peak, peak_reset;
char *memcg;
+ bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
+ int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
+ struct stat ss;
memcg = cg_name(root, "memcg_test");
if (!memcg)
@@ -171,28 +261,130 @@ static int test_memcg_current(const char *root)
if (current != 0)
goto cleanup;
+ peak = cg_read_long(memcg, "memory.peak");
+ if (peak != 0)
+ goto cleanup;
+
if (cg_run(memcg, alloc_anon_50M_check, NULL))
goto cleanup;
+ peak = cg_read_long(memcg, "memory.peak");
+ if (peak < MB(50))
+ goto cleanup;
+
+ /*
+ * We'll open a few FDs for the same memory.peak file to exercise the free-path
+ * We need at least three to be closed in a different order than writes occurred to test
+ * the linked-list handling.
+ */
+ peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+ if (peak_fd == -1) {
+ if (errno == ENOENT)
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ /*
+ * Before we try to use memory.peak's fd, try to figure out whether
+ * this kernel supports writing to that file in the first place. (by
+ * checking the writable bit on the file's st_mode)
+ */
+ if (fstat(peak_fd, &ss))
+ goto cleanup;
+
+ if ((ss.st_mode & S_IWUSR) == 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+ if (peak_fd2 == -1)
+ goto cleanup;
+
+ peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+ if (peak_fd3 == -1)
+ goto cleanup;
+
+ /* any non-empty string resets, but make it clear */
+ static const char reset_string[] = "reset\n";
+
+ peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ /* Make sure a completely independent read isn't affected by our FD-local reset above*/
+ peak = cg_read_long(memcg, "memory.peak");
+ if (peak < MB(50))
+ goto cleanup;
+
+ fd2_closed = true;
+ if (close(peak_fd2))
+ goto cleanup;
+
+ peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+ if (peak_fd4 == -1)
+ goto cleanup;
+
+ peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ peak = cg_read_long_fd(peak_fd);
+ if (peak > MB(30) || peak < 0)
+ goto cleanup;
+
if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
goto cleanup;
+ peak = cg_read_long(memcg, "memory.peak");
+ if (peak < MB(50))
+ goto cleanup;
+
+ /* Make sure everything is back to normal */
+ peak = cg_read_long_fd(peak_fd);
+ if (peak < MB(50))
+ goto cleanup;
+
+ peak = cg_read_long_fd(peak_fd4);
+ if (peak < MB(50))
+ goto cleanup;
+
+ fd3_closed = true;
+ if (close(peak_fd3))
+ goto cleanup;
+
+ fd4_closed = true;
+ if (close(peak_fd4))
+ goto cleanup;
+
ret = KSFT_PASS;
cleanup:
+ close(peak_fd);
+ if (!fd2_closed)
+ close(peak_fd2);
+ if (!fd3_closed)
+ close(peak_fd3);
+ if (!fd4_closed)
+ close(peak_fd4);
cg_destroy(memcg);
free(memcg);
return ret;
}
-static int alloc_pagecache_50M(const char *cgroup, void *arg)
-{
- int fd = (long)arg;
-
- return alloc_pagecache(fd, MB(50));
-}
-
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
int fd = (long)arg;
@@ -210,13 +402,22 @@ static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
int ppid = getppid();
+ size_t size = (unsigned long)arg;
+ char *buf, *ptr;
- if (alloc_anon(cgroup, arg))
+ buf = malloc(size);
+ if (buf == NULL) {
+ fprintf(stderr, "malloc() failed\n");
return -1;
+ }
+
+ for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+ *ptr = 0;
while (getppid() == ppid)
sleep(1);
+ free(buf);
return 0;
}
@@ -237,36 +438,52 @@ static int cg_test_proc_killed(const char *cgroup)
return -1;
}
+static bool reclaim_until(const char *memcg, long goal);
+
/*
* First, this test creates the following hierarchy:
- * A memory.min = 50M, memory.max = 200M
- * A/B memory.min = 50M, memory.current = 50M
+ * A memory.min = 0, memory.max = 200M
+ * A/B memory.min = 50M
* A/B/C memory.min = 75M, memory.current = 50M
* A/B/D memory.min = 25M, memory.current = 50M
- * A/B/E memory.min = 500M, memory.current = 0
- * A/B/F memory.min = 0, memory.current = 50M
+ * A/B/E memory.min = 0, memory.current = 50M
+ * A/B/F memory.min = 500M, memory.current = 0
*
- * Usages are pagecache, but the test keeps a running
+ * (or memory.low if we test soft protection)
+ *
+ * Usages are pagecache and the test keeps a running
* process in every leaf cgroup.
* Then it creates A/G and creates a significant
- * memory pressure in it.
+ * memory pressure in A.
*
+ * Then it checks actual memory usages and expects that:
* A/B memory.current ~= 50M
- * A/B/C memory.current ~= 33M
- * A/B/D memory.current ~= 17M
- * A/B/E memory.current ~= 0
+ * A/B/C memory.current ~= 29M [memory.events:low > 0]
+ * A/B/D memory.current ~= 21M [memory.events:low > 0]
+ * A/B/E memory.current ~= 0 [memory.events:low == 0 if !memory_recursiveprot,
+ * undefined otherwise]
+ * A/B/F memory.current = 0 [memory.events:low == 0]
+ * (for origin of the numbers, see model in memcg_protection.m.)
*
* After that it tries to allocate more than there is
- * unprotected memory in A available, and checks
- * checks that memory.min protects pagecache even
- * in this case.
+ * unprotected memory in A available, and checks that:
+ * a) memory.min protects pagecache even in this case,
+ * b) memory.low allows reclaiming page cache with low events.
+ *
+ * Then we try to reclaim from A/B/C using memory.reclaim until its
+ * usage reaches 10M.
+ * This makes sure that:
+ * (a) We ignore the protection of the reclaim target memcg.
+ * (b) The previously calculated emin value (~29M) should be dismissed.
*/
-static int test_memcg_min(const char *root)
+static int test_memcg_protection(const char *root, bool min)
{
- int ret = KSFT_FAIL;
+ int ret = KSFT_FAIL, rc;
char *parent[3] = {NULL};
char *children[4] = {NULL};
+ const char *attribute = min ? "memory.min" : "memory.low";
long c[4];
+ long current;
int i, attempts;
int fd;
@@ -289,8 +506,10 @@ static int test_memcg_min(const char *root)
if (cg_create(parent[0]))
goto cleanup;
- if (cg_read_long(parent[0], "memory.min")) {
- ret = KSFT_SKIP;
+ if (cg_read_long(parent[0], attribute)) {
+ /* No memory.min on older kernels is fine */
+ if (min)
+ ret = KSFT_SKIP;
goto cleanup;
}
@@ -320,24 +539,22 @@ static int test_memcg_min(const char *root)
if (cg_create(children[i]))
goto cleanup;
- if (i == 2)
+ if (i > 2)
continue;
cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
(void *)(long)fd);
}
- if (cg_write(parent[0], "memory.min", "50M"))
+ if (cg_write(parent[1], attribute, "50M"))
goto cleanup;
- if (cg_write(parent[1], "memory.min", "50M"))
+ if (cg_write(children[0], attribute, "75M"))
goto cleanup;
- if (cg_write(children[0], "memory.min", "75M"))
+ if (cg_write(children[1], attribute, "25M"))
goto cleanup;
- if (cg_write(children[1], "memory.min", "25M"))
+ if (cg_write(children[2], attribute, "0"))
goto cleanup;
- if (cg_write(children[2], "memory.min", "500M"))
- goto cleanup;
- if (cg_write(children[3], "memory.min", "0"))
+ if (cg_write(children[3], attribute, "500M"))
goto cleanup;
attempts = 0;
@@ -357,178 +574,59 @@ static int test_memcg_min(const char *root)
for (i = 0; i < ARRAY_SIZE(children); i++)
c[i] = cg_read_long(children[i], "memory.current");
- if (!values_close(c[0], MB(33), 10))
+ if (!values_close(c[0], MB(29), 15))
goto cleanup;
- if (!values_close(c[1], MB(17), 10))
+ if (!values_close(c[1], MB(21), 20))
goto cleanup;
- if (!values_close(c[2], 0, 1))
+ if (c[3] != 0)
goto cleanup;
- if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
+ rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
+ if (min && !rc)
goto cleanup;
-
- if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
+ else if (!min && rc) {
+ fprintf(stderr,
+ "memory.low prevents from allocating anon memory\n");
goto cleanup;
-
- ret = KSFT_PASS;
-
-cleanup:
- for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
- if (!children[i])
- continue;
-
- cg_destroy(children[i]);
- free(children[i]);
- }
-
- for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
- if (!parent[i])
- continue;
-
- cg_destroy(parent[i]);
- free(parent[i]);
}
- close(fd);
- return ret;
-}
-/*
- * First, this test creates the following hierarchy:
- * A memory.low = 50M, memory.max = 200M
- * A/B memory.low = 50M, memory.current = 50M
- * A/B/C memory.low = 75M, memory.current = 50M
- * A/B/D memory.low = 25M, memory.current = 50M
- * A/B/E memory.low = 500M, memory.current = 0
- * A/B/F memory.low = 0, memory.current = 50M
- *
- * Usages are pagecache.
- * Then it creates A/G an creates a significant
- * memory pressure in it.
- *
- * Then it checks actual memory usages and expects that:
- * A/B memory.current ~= 50M
- * A/B/ memory.current ~= 33M
- * A/B/D memory.current ~= 17M
- * A/B/E memory.current ~= 0
- *
- * After that it tries to allocate more than there is
- * unprotected memory in A available,
- * and checks low and oom events in memory.events.
- */
-static int test_memcg_low(const char *root)
-{
- int ret = KSFT_FAIL;
- char *parent[3] = {NULL};
- char *children[4] = {NULL};
- long low, oom;
- long c[4];
- int i;
- int fd;
-
- fd = get_temp_fd();
- if (fd < 0)
- goto cleanup;
-
- parent[0] = cg_name(root, "memcg_test_0");
- if (!parent[0])
- goto cleanup;
-
- parent[1] = cg_name(parent[0], "memcg_test_1");
- if (!parent[1])
- goto cleanup;
-
- parent[2] = cg_name(parent[0], "memcg_test_2");
- if (!parent[2])
- goto cleanup;
-
- if (cg_create(parent[0]))
- goto cleanup;
-
- if (cg_read_long(parent[0], "memory.low"))
+ current = min ? MB(50) : MB(30);
+ if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
goto cleanup;
- if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
+ if (!reclaim_until(children[0], MB(10)))
goto cleanup;
- if (cg_write(parent[0], "memory.max", "200M"))
- goto cleanup;
-
- if (cg_write(parent[0], "memory.swap.max", "0"))
- goto cleanup;
-
- if (cg_create(parent[1]))
- goto cleanup;
-
- if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
- goto cleanup;
-
- if (cg_create(parent[2]))
- goto cleanup;
-
- for (i = 0; i < ARRAY_SIZE(children); i++) {
- children[i] = cg_name_indexed(parent[1], "child_memcg", i);
- if (!children[i])
- goto cleanup;
-
- if (cg_create(children[i]))
- goto cleanup;
-
- if (i == 2)
- continue;
-
- if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
- goto cleanup;
- }
-
- if (cg_write(parent[0], "memory.low", "50M"))
- goto cleanup;
- if (cg_write(parent[1], "memory.low", "50M"))
- goto cleanup;
- if (cg_write(children[0], "memory.low", "75M"))
- goto cleanup;
- if (cg_write(children[1], "memory.low", "25M"))
- goto cleanup;
- if (cg_write(children[2], "memory.low", "500M"))
- goto cleanup;
- if (cg_write(children[3], "memory.low", "0"))
- goto cleanup;
-
- if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
- goto cleanup;
-
- if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
- goto cleanup;
-
- for (i = 0; i < ARRAY_SIZE(children); i++)
- c[i] = cg_read_long(children[i], "memory.current");
-
- if (!values_close(c[0], MB(33), 10))
- goto cleanup;
-
- if (!values_close(c[1], MB(17), 10))
- goto cleanup;
-
- if (!values_close(c[2], 0, 1))
- goto cleanup;
-
- if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
- fprintf(stderr,
- "memory.low prevents from allocating anon memory\n");
+ if (min) {
+ ret = KSFT_PASS;
goto cleanup;
}
+ /*
+ * Child 2 has memory.low=0, but some low protection may still be
+ * distributed down from its parent with memory.low=50M if cgroup2
+ * memory_recursiveprot mount option is enabled. Ignore the low
+ * event count in this case.
+ */
for (i = 0; i < ARRAY_SIZE(children); i++) {
+ int ignore_low_events_index = has_recursiveprot ? 2 : -1;
+ int no_low_events_index = 1;
+ long low, oom;
+
oom = cg_read_key_long(children[i], "memory.events", "oom ");
low = cg_read_key_long(children[i], "memory.events", "low ");
if (oom)
goto cleanup;
- if (i < 2 && low <= 0)
+ if (i == ignore_low_events_index)
+ continue;
+ if (i <= no_low_events_index && low <= 0)
goto cleanup;
- if (i >= 2 && low)
+ if (i > no_low_events_index && low)
goto cleanup;
+
}
ret = KSFT_PASS;
@@ -553,13 +651,28 @@ cleanup:
return ret;
}
+static int test_memcg_min(const char *root)
+{
+ return test_memcg_protection(root, true);
+}
+
+static int test_memcg_low(const char *root)
+{
+ return test_memcg_protection(root, false);
+}
+
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
size_t size = MB(50);
int ret = -1;
- long current;
+ long current, high, max;
int fd;
+ high = cg_read_long(cgroup, "memory.high");
+ max = cg_read_long(cgroup, "memory.max");
+ if (high != MB(30) && max != MB(30))
+ return -1;
+
fd = get_temp_fd();
if (fd < 0)
return -1;
@@ -568,7 +681,7 @@ static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
goto cleanup;
current = cg_read_long(cgroup, "memory.current");
- if (current <= MB(29) || current > MB(30))
+ if (!values_close(current, MB(30), 5))
goto cleanup;
ret = 0;
@@ -606,7 +719,7 @@ static int test_memcg_high(const char *root)
if (cg_write(memcg, "memory.high", "30M"))
goto cleanup;
- if (cg_run(memcg, alloc_anon, (void *)MB(100)))
+ if (cg_run(memcg, alloc_anon, (void *)MB(31)))
goto cleanup;
if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
@@ -628,6 +741,82 @@ cleanup:
return ret;
}
+static int alloc_anon_mlock(const char *cgroup, void *arg)
+{
+ size_t size = (size_t)arg;
+ void *buf;
+
+ buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+ 0, 0);
+ if (buf == MAP_FAILED)
+ return -1;
+
+ mlock(buf, size);
+ munmap(buf, size);
+ return 0;
+}
+
+/*
+ * This test checks that memory.high is able to throttle big single shot
+ * allocation i.e. large allocation within one kernel entry.
+ */
+static int test_memcg_high_sync(const char *root)
+{
+ int ret = KSFT_FAIL, pid, fd = -1;
+ char *memcg;
+ long pre_high, pre_max;
+ long post_high, post_max;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ pre_high = cg_read_key_long(memcg, "memory.events", "high ");
+ pre_max = cg_read_key_long(memcg, "memory.events", "max ");
+ if (pre_high < 0 || pre_max < 0)
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.high", "30M"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.max", "140M"))
+ goto cleanup;
+
+ fd = memcg_prepare_for_wait(memcg);
+ if (fd < 0)
+ goto cleanup;
+
+ pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
+ if (pid < 0)
+ goto cleanup;
+
+ cg_wait_for(fd);
+
+ post_high = cg_read_key_long(memcg, "memory.events", "high ");
+ post_max = cg_read_key_long(memcg, "memory.events", "max ");
+ if (post_high < 0 || post_max < 0)
+ goto cleanup;
+
+ if (pre_high == post_high || pre_max != post_max)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (fd >= 0)
+ close(fd);
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
/*
* This test checks that memory.max limits the amount of
* memory which can be consumed by either anonymous memory
@@ -679,6 +868,121 @@ cleanup:
return ret;
}
+/*
+ * Reclaim from @memcg until usage reaches @goal by writing to
+ * memory.reclaim.
+ *
+ * This function will return false if the usage is already below the
+ * goal.
+ *
+ * This function assumes that writing to memory.reclaim is the only
+ * source of change in memory.current (no concurrent allocations or
+ * reclaim).
+ *
+ * This function makes sure memory.reclaim is sane. It will return
+ * false if memory.reclaim's error codes do not make sense, even if
+ * the usage goal was satisfied.
+ */
+static bool reclaim_until(const char *memcg, long goal)
+{
+ char buf[64];
+ int retries, err;
+ long current, to_reclaim;
+ bool reclaimed = false;
+
+ for (retries = 5; retries > 0; retries--) {
+ current = cg_read_long(memcg, "memory.current");
+
+ if (current < goal || values_close(current, goal, 3))
+ break;
+ /* Did memory.reclaim return 0 incorrectly? */
+ else if (reclaimed)
+ return false;
+
+ to_reclaim = current - goal;
+ snprintf(buf, sizeof(buf), "%ld", to_reclaim);
+ err = cg_write(memcg, "memory.reclaim", buf);
+ if (!err)
+ reclaimed = true;
+ else if (err != -EAGAIN)
+ return false;
+ }
+ return reclaimed;
+}
+
+/*
+ * This test checks that memory.reclaim reclaims the given
+ * amount of memory (from both anon and file, if possible).
+ */
+static int test_memcg_reclaim(const char *root)
+{
+ int ret = KSFT_FAIL;
+ int fd = -1;
+ int retries;
+ char *memcg;
+ long current, expected_usage;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ current = cg_read_long(memcg, "memory.current");
+ if (current != 0)
+ goto cleanup;
+
+ fd = get_temp_fd();
+ if (fd < 0)
+ goto cleanup;
+
+ cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
+
+ /*
+ * If swap is enabled, try to reclaim from both anon and file, else try
+ * to reclaim from file only.
+ */
+ if (is_swap_enabled()) {
+ cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
+ expected_usage = MB(100);
+ } else
+ expected_usage = MB(50);
+
+ /*
+ * Wait until current usage reaches the expected usage (or we run out of
+ * retries).
+ */
+ retries = 5;
+ while (!values_close(cg_read_long(memcg, "memory.current"),
+ expected_usage, 10)) {
+ if (retries--) {
+ sleep(1);
+ continue;
+ } else {
+ fprintf(stderr,
+ "failed to allocate %ld for memcg reclaim test\n",
+ expected_usage);
+ goto cleanup;
+ }
+ }
+
+ /*
+ * Reclaim until current reaches 30M, this makes sure we hit both anon
+ * and file if swap is enabled.
+ */
+ if (!reclaim_until(memcg, MB(30)))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+cleanup:
+ cg_destroy(memcg);
+ free(memcg);
+ close(fd);
+
+ return ret;
+}
+
static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
long mem_max = (long)arg;
@@ -688,6 +992,11 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
int ret = -1;
buf = malloc(size);
+ if (buf == NULL) {
+ fprintf(stderr, "malloc() failed\n");
+ return -1;
+ }
+
for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
*ptr = 0;
@@ -708,13 +1017,19 @@ cleanup:
/*
* This test checks that memory.swap.max limits the amount of
- * anonymous memory which can be swapped out.
+ * anonymous memory which can be swapped out. Additionally, it verifies that
+ * memory.swap.peak reflects the high watermark and can be reset.
*/
-static int test_memcg_swap_max(const char *root)
+static int test_memcg_swap_max_peak(const char *root)
{
int ret = KSFT_FAIL;
char *memcg;
- long max;
+ long max, peak;
+ struct stat ss;
+ int swap_peak_fd = -1, mem_peak_fd = -1;
+
+ /* any non-empty string resets */
+ static const char reset_string[] = "foobarbaz";
if (!is_swap_enabled())
return KSFT_SKIP;
@@ -731,6 +1046,61 @@ static int test_memcg_swap_max(const char *root)
goto cleanup;
}
+ swap_peak_fd = cg_open(memcg, "memory.swap.peak",
+ O_RDWR | O_APPEND | O_CLOEXEC);
+
+ if (swap_peak_fd == -1) {
+ if (errno == ENOENT)
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ /*
+ * Before we try to use memory.swap.peak's fd, try to figure out
+ * whether this kernel supports writing to that file in the first
+ * place. (by checking the writable bit on the file's st_mode)
+ */
+ if (fstat(swap_peak_fd, &ss))
+ goto cleanup;
+
+ if ((ss.st_mode & S_IWUSR) == 0) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+ if (mem_peak_fd == -1)
+ goto cleanup;
+
+ if (cg_read_long(memcg, "memory.swap.peak"))
+ goto cleanup;
+
+ if (cg_read_long_fd(swap_peak_fd))
+ goto cleanup;
+
+ /* switch the swap and mem fds into local-peak tracking mode*/
+ int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
+
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ if (cg_read_long_fd(swap_peak_fd))
+ goto cleanup;
+
+ if (cg_read_long(memcg, "memory.peak"))
+ goto cleanup;
+
+ if (cg_read_long_fd(mem_peak_fd))
+ goto cleanup;
+
+ peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ if (cg_read_long_fd(mem_peak_fd))
+ goto cleanup;
+
if (cg_read_strcmp(memcg, "memory.max", "max\n"))
goto cleanup;
@@ -753,6 +1123,61 @@ static int test_memcg_swap_max(const char *root)
if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
goto cleanup;
+ peak = cg_read_long(memcg, "memory.peak");
+ if (peak < MB(29))
+ goto cleanup;
+
+ peak = cg_read_long(memcg, "memory.swap.peak");
+ if (peak < MB(29))
+ goto cleanup;
+
+ peak = cg_read_long_fd(mem_peak_fd);
+ if (peak < MB(29))
+ goto cleanup;
+
+ peak = cg_read_long_fd(swap_peak_fd);
+ if (peak < MB(29))
+ goto cleanup;
+
+ /*
+ * open, reset and close the peak swap on another FD to make sure
+ * multiple extant fds don't corrupt the linked-list
+ */
+ peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
+ if (peak_reset)
+ goto cleanup;
+
+ peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
+ if (peak_reset)
+ goto cleanup;
+
+ /* actually reset on the fds */
+ peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
+ if (peak_reset != sizeof(reset_string))
+ goto cleanup;
+
+ peak = cg_read_long_fd(swap_peak_fd);
+ if (peak > MB(10))
+ goto cleanup;
+
+ /*
+ * The cgroup is now empty, but there may be a page or two associated
+ * with the open FD accounted to it.
+ */
+ peak = cg_read_long_fd(mem_peak_fd);
+ if (peak > MB(1))
+ goto cleanup;
+
+ if (cg_read_long(memcg, "memory.peak") < MB(29))
+ goto cleanup;
+
+ if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
+ goto cleanup;
+
if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
goto cleanup;
@@ -760,9 +1185,29 @@ static int test_memcg_swap_max(const char *root)
if (max <= 0)
goto cleanup;
+ peak = cg_read_long(memcg, "memory.peak");
+ if (peak < MB(29))
+ goto cleanup;
+
+ peak = cg_read_long(memcg, "memory.swap.peak");
+ if (peak < MB(29))
+ goto cleanup;
+
+ peak = cg_read_long_fd(mem_peak_fd);
+ if (peak < MB(29))
+ goto cleanup;
+
+ peak = cg_read_long_fd(swap_peak_fd);
+ if (peak < MB(19))
+ goto cleanup;
+
ret = KSFT_PASS;
cleanup:
+ if (mem_peak_fd != -1 && close(mem_peak_fd))
+ ret = KSFT_FAIL;
+ if (swap_peak_fd != -1 && close(swap_peak_fd))
+ ret = KSFT_FAIL;
cg_destroy(memcg);
free(memcg);
@@ -882,7 +1327,9 @@ static int tcp_client(const char *cgroup, unsigned short port)
char servport[6];
int retries = 0x10; /* nice round number */
int sk, ret;
+ long allocated;
+ allocated = cg_read_long(cgroup, "memory.current");
snprintf(servport, sizeof(servport), "%hd", port);
ret = getaddrinfo(server, servport, NULL, &ai);
if (ret)
@@ -910,10 +1357,8 @@ static int tcp_client(const char *cgroup, unsigned short port)
if (current < 0 || sock < 0)
goto close_sk;
- if (current < sock)
- goto close_sk;
-
- if (values_close(current, sock, 10)) {
+ /* exclude the memory not related to socket connection */
+ if (values_close(current - allocated, sock, 10)) {
ret = KSFT_PASS;
break;
}
@@ -1002,12 +1447,14 @@ cleanup:
/*
* This test disables swapping and tries to allocate anonymous memory
* up to OOM with memory.group.oom set. Then it checks that all
- * processes in the leaf (but not the parent) were killed.
+ * processes in the leaf were killed. It also checks that oom_events
+ * were propagated to the parent level.
*/
static int test_memcg_oom_group_leaf_events(const char *root)
{
int ret = KSFT_FAIL;
char *parent, *child;
+ long parent_oom_events;
parent = cg_name(root, "memcg_test_0");
child = cg_name(root, "memcg_test_0/memcg_test_1");
@@ -1045,7 +1492,16 @@ static int test_memcg_oom_group_leaf_events(const char *root)
if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
goto cleanup;
- if (cg_read_key_long(parent, "memory.events", "oom_kill ") != 0)
+ parent_oom_events = cg_read_key_long(
+ parent, "memory.events", "oom_kill ");
+ /*
+ * If memory_localevents is not enabled (the default), the parent should
+ * count OOM events in its children groups. Otherwise, it should not
+ * have observed any events.
+ */
+ if (has_localevents && parent_oom_events != 0)
+ goto cleanup;
+ else if (!has_localevents && parent_oom_events <= 0)
goto cleanup;
ret = KSFT_PASS;
@@ -1169,20 +1625,21 @@ cleanup:
return ret;
}
-
#define T(x) { x, #x }
struct memcg_test {
int (*fn)(const char *root);
const char *name;
} tests[] = {
T(test_memcg_subtree_control),
- T(test_memcg_current),
+ T(test_memcg_current_peak),
T(test_memcg_min),
T(test_memcg_low),
T(test_memcg_high),
+ T(test_memcg_high_sync),
T(test_memcg_max),
+ T(test_memcg_reclaim),
T(test_memcg_oom_events),
- T(test_memcg_swap_max),
+ T(test_memcg_swap_max_peak),
T(test_memcg_sock),
T(test_memcg_oom_group_leaf_events),
T(test_memcg_oom_group_parent_events),
@@ -1193,9 +1650,9 @@ struct memcg_test {
int main(int argc, char **argv)
{
char root[PATH_MAX];
- int i, ret = EXIT_SUCCESS;
+ int i, proc_status, ret = EXIT_SUCCESS;
- if (cg_find_unified_root(root, sizeof(root)))
+ if (cg_find_unified_root(root, sizeof(root), NULL))
ksft_exit_skip("cgroup v2 isn't mounted\n");
/*
@@ -1209,6 +1666,16 @@ int main(int argc, char **argv)
if (cg_write(root, "cgroup.subtree_control", "+memory"))
ksft_exit_skip("Failed to set memory controller\n");
+ proc_status = proc_mount_contains("memory_recursiveprot");
+ if (proc_status < 0)
+ ksft_exit_skip("Failed to query cgroup mount option\n");
+ has_recursiveprot = proc_status;
+
+ proc_status = proc_mount_contains("memory_localevents");
+ if (proc_status < 0)
+ ksft_exit_skip("Failed to query cgroup mount option\n");
+ has_localevents = proc_status;
+
for (i = 0; i < ARRAY_SIZE(tests); i++) {
switch (tests[i].fn(root)) {
case KSFT_PASS:
diff --git a/tools/testing/selftests/cgroup/test_pids.c b/tools/testing/selftests/cgroup/test_pids.c
new file mode 100644
index 000000000000..9ecb83c6cc5c
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_pids.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <linux/limits.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+static int run_success(const char *cgroup, void *arg)
+{
+ return 0;
+}
+
+static int run_pause(const char *cgroup, void *arg)
+{
+ return pause();
+}
+
+/*
+ * This test checks that pids.max prevents forking new children above the
+ * specified limit in the cgroup.
+ */
+static int test_pids_max(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg_pids;
+ int pid;
+
+ cg_pids = cg_name(root, "pids_test");
+ if (!cg_pids)
+ goto cleanup;
+
+ if (cg_create(cg_pids))
+ goto cleanup;
+
+ if (cg_read_strcmp(cg_pids, "pids.max", "max\n"))
+ goto cleanup;
+
+ if (cg_write(cg_pids, "pids.max", "2"))
+ goto cleanup;
+
+ if (cg_enter_current(cg_pids))
+ goto cleanup;
+
+ pid = cg_run_nowait(cg_pids, run_pause, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ if (cg_run_nowait(cg_pids, run_success, NULL) != -1 || errno != EAGAIN)
+ goto cleanup;
+
+ if (kill(pid, SIGINT))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_enter_current(root);
+ cg_destroy(cg_pids);
+ free(cg_pids);
+
+ return ret;
+}
+
+/*
+ * This test checks that pids.events are counted in cgroup associated with pids.max
+ */
+static int test_pids_events(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg_parent = NULL, *cg_child = NULL;
+ int pid;
+
+ cg_parent = cg_name(root, "pids_parent");
+ cg_child = cg_name(cg_parent, "pids_child");
+ if (!cg_parent || !cg_child)
+ goto cleanup;
+
+ if (cg_create(cg_parent))
+ goto cleanup;
+ if (cg_write(cg_parent, "cgroup.subtree_control", "+pids"))
+ goto cleanup;
+ if (cg_create(cg_child))
+ goto cleanup;
+
+ if (cg_write(cg_parent, "pids.max", "2"))
+ goto cleanup;
+
+ if (cg_read_strcmp(cg_child, "pids.max", "max\n"))
+ goto cleanup;
+
+ if (cg_enter_current(cg_child))
+ goto cleanup;
+
+ pid = cg_run_nowait(cg_child, run_pause, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ if (cg_run_nowait(cg_child, run_success, NULL) != -1 || errno != EAGAIN)
+ goto cleanup;
+
+ if (kill(pid, SIGINT))
+ goto cleanup;
+
+ if (cg_read_key_long(cg_child, "pids.events", "max ") != 0)
+ goto cleanup;
+ if (cg_read_key_long(cg_parent, "pids.events", "max ") != 1)
+ goto cleanup;
+
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_enter_current(root);
+ if (cg_child)
+ cg_destroy(cg_child);
+ if (cg_parent)
+ cg_destroy(cg_parent);
+ free(cg_child);
+ free(cg_parent);
+
+ return ret;
+}
+
+
+
+#define T(x) { x, #x }
+struct pids_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_pids_max),
+ T(test_pids_events),
+};
+#undef T
+
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ /*
+ * Check that pids controller is available:
+ * pids is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "pids"))
+ ksft_exit_skip("pids controller isn't available\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "pids"))
+ if (cg_write(root, "cgroup.subtree_control", "+pids"))
+ ksft_exit_skip("Failed to set pids controller\n");
+
+ for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/cgroup/test_stress.sh b/tools/testing/selftests/cgroup/test_stress.sh
index 15d9d5896394..3c9c4554d5f6 100755
--- a/tools/testing/selftests/cgroup/test_stress.sh
+++ b/tools/testing/selftests/cgroup/test_stress.sh
@@ -1,4 +1,4 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
-./with_stress.sh -s subsys -s fork ./test_core
+./with_stress.sh -s subsys -s fork ${OUTPUT:-.}/test_core
diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c
new file mode 100644
index 000000000000..40de679248b8
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_zswap.c
@@ -0,0 +1,635 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/sysinfo.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+static int read_int(const char *path, size_t *value)
+{
+ FILE *file;
+ int ret = 0;
+
+ file = fopen(path, "r");
+ if (!file)
+ return -1;
+ if (fscanf(file, "%ld", value) != 1)
+ ret = -1;
+ fclose(file);
+ return ret;
+}
+
+static int set_min_free_kb(size_t value)
+{
+ FILE *file;
+ int ret;
+
+ file = fopen("/proc/sys/vm/min_free_kbytes", "w");
+ if (!file)
+ return -1;
+ ret = fprintf(file, "%ld\n", value);
+ fclose(file);
+ return ret;
+}
+
+static int read_min_free_kb(size_t *value)
+{
+ return read_int("/proc/sys/vm/min_free_kbytes", value);
+}
+
+static int get_zswap_stored_pages(size_t *value)
+{
+ return read_int("/sys/kernel/debug/zswap/stored_pages", value);
+}
+
+static long get_cg_wb_count(const char *cg)
+{
+ return cg_read_key_long(cg, "memory.stat", "zswpwb");
+}
+
+static long get_zswpout(const char *cgroup)
+{
+ return cg_read_key_long(cgroup, "memory.stat", "zswpout ");
+}
+
+static int allocate_and_read_bytes(const char *cgroup, void *arg)
+{
+ size_t size = (size_t)arg;
+ char *mem = (char *)malloc(size);
+ int ret = 0;
+
+ if (!mem)
+ return -1;
+ for (int i = 0; i < size; i += 4095)
+ mem[i] = 'a';
+
+ /* Go through the allocated memory to (z)swap in and out pages */
+ for (int i = 0; i < size; i += 4095) {
+ if (mem[i] != 'a')
+ ret = -1;
+ }
+
+ free(mem);
+ return ret;
+}
+
+static int allocate_bytes(const char *cgroup, void *arg)
+{
+ size_t size = (size_t)arg;
+ char *mem = (char *)malloc(size);
+
+ if (!mem)
+ return -1;
+ for (int i = 0; i < size; i += 4095)
+ mem[i] = 'a';
+ free(mem);
+ return 0;
+}
+
+static char *setup_test_group_1M(const char *root, const char *name)
+{
+ char *group_name = cg_name(root, name);
+
+ if (!group_name)
+ return NULL;
+ if (cg_create(group_name))
+ goto fail;
+ if (cg_write(group_name, "memory.max", "1M")) {
+ cg_destroy(group_name);
+ goto fail;
+ }
+ return group_name;
+fail:
+ free(group_name);
+ return NULL;
+}
+
+/*
+ * Sanity test to check that pages are written into zswap.
+ */
+static int test_zswap_usage(const char *root)
+{
+ long zswpout_before, zswpout_after;
+ int ret = KSFT_FAIL;
+ char *test_group;
+
+ test_group = cg_name(root, "no_shrink_test");
+ if (!test_group)
+ goto out;
+ if (cg_create(test_group))
+ goto out;
+ if (cg_write(test_group, "memory.max", "1M"))
+ goto out;
+
+ zswpout_before = get_zswpout(test_group);
+ if (zswpout_before < 0) {
+ ksft_print_msg("Failed to get zswpout\n");
+ goto out;
+ }
+
+ /* Allocate more than memory.max to push memory into zswap */
+ if (cg_run(test_group, allocate_bytes, (void *)MB(4)))
+ goto out;
+
+ /* Verify that pages come into zswap */
+ zswpout_after = get_zswpout(test_group);
+ if (zswpout_after <= zswpout_before) {
+ ksft_print_msg("zswpout does not increase after test program\n");
+ goto out;
+ }
+ ret = KSFT_PASS;
+
+out:
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+/*
+ * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for
+ * the cgroup.
+ */
+static int test_swapin_nozswap(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *test_group;
+ long swap_peak, zswpout;
+
+ test_group = cg_name(root, "no_zswap_test");
+ if (!test_group)
+ goto out;
+ if (cg_create(test_group))
+ goto out;
+ if (cg_write(test_group, "memory.max", "8M"))
+ goto out;
+ if (cg_write(test_group, "memory.zswap.max", "0"))
+ goto out;
+
+ /* Allocate and read more than memory.max to trigger swapin */
+ if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
+ goto out;
+
+ /* Verify that pages are swapped out, but no zswap happened */
+ swap_peak = cg_read_long(test_group, "memory.swap.peak");
+ if (swap_peak < 0) {
+ ksft_print_msg("failed to get cgroup's swap_peak\n");
+ goto out;
+ }
+
+ if (swap_peak < MB(24)) {
+ ksft_print_msg("at least 24MB of memory should be swapped out\n");
+ goto out;
+ }
+
+ zswpout = get_zswpout(test_group);
+ if (zswpout < 0) {
+ ksft_print_msg("failed to get zswpout\n");
+ goto out;
+ }
+
+ if (zswpout > 0) {
+ ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n");
+ goto out;
+ }
+
+ ret = KSFT_PASS;
+
+out:
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+/* Simple test to verify the (z)swapin code paths */
+static int test_zswapin(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *test_group;
+ long zswpin;
+
+ test_group = cg_name(root, "zswapin_test");
+ if (!test_group)
+ goto out;
+ if (cg_create(test_group))
+ goto out;
+ if (cg_write(test_group, "memory.max", "8M"))
+ goto out;
+ if (cg_write(test_group, "memory.zswap.max", "max"))
+ goto out;
+
+ /* Allocate and read more than memory.max to trigger (z)swap in */
+ if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
+ goto out;
+
+ zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin ");
+ if (zswpin < 0) {
+ ksft_print_msg("failed to get zswpin\n");
+ goto out;
+ }
+
+ if (zswpin < MB(24) / PAGE_SIZE) {
+ ksft_print_msg("at least 24MB should be brought back from zswap\n");
+ goto out;
+ }
+
+ ret = KSFT_PASS;
+
+out:
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+/*
+ * Attempt writeback with the following steps:
+ * 1. Allocate memory.
+ * 2. Reclaim memory equal to the amount that was allocated in step 1.
+ This will move it into zswap.
+ * 3. Save current zswap usage.
+ * 4. Move the memory allocated in step 1 back in from zswap.
+ * 5. Set zswap.max to half the amount that was recorded in step 3.
+ * 6. Attempt to reclaim memory equal to the amount that was allocated,
+ this will either trigger writeback if it's enabled, or reclamation
+ will fail if writeback is disabled as there isn't enough zswap space.
+ */
+static int attempt_writeback(const char *cgroup, void *arg)
+{
+ long pagesize = sysconf(_SC_PAGESIZE);
+ size_t memsize = MB(4);
+ char buf[pagesize];
+ long zswap_usage;
+ bool wb_enabled = *(bool *) arg;
+ int ret = -1;
+ char *mem;
+
+ mem = (char *)malloc(memsize);
+ if (!mem)
+ return ret;
+
+ /*
+ * Fill half of each page with increasing data, and keep other
+ * half empty, this will result in data that is still compressible
+ * and ends up in zswap, with material zswap usage.
+ */
+ for (int i = 0; i < pagesize; i++)
+ buf[i] = i < pagesize/2 ? (char) i : 0;
+
+ for (int i = 0; i < memsize; i += pagesize)
+ memcpy(&mem[i], buf, pagesize);
+
+ /* Try and reclaim allocated memory */
+ if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
+ ksft_print_msg("Failed to reclaim all of the requested memory\n");
+ goto out;
+ }
+
+ zswap_usage = cg_read_long(cgroup, "memory.zswap.current");
+
+ /* zswpin */
+ for (int i = 0; i < memsize; i += pagesize) {
+ if (memcmp(&mem[i], buf, pagesize)) {
+ ksft_print_msg("invalid memory\n");
+ goto out;
+ }
+ }
+
+ if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2))
+ goto out;
+
+ /*
+ * If writeback is enabled, trying to reclaim memory now will trigger a
+ * writeback as zswap.max is half of what was needed when reclaim ran the first time.
+ * If writeback is disabled, memory reclaim will fail as zswap is limited and
+ * it can't writeback to swap.
+ */
+ ret = cg_write_numeric(cgroup, "memory.reclaim", memsize);
+ if (!wb_enabled)
+ ret = (ret == -EAGAIN) ? 0 : -1;
+
+out:
+ free(mem);
+ return ret;
+}
+
+static int test_zswap_writeback_one(const char *cgroup, bool wb)
+{
+ long zswpwb_before, zswpwb_after;
+
+ zswpwb_before = get_cg_wb_count(cgroup);
+ if (zswpwb_before != 0) {
+ ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before);
+ return -1;
+ }
+
+ if (cg_run(cgroup, attempt_writeback, (void *) &wb))
+ return -1;
+
+ /* Verify that zswap writeback occurred only if writeback was enabled */
+ zswpwb_after = get_cg_wb_count(cgroup);
+ if (zswpwb_after < 0)
+ return -1;
+
+ if (wb != !!zswpwb_after) {
+ ksft_print_msg("zswpwb_after is %ld while wb is %s",
+ zswpwb_after, wb ? "enabled" : "disabled");
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Test to verify the zswap writeback path */
+static int test_zswap_writeback(const char *root, bool wb)
+{
+ int ret = KSFT_FAIL;
+ char *test_group, *test_group_child = NULL;
+
+ if (cg_read_strcmp(root, "memory.zswap.writeback", "1"))
+ return KSFT_SKIP;
+
+ test_group = cg_name(root, "zswap_writeback_test");
+ if (!test_group)
+ goto out;
+ if (cg_create(test_group))
+ goto out;
+ if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0"))
+ goto out;
+
+ if (test_zswap_writeback_one(test_group, wb))
+ goto out;
+
+ /* Reset memory.zswap.max to max (modified by attempt_writeback), and
+ * set up child cgroup, whose memory.zswap.writeback is hardcoded to 1.
+ * Thus, the parent's setting shall be what's in effect. */
+ if (cg_write(test_group, "memory.zswap.max", "max"))
+ goto out;
+ if (cg_write(test_group, "cgroup.subtree_control", "+memory"))
+ goto out;
+
+ test_group_child = cg_name(test_group, "zswap_writeback_test_child");
+ if (!test_group_child)
+ goto out;
+ if (cg_create(test_group_child))
+ goto out;
+ if (cg_write(test_group_child, "memory.zswap.writeback", "1"))
+ goto out;
+
+ if (test_zswap_writeback_one(test_group_child, wb))
+ goto out;
+
+ ret = KSFT_PASS;
+
+out:
+ if (test_group_child) {
+ cg_destroy(test_group_child);
+ free(test_group_child);
+ }
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+static int test_zswap_writeback_enabled(const char *root)
+{
+ return test_zswap_writeback(root, true);
+}
+
+static int test_zswap_writeback_disabled(const char *root)
+{
+ return test_zswap_writeback(root, false);
+}
+
+/*
+ * When trying to store a memcg page in zswap, if the memcg hits its memory
+ * limit in zswap, writeback should affect only the zswapped pages of that
+ * memcg.
+ */
+static int test_no_invasive_cgroup_shrink(const char *root)
+{
+ int ret = KSFT_FAIL;
+ size_t control_allocation_size = MB(10);
+ char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL;
+
+ wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
+ if (!wb_group)
+ return KSFT_FAIL;
+ if (cg_write(wb_group, "memory.zswap.max", "10K"))
+ goto out;
+ control_group = setup_test_group_1M(root, "per_memcg_wb_test2");
+ if (!control_group)
+ goto out;
+
+ /* Push some test_group2 memory into zswap */
+ if (cg_enter_current(control_group))
+ goto out;
+ control_allocation = malloc(control_allocation_size);
+ for (int i = 0; i < control_allocation_size; i += 4095)
+ control_allocation[i] = 'a';
+ if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1)
+ goto out;
+
+ /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */
+ if (cg_run(wb_group, allocate_bytes, (void *)MB(10)))
+ goto out;
+
+ /* Verify that only zswapped memory from gwb_group has been written back */
+ if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0)
+ ret = KSFT_PASS;
+out:
+ cg_enter_current(root);
+ if (control_group) {
+ cg_destroy(control_group);
+ free(control_group);
+ }
+ cg_destroy(wb_group);
+ free(wb_group);
+ if (control_allocation)
+ free(control_allocation);
+ return ret;
+}
+
+struct no_kmem_bypass_child_args {
+ size_t target_alloc_bytes;
+ size_t child_allocated;
+};
+
+static int no_kmem_bypass_child(const char *cgroup, void *arg)
+{
+ struct no_kmem_bypass_child_args *values = arg;
+ void *allocation;
+
+ allocation = malloc(values->target_alloc_bytes);
+ if (!allocation) {
+ values->child_allocated = true;
+ return -1;
+ }
+ for (long i = 0; i < values->target_alloc_bytes; i += 4095)
+ ((char *)allocation)[i] = 'a';
+ values->child_allocated = true;
+ pause();
+ free(allocation);
+ return 0;
+}
+
+/*
+ * When pages owned by a memcg are pushed to zswap by kswapd, they should be
+ * charged to that cgroup. This wasn't the case before commit
+ * cd08d80ecdac("mm: correctly charge compressed memory to its memcg").
+ *
+ * The test first allocates memory in a memcg, then raises min_free_kbytes to
+ * a very high value so that the allocation falls below low wm, then makes
+ * another allocation to trigger kswapd that should push the memcg-owned pages
+ * to zswap and verifies that the zswap pages are correctly charged.
+ *
+ * To be run on a VM with at most 4G of memory.
+ */
+static int test_no_kmem_bypass(const char *root)
+{
+ size_t min_free_kb_high, min_free_kb_low, min_free_kb_original;
+ struct no_kmem_bypass_child_args *values;
+ size_t trigger_allocation_size;
+ int wait_child_iteration = 0;
+ long stored_pages_threshold;
+ struct sysinfo sys_info;
+ int ret = KSFT_FAIL;
+ int child_status;
+ char *test_group = NULL;
+ pid_t child_pid;
+
+ /* Read sys info and compute test values accordingly */
+ if (sysinfo(&sys_info) != 0)
+ return KSFT_FAIL;
+ if (sys_info.totalram > 5000000000)
+ return KSFT_SKIP;
+ values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ |
+ PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (values == MAP_FAILED)
+ return KSFT_FAIL;
+ if (read_min_free_kb(&min_free_kb_original))
+ return KSFT_FAIL;
+ min_free_kb_high = sys_info.totalram / 2000;
+ min_free_kb_low = sys_info.totalram / 500000;
+ values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) +
+ sys_info.totalram * 5 / 100;
+ stored_pages_threshold = sys_info.totalram / 5 / 4096;
+ trigger_allocation_size = sys_info.totalram / 20;
+
+ /* Set up test memcg */
+ test_group = cg_name(root, "kmem_bypass_test");
+ if (!test_group)
+ goto out;
+
+ /* Spawn memcg child and wait for it to allocate */
+ set_min_free_kb(min_free_kb_low);
+ if (cg_create(test_group))
+ goto out;
+ values->child_allocated = false;
+ child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values);
+ if (child_pid < 0)
+ goto out;
+ while (!values->child_allocated && wait_child_iteration++ < 10000)
+ usleep(1000);
+
+ /* Try to wakeup kswapd and let it push child memory to zswap */
+ set_min_free_kb(min_free_kb_high);
+ for (int i = 0; i < 20; i++) {
+ size_t stored_pages;
+ char *trigger_allocation = malloc(trigger_allocation_size);
+
+ if (!trigger_allocation)
+ break;
+ for (int i = 0; i < trigger_allocation_size; i += 4095)
+ trigger_allocation[i] = 'b';
+ usleep(100000);
+ free(trigger_allocation);
+ if (get_zswap_stored_pages(&stored_pages))
+ break;
+ if (stored_pages < 0)
+ break;
+ /* If memory was pushed to zswap, verify it belongs to memcg */
+ if (stored_pages > stored_pages_threshold) {
+ int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped ");
+ int delta = stored_pages * 4096 - zswapped;
+ int result_ok = delta < stored_pages * 4096 / 4;
+
+ ret = result_ok ? KSFT_PASS : KSFT_FAIL;
+ break;
+ }
+ }
+
+ kill(child_pid, SIGTERM);
+ waitpid(child_pid, &child_status, 0);
+out:
+ set_min_free_kb(min_free_kb_original);
+ cg_destroy(test_group);
+ free(test_group);
+ return ret;
+}
+
+#define T(x) { x, #x }
+struct zswap_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_zswap_usage),
+ T(test_swapin_nozswap),
+ T(test_zswapin),
+ T(test_zswap_writeback_enabled),
+ T(test_zswap_writeback_disabled),
+ T(test_no_kmem_bypass),
+ T(test_no_invasive_cgroup_shrink),
+};
+#undef T
+
+static bool zswap_configured(void)
+{
+ return access("/sys/module/zswap", F_OK) == 0;
+}
+
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ if (!zswap_configured())
+ ksft_exit_skip("zswap isn't configured\n");
+
+ /*
+ * Check that memory controller is available:
+ * memory is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+ ksft_exit_skip("memory controller isn't available\n");
+
+ if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+ if (cg_write(root, "cgroup.subtree_control", "+memory"))
+ ksft_exit_skip("Failed to set memory controller\n");
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/wait_inotify.c b/tools/testing/selftests/cgroup/wait_inotify.c
new file mode 100644
index 000000000000..e11b431e1b62
--- /dev/null
+++ b/tools/testing/selftests/cgroup/wait_inotify.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Wait until an inotify event on the given cgroup file.
+ */
+#include <linux/limits.h>
+#include <sys/inotify.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static const char usage[] = "Usage: %s [-v] <cgroup_file>\n";
+static char *file;
+static int verbose;
+
+static inline void fail_message(char *msg)
+{
+ fprintf(stderr, msg, file);
+ exit(1);
+}
+
+int main(int argc, char *argv[])
+{
+ char *cmd = argv[0];
+ int c, fd;
+ struct pollfd fds = { .events = POLLIN, };
+
+ while ((c = getopt(argc, argv, "v")) != -1) {
+ switch (c) {
+ case 'v':
+ verbose++;
+ break;
+ }
+ argv++, argc--;
+ }
+
+ if (argc != 2) {
+ fprintf(stderr, usage, cmd);
+ return -1;
+ }
+ file = argv[1];
+ fd = open(file, O_RDONLY);
+ if (fd < 0)
+ fail_message("Cgroup file %s not found!\n");
+ close(fd);
+
+ fd = inotify_init();
+ if (fd < 0)
+ fail_message("inotify_init() fails on %s!\n");
+ if (inotify_add_watch(fd, file, IN_MODIFY) < 0)
+ fail_message("inotify_add_watch() fails on %s!\n");
+ fds.fd = fd;
+
+ /*
+ * poll waiting loop
+ */
+ for (;;) {
+ int ret = poll(&fds, 1, 10000);
+
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ perror("poll");
+ exit(1);
+ }
+ if ((ret > 0) && (fds.revents & POLLIN))
+ break;
+ }
+ if (verbose) {
+ struct inotify_event events[10];
+ long len;
+
+ usleep(1000);
+ len = read(fd, events, sizeof(events));
+ printf("Number of events read = %ld\n",
+ len/sizeof(struct inotify_event));
+ }
+ close(fd);
+ return 0;
+}