 tools/testing/selftests/mm/khugepaged.c | 1287 ++++++++++++++++++++++++++++++
 1 file changed, 1287 insertions(+), 0 deletions(-)
diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
new file mode 100644
index 000000000000..8a4d34cce36b
--- /dev/null
+++ b/tools/testing/selftests/mm/khugepaged.c
@@ -0,0 +1,1287 @@
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <dirent.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+
+#include "linux/magic.h"
+
+#include "vm_util.h"
+#include "thp_settings.h"
+
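+/*
+ * Fixed base address (1GiB, PMD-aligned) used for all test mappings so
+ * that every test operates on a predictable virtual address range.
+ */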
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+static int anon_order;
+
+#define PID_SMAPS "/proc/self/smaps"
+#define TEST_FILE "collapse_test_file"
+
+#define MAX_LINE_LENGTH 500
+
+enum vma_type {
+ VMA_ANON,
+ VMA_FILE,
+ VMA_SHMEM,
+};
+
+struct mem_ops {
+ void *(*setup_area)(int nr_hpages);
+ void (*cleanup_area)(void *p, unsigned long size);
+ void (*fault)(void *p, unsigned long start, unsigned long end);
+ bool (*check_huge)(void *addr, int nr_hpages);
+ const char *name;
+};
+
+static struct mem_ops *file_ops;
+static struct mem_ops *anon_ops;
+static struct mem_ops *shmem_ops;
+
+struct collapse_context {
+ void (*collapse)(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect);
+ bool enforce_pte_scan_limits;
+ const char *name;
+};
+
+static struct collapse_context *khugepaged_context;
+static struct collapse_context *madvise_context;
+
+struct file_info {
+ const char *dir;
+ char path[PATH_MAX];
+ enum vma_type type;
+ int fd;
+ char dev_queue_read_ahead_path[PATH_MAX];
+};
+
+static struct file_info finfo;
+static bool skip_settings_restore;
+static int exit_status;
+
+static void success(const char *msg)
+{
+ printf(" \e[32m%s\e[0m\n", msg);
+}
+
+static void fail(const char *msg)
+{
+ printf(" \e[31m%s\e[0m\n", msg);
+ exit_status++;
+}
+
+static void skip(const char *msg)
+{
+ printf(" \e[33m%s\e[0m\n", msg);
+}
+
+static void restore_settings_atexit(void)
+{
+ if (skip_settings_restore)
+ return;
+
+ printf("Restore THP and khugepaged settings...");
+ thp_restore_settings();
+ success("OK");
+
+ skip_settings_restore = true;
+}
+
+static void restore_settings(int sig)
+{
+ /* exit() will invoke the restore_settings_atexit handler. */
+ exit(sig ? EXIT_FAILURE : exit_status);
+}
+
+static void save_settings(void)
+{
+ printf("Save THP and khugepaged settings...");
+ if (file_ops && finfo.type == VMA_FILE)
+ thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
+ thp_save_settings();
+
+ success("OK");
+
+ atexit(restore_settings_atexit);
+ signal(SIGTERM, restore_settings);
+ signal(SIGINT, restore_settings);
+ signal(SIGHUP, restore_settings);
+ signal(SIGQUIT, restore_settings);
+}
+
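+/*
+ * Determine whether @dir lives on tmpfs (collapse as shmem) or on a
+ * regular filesystem, and in the latter case locate the backing block
+ * device's queue/read_ahead_kb control so readahead can be disabled.
+ */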
+static void get_finfo(const char *dir)
+{
+ struct stat path_stat;
+ struct statfs fs;
+ char buf[1 << 10];
+ char path[PATH_MAX];
+ char *str, *end;
+
+ finfo.dir = dir;
+ stat(finfo.dir, &path_stat);
+ if (!S_ISDIR(path_stat.st_mode)) {
+ printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
+ exit(EXIT_FAILURE);
+ }
+ if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
+ finfo.dir) >= sizeof(finfo.path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ if (statfs(finfo.dir, &fs)) {
+ perror("statfs()");
+ exit(EXIT_FAILURE);
+ }
+ finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
+ if (finfo.type == VMA_SHMEM)
+ return;
+
+ /* Find owning device's queue/read_ahead_kb control */
+ if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
+ major(path_stat.st_dev), minor(path_stat.st_dev))
+ >= sizeof(path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ if (read_file(path, buf, sizeof(buf)) < 0) {
+		perror("read_file()");
+ exit(EXIT_FAILURE);
+ }
+ if (strstr(buf, "DEVTYPE=disk")) {
+ /* Found it */
+ if (snprintf(finfo.dev_queue_read_ahead_path,
+ sizeof(finfo.dev_queue_read_ahead_path),
+ "/sys/dev/block/%d:%d/queue/read_ahead_kb",
+ major(path_stat.st_dev), minor(path_stat.st_dev))
+ >= sizeof(finfo.dev_queue_read_ahead_path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ return;
+ }
+ if (!strstr(buf, "DEVTYPE=partition")) {
+ printf("%s: Unknown device type: %s\n", __func__, path);
+ exit(EXIT_FAILURE);
+ }
+ /*
+ * Partition of block device - need to find actual device.
+ * Using naming convention that devnameN is partition of
+ * device devname.
+ */
+ str = strstr(buf, "DEVNAME=");
+ if (!str) {
+		printf("%s: Could not read: %s\n", __func__, path);
+ exit(EXIT_FAILURE);
+ }
+ str += 8;
+ end = str;
+ while (*end) {
+ if (isdigit(*end)) {
+ *end = '\0';
+ if (snprintf(finfo.dev_queue_read_ahead_path,
+ sizeof(finfo.dev_queue_read_ahead_path),
+ "/sys/block/%s/queue/read_ahead_kb",
+ str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ return;
+ }
+ ++end;
+ }
+ printf("%s: Could not read: %s\n", __func__, path);
+ exit(EXIT_FAILURE);
+}
+
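+/*
+ * Scan /proc/self/smaps for the VMA beginning at @addr and return true
+ * iff its "Swap:" field reports exactly @size bytes swapped out.
+ */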
+static bool check_swap(void *addr, unsigned long size)
+{
+ bool swap = false;
+ int ret;
+ FILE *fp;
+ char buffer[MAX_LINE_LENGTH];
+ char addr_pattern[MAX_LINE_LENGTH];
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+ (unsigned long) addr);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ fp = fopen(PID_SMAPS, "r");
+ if (!fp) {
+ printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+ exit(EXIT_FAILURE);
+ }
+ if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
+ goto err_out;
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
+ size >> 10);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+	/*
+	 * Fetch the Swap: field in the same smaps block and check whether
+	 * it reports the expected amount of swapped-out memory.
+	 */
+ if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
+ goto err_out;
+
+ if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+ goto err_out;
+
+ swap = true;
+err_out:
+ fclose(fp);
+ return swap;
+}
+
+static void *alloc_mapping(int nr)
+{
+ void *p;
+
+ p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (p != BASE_ADDR) {
+ printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+ exit(EXIT_FAILURE);
+ }
+
+ return p;
+}
+
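+/*
+ * Tag the first int of every page in [start, end) with a marker derived
+ * from the page index; validate_memory() later checks these markers to
+ * detect corruption across collapse operations.
+ */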
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+ int i;
+
+ for (i = start / page_size; i < end / page_size; i++)
+ p[i * page_size / sizeof(*p)] = i + 0xdead0000;
+}
+
+/*
+ * MADV_COLLAPSE is a best-effort request and may fail if an internal
+ * resource is temporarily unavailable, in which case it will set errno to
+ * EAGAIN. In such a case, immediately reattempt the operation one more
+ * time.
+ */
+static int madvise_collapse_retry(void *p, unsigned long size)
+{
+ bool retry = true;
+ int ret;
+
+retry:
+ ret = madvise(p, size, MADV_COLLAPSE);
+ if (ret && errno == EAGAIN && retry) {
+ retry = false;
+ goto retry;
+ }
+ return ret;
+}
+
+/*
+ * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
+ * validate_memory()'able contents.
+ */
+static void *alloc_hpage(struct mem_ops *ops)
+{
+ void *p = ops->setup_area(1);
+
+ ops->fault(p, 0, hpage_pmd_size);
+
+ /*
+ * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
+ * The latter is ineligible for collapse by MADV_COLLAPSE
+ * while the former might cause MADV_COLLAPSE to race with
+ * khugepaged on low-load system (like a test machine), which
+ * would cause MADV_COLLAPSE to fail with EAGAIN.
+ */
+ printf("Allocate huge page...");
+ if (madvise_collapse_retry(p, hpage_pmd_size)) {
+ perror("madvise(MADV_COLLAPSE)");
+ exit(EXIT_FAILURE);
+ }
+ if (!ops->check_huge(p, 1)) {
+ perror("madvise(MADV_COLLAPSE)");
+ exit(EXIT_FAILURE);
+ }
+ if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
+ perror("madvise(MADV_HUGEPAGE)");
+ exit(EXIT_FAILURE);
+ }
+ success("OK");
+ return p;
+}
+
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+ int i;
+
+ for (i = start / page_size; i < end / page_size; i++) {
+ if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
+ printf("Page %d is corrupted: %#x\n",
+ i, p[i * page_size / sizeof(*p)]);
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+static void *anon_setup_area(int nr_hpages)
+{
+ return alloc_mapping(nr_hpages);
+}
+
+static void anon_cleanup_area(void *p, unsigned long size)
+{
+ munmap(p, size);
+}
+
+static void anon_fault(void *p, unsigned long start, unsigned long end)
+{
+ fill_memory(p, start, end);
+}
+
+static bool anon_check_huge(void *addr, int nr_hpages)
+{
+ return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
+}
+
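+/*
+ * Write the test pattern out to a file in the target directory, then map
+ * it back read-only with PROT_EXEC: CONFIG_READ_ONLY_THP_FOR_FS only
+ * allows khugepaged to collapse read-only, executable file mappings.
+ * The page cache is dropped afterwards so that only pages we explicitly
+ * fault in via mem_ops->fault() are present.
+ */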
+static void *file_setup_area(int nr_hpages)
+{
+ int fd;
+ void *p;
+ unsigned long size;
+
+ unlink(finfo.path); /* Cleanup from previous failed tests */
+ printf("Creating %s for collapse%s...", finfo.path,
+ finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
+	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
+		  0777);
+ if (fd < 0) {
+ perror("open()");
+ exit(EXIT_FAILURE);
+ }
+
+ size = nr_hpages * hpage_pmd_size;
+ p = alloc_mapping(nr_hpages);
+ fill_memory(p, 0, size);
+ write(fd, p, size);
+ close(fd);
+ munmap(p, size);
+ success("OK");
+
+ printf("Opening %s read only for collapse...", finfo.path);
+	finfo.fd = open(finfo.path, O_RDONLY);
+ if (finfo.fd < 0) {
+ perror("open()");
+ exit(EXIT_FAILURE);
+ }
+ p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
+ MAP_PRIVATE, finfo.fd, 0);
+ if (p == MAP_FAILED || p != BASE_ADDR) {
+ perror("mmap()");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Drop page cache */
+ write_file("/proc/sys/vm/drop_caches", "3", 2);
+ success("OK");
+ return p;
+}
+
+static void file_cleanup_area(void *p, unsigned long size)
+{
+ munmap(p, size);
+ close(finfo.fd);
+ unlink(finfo.path);
+}
+
+static void file_fault(void *p, unsigned long start, unsigned long end)
+{
+ if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
+ perror("madvise(MADV_POPULATE_READ");
+ exit(EXIT_FAILURE);
+ }
+}
+
+static bool file_check_huge(void *addr, int nr_hpages)
+{
+ switch (finfo.type) {
+ case VMA_FILE:
+ return check_huge_file(addr, nr_hpages, hpage_pmd_size);
+ case VMA_SHMEM:
+ return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
+ default:
+ exit(EXIT_FAILURE);
+ return false;
+ }
+}
+
+static void *shmem_setup_area(int nr_hpages)
+{
+ void *p;
+ unsigned long size = nr_hpages * hpage_pmd_size;
+
+ finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
+ if (finfo.fd < 0) {
+ perror("memfd_create()");
+ exit(EXIT_FAILURE);
+ }
+ if (ftruncate(finfo.fd, size)) {
+ perror("ftruncate()");
+ exit(EXIT_FAILURE);
+ }
+ p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
+ 0);
+ if (p != BASE_ADDR) {
+ perror("mmap()");
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
+
+static void shmem_cleanup_area(void *p, unsigned long size)
+{
+ munmap(p, size);
+ close(finfo.fd);
+}
+
+static bool shmem_check_huge(void *addr, int nr_hpages)
+{
+ return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
+}
+
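+/*
+ * Per-memory-type operation tables. Note that __shmem_ops reuses
+ * anon_fault(): the MAP_SHARED mapping is populated by writing to it.
+ */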
+static struct mem_ops __anon_ops = {
+ .setup_area = &anon_setup_area,
+ .cleanup_area = &anon_cleanup_area,
+ .fault = &anon_fault,
+ .check_huge = &anon_check_huge,
+ .name = "anon",
+};
+
+static struct mem_ops __file_ops = {
+ .setup_area = &file_setup_area,
+ .cleanup_area = &file_cleanup_area,
+ .fault = &file_fault,
+ .check_huge = &file_check_huge,
+ .name = "file",
+};
+
+static struct mem_ops __shmem_ops = {
+ .setup_area = &shmem_setup_area,
+ .cleanup_area = &shmem_cleanup_area,
+ .fault = &anon_fault,
+ .check_huge = &shmem_check_huge,
+ .name = "shmem",
+};
+
+static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect)
+{
+ int ret;
+ struct thp_settings settings = *thp_current_settings();
+
+ printf("%s...", msg);
+
+ /*
+ * Prevent khugepaged interference and tests that MADV_COLLAPSE
+ * ignores /sys/kernel/mm/transparent_hugepage/enabled
+ */
+ settings.thp_enabled = THP_NEVER;
+ settings.shmem_enabled = SHMEM_NEVER;
+ thp_push_settings(&settings);
+
+ /* Clear VM_NOHUGEPAGE */
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+ ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
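+	/* madvise() returns 0 on success, so (bool)ret must be the inverse of @expect */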
+ if (((bool)ret) == expect)
+ fail("Fail: Bad return value");
+ else if (!ops->check_huge(p, expect ? nr_hpages : 0))
+ fail("Fail: check_huge()");
+ else
+ success("OK");
+
+ thp_pop_settings();
+}
+
+static void madvise_collapse(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect)
+{
+ /* Sanity check */
+ if (!ops->check_huge(p, 0)) {
+ printf("Unexpected huge page\n");
+ exit(EXIT_FAILURE);
+ }
+ __madvise_collapse(msg, p, nr_hpages, ops, expect);
+}
+
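+/*
+ * Poll for khugepaged to collapse the range: check every TICK (500ms),
+ * giving up after ~3 seconds or once khugepaged has completed two more
+ * full scans, whichever comes first. Returns true on timeout.
+ */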
+#define TICK 500000
+static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops)
+{
+ int full_scans;
+ int timeout = 6; /* 3 seconds */
+
+ /* Sanity check */
+ if (!ops->check_huge(p, 0)) {
+ printf("Unexpected huge page\n");
+ exit(EXIT_FAILURE);
+ }
+
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+
+	/* Wait until at least two more full scans have completed */
+ full_scans = thp_read_num("khugepaged/full_scans") + 2;
+
+ printf("%s...", msg);
+ while (timeout--) {
+ if (ops->check_huge(p, nr_hpages))
+ break;
+ if (thp_read_num("khugepaged/full_scans") >= full_scans)
+ break;
+ printf(".");
+ usleep(TICK);
+ }
+
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
+
+ return timeout == -1;
+}
+
+static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect)
+{
+ if (wait_for_scan(msg, p, nr_hpages, ops)) {
+ if (expect)
+ fail("Timeout");
+ else
+ success("OK");
+ return;
+ }
+
+ /*
+ * For file and shmem memory, khugepaged only retracts pte entries after
+ * putting the new hugepage in the page cache. The hugepage must be
+ * subsequently refaulted to install the pmd mapping for the mm.
+ */
+ if (ops != &__anon_ops)
+ ops->fault(p, 0, nr_hpages * hpage_pmd_size);
+
+ if (ops->check_huge(p, expect ? nr_hpages : 0))
+ success("OK");
+ else
+ fail("Fail");
+}
+
+static struct collapse_context __khugepaged_context = {
+ .collapse = &khugepaged_collapse,
+ .enforce_pte_scan_limits = true,
+ .name = "khugepaged",
+};
+
+static struct collapse_context __madvise_context = {
+ .collapse = &madvise_collapse,
+ .enforce_pte_scan_limits = false,
+ .name = "madvise",
+};
+
+static bool is_tmpfs(struct mem_ops *ops)
+{
+ return ops == &__file_ops && finfo.type == VMA_SHMEM;
+}
+
+static bool is_anon(struct mem_ops *ops)
+{
+ return ops == &__anon_ops;
+}
+
+static void alloc_at_fault(void)
+{
+ struct thp_settings settings = *thp_current_settings();
+ char *p;
+
+ settings.thp_enabled = THP_ALWAYS;
+ thp_push_settings(&settings);
+
+ p = alloc_mapping(1);
+ *p = 1;
+ printf("Allocate huge page on fault...");
+ if (check_huge_anon(p, 1, hpage_pmd_size))
+ success("OK");
+ else
+ fail("Fail");
+
+ thp_pop_settings();
+
+ madvise(p, page_size, MADV_DONTNEED);
+ printf("Split huge PMD on MADV_DONTNEED...");
+ if (check_huge_anon(p, 0, hpage_pmd_size))
+ success("OK");
+ else
+ fail("Fail");
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+ int nr_hpages = 4;
+ unsigned long size = nr_hpages * hpage_pmd_size;
+
+ p = ops->setup_area(nr_hpages);
+ ops->fault(p, 0, size);
+ c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
+ ops, true);
+ validate_memory(p, 0, size);
+ ops->cleanup_area(p, size);
+}
+
+static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, page_size);
+ c->collapse("Collapse PTE table with single PTE entry present", p,
+ 1, ops, true);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
+{
+ int max_ptes_none = hpage_pmd_nr / 2;
+ struct thp_settings settings = *thp_current_settings();
+ void *p;
+ int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;
+
+ settings.khugepaged.max_ptes_none = max_ptes_none;
+ thp_push_settings(&settings);
+
+ p = ops->setup_area(1);
+
+ if (is_tmpfs(ops)) {
+ /* shmem pages always in the page cache */
+ printf("tmpfs...");
+ skip("Skip");
+ goto skip;
+ }
+
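+	/*
+	 * Fault in enough pages that the number of empty PTEs exceeds
+	 * max_ptes_none by one fault unit: khugepaged must refuse the
+	 * collapse, while MADV_COLLAPSE (which ignores the limit) should
+	 * succeed.
+	 */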
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
+ c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
+ ops, !c->enforce_pte_scan_limits);
+ validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
+
+ if (c->enforce_pte_scan_limits) {
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+ c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
+ true);
+ validate_memory(p, 0,
+ (hpage_pmd_nr - max_ptes_none) * page_size);
+ }
+skip:
+ ops->cleanup_area(p, hpage_pmd_size);
+ thp_pop_settings();
+}
+
+static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, hpage_pmd_size);
+
+ printf("Swapout one page...");
+ if (madvise(p, page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
+ true);
+ validate_memory(p, 0, hpage_pmd_size);
+out:
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
+{
+ int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, hpage_pmd_size);
+
+ printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+ if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
+ !c->enforce_pte_scan_limits);
+ validate_memory(p, 0, hpage_pmd_size);
+
+ if (c->enforce_pte_scan_limits) {
+ ops->fault(p, 0, hpage_pmd_size);
+ printf("Swapout %d of %d pages...", max_ptes_swap,
+ hpage_pmd_nr);
+ if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, max_ptes_swap * page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ c->collapse("Collapse with max_ptes_swap pages swapped out", p,
+ 1, ops, true);
+ validate_memory(p, 0, hpage_pmd_size);
+ }
+out:
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = alloc_hpage(ops);
+
+ if (is_tmpfs(ops)) {
+ /* MADV_DONTNEED won't evict tmpfs pages */
+ printf("tmpfs...");
+ skip("Skip");
+ goto skip;
+ }
+
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ printf("Split huge page leaving single PTE mapping compound page...");
+ madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse PTE table with single PTE mapping compound page",
+ p, 1, ops, true);
+ validate_memory(p, 0, page_size);
+skip:
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = alloc_hpage(ops);
+ printf("Split huge page leaving single PTE page table full of compound pages...");
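+	/*
+	 * Splitting the VMA at a page boundary forces the huge PMD to be
+	 * split as well, leaving the compound page PTE-mapped; the second
+	 * call then marks the entire range VM_NOHUGEPAGE.
+	 */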
+ madvise(p, page_size, MADV_NOHUGEPAGE);
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
+ true);
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+ int i;
+
+ p = ops->setup_area(1);
+ for (i = 0; i < hpage_pmd_nr; i++) {
+ printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+ i + 1, hpage_pmd_nr);
+
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+ ops->fault(BASE_ADDR, 0, hpage_pmd_size);
+ if (!ops->check_huge(BASE_ADDR, 1)) {
+ printf("Failed to allocate huge page\n");
+ exit(EXIT_FAILURE);
+ }
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
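+		/*
+		 * Shrink-move the accumulated pages plus page 0 of the
+		 * fresh hugepage out of the way (mremap() to a smaller
+		 * size unmaps the tail), then move them back one page
+		 * lower while growing the mapping, so that
+		 * [BASE_ADDR, BASE_ADDR + hpage_pmd_size) is free for the
+		 * next iteration's hugepage.
+		 */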
+ p = mremap(BASE_ADDR - i * page_size,
+ i * page_size + hpage_pmd_size,
+ (i + 1) * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR + 2 * hpage_pmd_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+unmap");
+ exit(EXIT_FAILURE);
+ }
+
+ p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+ (i + 1) * page_size,
+ (i + 1) * page_size + hpage_pmd_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR - (i + 1) * page_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+alloc");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
+ ops->fault(p, 0, hpage_pmd_size);
+ if (!ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse PTE table full of different compound pages", p, 1,
+ ops, true);
+
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
+{
+ int wstatus;
+ void *p;
+
+ p = ops->setup_area(1);
+
+ printf("Allocate small page...");
+ ops->fault(p, 0, page_size);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Share small page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ ops->fault(p, page_size, 2 * page_size);
+ c->collapse("Collapse PTE table with single page shared with parent process",
+ p, 1, ops, true);
+
+ validate_memory(p, 0, page_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has small page...");
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, page_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+ int wstatus;
+ void *p;
+
+ p = alloc_hpage(ops);
+ printf("Share huge page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Split huge page PMD in child process...");
+ madvise(p, page_size, MADV_NOHUGEPAGE);
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+ ops->fault(p, 0, page_size);
+
+ thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
+ c->collapse("Collapse PTE table full of compound pages in child",
+ p, 1, ops, true);
+ thp_write_num("khugepaged/max_ptes_shared",
+ thp_current_settings()->khugepaged.max_ptes_shared);
+
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has huge page...");
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
+{
+ int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
+ int wstatus;
+ void *p;
+
+ p = alloc_hpage(ops);
+ printf("Share huge page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Trigger CoW on page %d of %d...",
+ hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
+ 1, ops, !c->enforce_pte_scan_limits);
+
+ if (c->enforce_pte_scan_limits) {
+ printf("Trigger CoW on page %d of %d...",
+ hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
+ page_size);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse with max_ptes_shared PTEs shared",
+ p, 1, ops, true);
+ }
+
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has huge page...");
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void madvise_collapse_existing_thps(struct collapse_context *c,
+ struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, hpage_pmd_size);
+ c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
+ validate_memory(p, 0, hpage_pmd_size);
+
+ /* c->collapse() will find a hugepage and complain - call directly. */
+ __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Test race with khugepaged where page tables have been retracted and
+ * pmd cleared.
+ */
+static void madvise_retracted_page_tables(struct collapse_context *c,
+ struct mem_ops *ops)
+{
+ void *p;
+ int nr_hpages = 1;
+ unsigned long size = nr_hpages * hpage_pmd_size;
+
+ p = ops->setup_area(nr_hpages);
+ ops->fault(p, 0, size);
+
+ /* Let khugepaged collapse and leave pmd cleared */
+ if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
+ ops)) {
+ fail("Timeout");
+ return;
+ }
+ success("OK");
+ c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
+ true);
+ validate_memory(p, 0, size);
+ ops->cleanup_area(p, size);
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n");
+ fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
+ fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
+ fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
+	fprintf(stderr,	"\n\t\"file\" and \"all\" mem_types require the [dir] argument\n");
+	fprintf(stderr,	"\n\t\"file\" and \"all\" mem_types require a kernel built with\n");
+	fprintf(stderr,	"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
+ fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
+ fprintf(stderr, "\tmounted with huge=advise option for khugepaged tests to work\n");
+ fprintf(stderr, "\n\tSupported Options:\n");
+ fprintf(stderr, "\t\t-h: This help message.\n");
+ fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n");
+ fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n");
+ exit(1);
+}
+
+static void parse_test_type(int argc, char **argv)
+{
+ int opt;
+ char *buf;
+ const char *token;
+
+ while ((opt = getopt(argc, argv, "s:h")) != -1) {
+ switch (opt) {
+ case 's':
+ anon_order = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ usage();
+ }
+ }
+
+ argv += optind;
+ argc -= optind;
+
+ if (argc == 0) {
+ /* Backwards compatibility */
+ khugepaged_context = &__khugepaged_context;
+ madvise_context = &__madvise_context;
+ anon_ops = &__anon_ops;
+ return;
+ }
+
+ buf = strdup(argv[0]);
+ token = strsep(&buf, ":");
+
+ if (!strcmp(token, "all")) {
+ khugepaged_context = &__khugepaged_context;
+ madvise_context = &__madvise_context;
+ } else if (!strcmp(token, "khugepaged")) {
+ khugepaged_context = &__khugepaged_context;
+ } else if (!strcmp(token, "madvise")) {
+ madvise_context = &__madvise_context;
+ } else {
+ usage();
+ }
+
+ if (!buf)
+ usage();
+
+ if (!strcmp(buf, "all")) {
+ file_ops = &__file_ops;
+ anon_ops = &__anon_ops;
+ shmem_ops = &__shmem_ops;
+ } else if (!strcmp(buf, "anon")) {
+ anon_ops = &__anon_ops;
+ } else if (!strcmp(buf, "file")) {
+ file_ops = &__file_ops;
+ } else if (!strcmp(buf, "shmem")) {
+ shmem_ops = &__shmem_ops;
+ } else {
+ usage();
+ }
+
+ if (!file_ops)
+ return;
+
+ if (argc != 2)
+ usage();
+
+ get_finfo(argv[1]);
+}
+
+int main(int argc, char **argv)
+{
+ int hpage_pmd_order;
+ struct thp_settings default_settings = {
+ .thp_enabled = THP_MADVISE,
+ .thp_defrag = THP_DEFRAG_ALWAYS,
+ .shmem_enabled = SHMEM_ADVISE,
+ .use_zero_page = 0,
+ .khugepaged = {
+ .defrag = 1,
+ .alloc_sleep_millisecs = 10,
+ .scan_sleep_millisecs = 10,
+ },
+ /*
+ * When testing file-backed memory, the collapse path
+ * looks at how many pages are found in the page cache, not
+ * what pages are mapped. Disable read ahead optimization so
+ * pages don't find their way into the page cache unless
+ * we mem_ops->fault() them in.
+ */
+ .read_ahead_kb = 0,
+ };
+
+ parse_test_type(argc, argv);
+
+ setbuf(stdout, NULL);
+
+ page_size = getpagesize();
+ hpage_pmd_size = read_pmd_pagesize();
+ if (!hpage_pmd_size) {
+		printf("Reading PMD pagesize failed\n");
+ exit(EXIT_FAILURE);
+ }
+ hpage_pmd_nr = hpage_pmd_size / page_size;
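+	/* hpage_pmd_nr is a power of two, so ctz() gives its order (log2) */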
+ hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);
+
+ default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
+ default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+ default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
+ default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
+ default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
+ default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
+ default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
+ default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;
+
+ save_settings();
+ thp_push_settings(&default_settings);
+
+ alloc_at_fault();
+
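+/*
+ * Run test @t only when both the collapse context @c and the memory-type
+ * ops @o were selected on the command line.
+ */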
+#define TEST(t, c, o) do { \
+ if (c && o) { \
+ printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
+ t(c, o); \
+ } \
+ } while (0)
+
+ TEST(collapse_full, khugepaged_context, anon_ops);
+ TEST(collapse_full, khugepaged_context, file_ops);
+ TEST(collapse_full, khugepaged_context, shmem_ops);
+ TEST(collapse_full, madvise_context, anon_ops);
+ TEST(collapse_full, madvise_context, file_ops);
+ TEST(collapse_full, madvise_context, shmem_ops);
+
+ TEST(collapse_empty, khugepaged_context, anon_ops);
+ TEST(collapse_empty, madvise_context, anon_ops);
+
+ TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
+ TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
+ TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
+ TEST(collapse_single_pte_entry, madvise_context, anon_ops);
+ TEST(collapse_single_pte_entry, madvise_context, file_ops);
+ TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
+
+ TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
+ TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
+ TEST(collapse_max_ptes_none, madvise_context, anon_ops);
+ TEST(collapse_max_ptes_none, madvise_context, file_ops);
+
+ TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
+ TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
+ TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
+ TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
+
+ TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
+ TEST(collapse_full_of_compound, khugepaged_context, file_ops);
+ TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
+ TEST(collapse_full_of_compound, madvise_context, anon_ops);
+ TEST(collapse_full_of_compound, madvise_context, file_ops);
+ TEST(collapse_full_of_compound, madvise_context, shmem_ops);
+
+ TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
+ TEST(collapse_compound_extreme, madvise_context, anon_ops);
+
+ TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
+ TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
+
+ TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
+ TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
+
+ TEST(collapse_fork, khugepaged_context, anon_ops);
+ TEST(collapse_fork, madvise_context, anon_ops);
+
+ TEST(collapse_fork_compound, khugepaged_context, anon_ops);
+ TEST(collapse_fork_compound, madvise_context, anon_ops);
+
+ TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
+ TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
+
+ TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
+ TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
+ TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
+
+ TEST(madvise_retracted_page_tables, madvise_context, file_ops);
+ TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
+
+ restore_settings(0);
+}