diff options
Diffstat (limited to '')
65 files changed, 23086 insertions, 159 deletions
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore new file mode 100644 index 000000000000..0b9ab987601c --- /dev/null +++ b/tools/testing/selftests/mm/.gitignore @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: GPL-2.0-only +cow +hugepage-mmap +hugepage-mremap +hugepage-shm +hugepage-vmemmap +hugetlb-madvise +hugetlb-read-hwpoison +khugepaged +map_hugetlb +map_populate +thuge-gen +compaction_test +migration +mlock2-tests +mrelease_test +mremap_dontunmap +mremap_test +on-fault-limit +transhuge-stress +pagemap_ioctl +*.tmp* +protection_keys +protection_keys_32 +protection_keys_64 +madv_populate +uffd-stress +uffd-unit-tests +mlock-intersect-test +mlock-random-test +virtual_address_range +gup_test +va_128TBswitch +map_fixed_noreplace +write_to_hugetlbfs +hmm-tests +memfd_secret +soft-dirty +split_huge_page_test +ksm_tests +local_config.h +local_config.mk +ksm_functional_tests +mdwe_test +gup_longterm +mkdirty +va_high_addr_switch +hugetlb_fault_after_madv +hugetlb_madv_vs_map +mseal_test +seal_elf diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile new file mode 100644 index 000000000000..3b49bc3d0a3b --- /dev/null +++ b/tools/testing/selftests/mm/Makefile @@ -0,0 +1,206 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for mm selftests + +LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h + +include local_config.mk + +ifeq ($(ARCH),) + +ifeq ($(CROSS_COMPILE),) +uname_M := $(shell uname -m 2>/dev/null || echo not) +else +uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') +endif +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/powerpc/') +endif + +# Without this, failed build products remain, with up-to-date timestamps, +# thus tricking Make (and you!) into believing that All Is Well, in subsequent +# make invocations: +.DELETE_ON_ERROR: + +# Avoid accidental wrong builds, due to built-in rules working just a little +# bit too well--but not quite as well as required for our situation here. +# +# In other words, "make $SOME_TEST" is supposed to fail to build at all, +# because this Makefile only supports either "make" (all), or "make /full/path". +# However, the built-in rules, if not suppressed, will pick up CFLAGS and the +# initial LDLIBS (but not the target-specific LDLIBS, because those are only +# set for the full path target!). This causes it to get pretty far into building +# things despite using incorrect values such as an *occasionally* incomplete +# LDLIBS. +MAKEFLAGS += --no-builtin-rules + +CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS = -lrt -lpthread -lm + +TEST_GEN_FILES = cow +TEST_GEN_FILES += compaction_test +TEST_GEN_FILES += gup_longterm +TEST_GEN_FILES += gup_test +TEST_GEN_FILES += hmm-tests +TEST_GEN_FILES += hugetlb-madvise +TEST_GEN_FILES += hugetlb-read-hwpoison +TEST_GEN_FILES += hugepage-mmap +TEST_GEN_FILES += hugepage-mremap +TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap +TEST_GEN_FILES += khugepaged +TEST_GEN_FILES += madv_populate +TEST_GEN_FILES += map_fixed_noreplace +TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret +TEST_GEN_FILES += migration +TEST_GEN_FILES += mkdirty +TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += mrelease_test +TEST_GEN_FILES += mremap_dontunmap +TEST_GEN_FILES += mremap_test +TEST_GEN_FILES += mseal_test +TEST_GEN_FILES += seal_elf +TEST_GEN_FILES += on-fault-limit +TEST_GEN_FILES += pagemap_ioctl +TEST_GEN_FILES += thuge-gen +TEST_GEN_FILES += transhuge-stress +TEST_GEN_FILES += uffd-stress +TEST_GEN_FILES += uffd-unit-tests +TEST_GEN_FILES += split_huge_page_test +TEST_GEN_FILES += ksm_tests +TEST_GEN_FILES += ksm_functional_tests +TEST_GEN_FILES += mdwe_test +TEST_GEN_FILES += hugetlb_fault_after_madv +TEST_GEN_FILES += hugetlb_madv_vs_map + +ifneq ($(ARCH),arm64) +TEST_GEN_FILES += soft-dirty +endif + +ifeq ($(ARCH),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) + +VMTARGETS := protection_keys +BINARIES_32 := $(VMTARGETS:%=%_32) +BINARIES_64 := $(VMTARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else + +ifneq (,$(findstring $(ARCH),powerpc)) +TEST_GEN_FILES += protection_keys +endif + +endif + +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64)) +TEST_GEN_FILES += va_high_addr_switch +TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += write_to_hugetlbfs +endif + +TEST_PROGS := run_vmtests.sh + +TEST_FILES := test_vmalloc.sh +TEST_FILES += test_hmm.sh +TEST_FILES += va_high_addr_switch.sh +TEST_FILES += charge_reserved_hugetlb.sh +TEST_FILES += hugetlb_reparenting_test.sh + +# required by charge_reserved_hugetlb.sh +TEST_FILES += write_hugetlb_memory.sh + +include ../lib.mk + +$(TEST_GEN_PROGS): vm_util.c thp_settings.c +$(TEST_GEN_FILES): vm_util.c thp_settings.c + +$(OUTPUT)/uffd-stress: uffd-common.c +$(OUTPUT)/uffd-unit-tests: uffd-common.c + +ifeq ($(ARCH),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 -mxsave +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): $(OUTPUT)/%_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 -mxsave +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): $(OUTPUT)/%_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + +# IOURING_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/cow: LDLIBS += $(IOURING_EXTRA_LIBS) + +$(OUTPUT)/gup_longterm: LDLIBS += $(IOURING_EXTRA_LIBS) + +$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap + +$(OUTPUT)/ksm_tests: LDLIBS += -lnuma + +$(OUTPUT)/migration: LDLIBS += -lnuma + +local_config.mk local_config.h: check_config.sh + /bin/sh ./check_config.sh $(CC) + +EXTRA_CLEAN += local_config.mk local_config.h + +ifeq ($(IOURING_EXTRA_LIBS),) +all: warn_missing_liburing + +warn_missing_liburing: + @echo ; \ + echo "Warning: missing liburing support. Some tests will be skipped." ; \ + echo +endif diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index 18d33684faad..d680c00d2853 100644..100755 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -1,13 +1,18 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi +nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) + fault_limit_file=limit_in_bytes reservation_limit_file=rsvd.limit_in_bytes fault_usage_file=usage_in_bytes @@ -21,19 +26,23 @@ if [[ "$1" == "-cgroup-v2" ]]; then reservation_usage_file=rsvd.current fi -cgroup_path=/dev/cgroup/memory -if [[ ! -e $cgroup_path ]]; then - mkdir -p $cgroup_path - if [[ $cgroup2 ]]; then +if [[ $cgroup2 ]]; then + cgroup_path=$(mount -t cgroup2 | head -1 | awk '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory mount -t cgroup2 none $cgroup_path - else + do_umount=1 + fi + echo "+hugetlb" >$cgroup_path/cgroup.subtree_control +else + cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory mount -t cgroup memory,hugetlb $cgroup_path + do_umount=1 fi fi - -if [[ $cgroup2 ]]; then - echo "+hugetlb" >/dev/cgroup/memory/cgroup.subtree_control -fi +export cgroup_path function cleanup() { if [[ $cgroup2 ]]; then @@ -105,7 +114,7 @@ function setup_cgroup() { function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" # Wait for hugetlbfs memory to get depleted. while [ $(cat $path) != 0 ]; do echo Waiting for hugetlb memory to get depleted. @@ -118,7 +127,7 @@ function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" # Wait for hugetlbfs memory to get written. while [ $(cat $path) != $size ]; do echo Waiting for hugetlb memory reservation to reach size $size. @@ -131,7 +140,7 @@ function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" # Wait for hugetlbfs memory to get written. while [ $(cat $path) != $size ]; do echo Waiting for hugetlb memory to reach size $size. @@ -571,5 +580,9 @@ for populate in "" "-o"; do done # populate done # method -umount $cgroup_path -rmdir $cgroup_path +if [[ $do_umount ]]; then + umount $cgroup_path + rmdir $cgroup_path +fi + +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh new file mode 100755 index 000000000000..3954f4746161 --- /dev/null +++ b/tools/testing/selftests/mm/check_config.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Probe for libraries and create header files to record the results. Both C +# header files and Makefile include fragments are created. + +OUTPUT_H_FILE=local_config.h +OUTPUT_MKFILE=local_config.mk + +tmpname=$(mktemp) +tmpfile_c=${tmpname}.c +tmpfile_o=${tmpname}.o + +# liburing +echo "#include <sys/types.h>" > $tmpfile_c +echo "#include <liburing.h>" >> $tmpfile_c +echo "int func(void) { return 0; }" >> $tmpfile_c + +CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} +$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 + +if [ -f $tmpfile_o ]; then + echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE + echo "IOURING_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE +else + echo "// No liburing support found" > $OUTPUT_H_FILE + echo "# No liburing support found, so:" > $OUTPUT_MKFILE + echo "IOURING_EXTRA_LIBS = " >> $OUTPUT_MKFILE +fi + +rm ${tmpname}.* diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c new file mode 100644 index 000000000000..e140558e6f53 --- /dev/null +++ b/tools/testing/selftests/mm/compaction_test.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * A test for the patch "Allow compaction of unevictable pages". + * With this patch we should be able to allocate at least 1/4 + * of RAM in huge pages. Without the patch much less is + * allocated. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> + +#include "../kselftest.h" + +#define MAP_SIZE_MB 100 +#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) + +struct map_list { + void *map; + struct map_list *next; +}; + +int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) +{ + char buffer[256] = {0}; + char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; + FILE *cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + ksft_print_msg("Failed to read meminfo: %s\n", strerror(errno)); + return -1; + } + + pclose(cmdfile); + + *memfree = atoll(buffer); + cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; + cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + ksft_print_msg("Failed to read meminfo: %s\n", strerror(errno)); + return -1; + } + + pclose(cmdfile); + *hugepagesize = atoll(buffer); + + return 0; +} + +int prereq(void) +{ + char allowed; + int fd; + + fd = open("/proc/sys/vm/compact_unevictable_allowed", + O_RDONLY | O_NONBLOCK); + if (fd < 0) { + ksft_print_msg("Failed to open /proc/sys/vm/compact_unevictable_allowed: %s\n", + strerror(errno)); + return -1; + } + + if (read(fd, &allowed, sizeof(char)) != sizeof(char)) { + ksft_print_msg("Failed to read from /proc/sys/vm/compact_unevictable_allowed: %s\n", + strerror(errno)); + close(fd); + return -1; + } + + close(fd); + if (allowed == '1') + return 0; + + ksft_print_msg("Compaction isn't allowed\n"); + return -1; +} + +int check_compaction(unsigned long mem_free, unsigned long hugepage_size, + unsigned long initial_nr_hugepages) +{ + unsigned long nr_hugepages_ul; + int fd, ret = -1; + int compaction_index = 0; + char nr_hugepages[20] = {0}; + char init_nr_hugepages[20] = {0}; + + sprintf(init_nr_hugepages, "%lu", initial_nr_hugepages); + + /* We want to test with 80% of available memory. Else, OOM killer comes + in to play */ + mem_free = mem_free * 0.8; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); + if (fd < 0) { + ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + ret = -1; + goto out; + } + + /* Request a large number of huge pages. The Kernel will allocate + as much as it can */ + if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { + ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { + ksft_print_msg("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto close_fd; + } + + /* We should have been able to request at least 1/3 rd of the memory in + huge pages */ + nr_hugepages_ul = strtoul(nr_hugepages, NULL, 10); + if (!nr_hugepages_ul) { + ksft_print_msg("ERROR: No memory is available as huge pages\n"); + goto close_fd; + } + compaction_index = mem_free/(nr_hugepages_ul * hugepage_size); + + lseek(fd, 0, SEEK_SET); + + if (write(fd, init_nr_hugepages, strlen(init_nr_hugepages)) + != strlen(init_nr_hugepages)) { + ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto close_fd; + } + + ksft_print_msg("Number of huge pages allocated = %lu\n", + nr_hugepages_ul); + + if (compaction_index > 3) { + ksft_print_msg("ERROR: Less than 1/%d of memory is available\n" + "as huge pages\n", compaction_index); + goto close_fd; + } + + ret = 0; + + close_fd: + close(fd); + out: + ksft_test_result(ret == 0, "check_compaction\n"); + return ret; +} + +int set_zero_hugepages(unsigned long *initial_nr_hugepages) +{ + int fd, ret = -1; + char nr_hugepages[20] = {0}; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); + if (fd < 0) { + ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto out; + } + if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { + ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + /* Start with the initial condition of 0 huge pages */ + if (write(fd, "0", sizeof(char)) != sizeof(char)) { + ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + goto close_fd; + } + + *initial_nr_hugepages = strtoul(nr_hugepages, NULL, 10); + ret = 0; + + close_fd: + close(fd); + + out: + return ret; +} + +int main(int argc, char **argv) +{ + struct rlimit lim; + struct map_list *list = NULL, *entry; + size_t page_size, i; + void *map = NULL; + unsigned long mem_free = 0; + unsigned long hugepage_size = 0; + long mem_fragmentable_MB = 0; + unsigned long initial_nr_hugepages; + + ksft_print_header(); + + if (prereq() || geteuid()) + ksft_exit_skip("Prerequisites unsatisfied\n"); + + ksft_set_plan(1); + + /* Start the test without hugepages reducing mem_free */ + if (set_zero_hugepages(&initial_nr_hugepages)) + ksft_exit_fail(); + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &lim)) + ksft_exit_fail_msg("Failed to set rlimit: %s\n", strerror(errno)); + + page_size = getpagesize(); + + if (read_memory_info(&mem_free, &hugepage_size) != 0) + ksft_exit_fail_msg("Failed to get meminfo\n"); + + mem_fragmentable_MB = mem_free * 0.8 / 1024; + + while (mem_fragmentable_MB > 0) { + map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); + if (map == MAP_FAILED) + break; + + entry = malloc(sizeof(struct map_list)); + if (!entry) { + munmap(map, MAP_SIZE); + break; + } + entry->map = map; + entry->next = list; + list = entry; + + /* Write something (in this case the address of the map) to + * ensure that KSM can't merge the mapped pages + */ + for (i = 0; i < MAP_SIZE; i += page_size) + *(unsigned long *)(map + i) = (unsigned long)map + i; + + mem_fragmentable_MB -= MAP_SIZE_MB; + } + + for (entry = list; entry != NULL; entry = entry->next) { + munmap(entry->map, MAP_SIZE); + if (!entry->next) + break; + entry = entry->next; + } + + if (check_compaction(mem_free, hugepage_size, + initial_nr_hugepages) == 0) + ksft_exit_pass(); + + ksft_exit_fail(); +} diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config new file mode 100644 index 000000000000..4309916f629e --- /dev/null +++ b/tools/testing/selftests/mm/config @@ -0,0 +1,9 @@ +CONFIG_SYSVIPC=y +CONFIG_USERFAULTFD=y +CONFIG_PTE_MARKER_UFFD_WP=y +CONFIG_TEST_VMALLOC=m +CONFIG_DEVICE_PRIVATE=y +CONFIG_TEST_HMM=m +CONFIG_GUP_TEST=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_MEM_SOFT_DIRTY=y diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c new file mode 100644 index 000000000000..32c6ccc2a6be --- /dev/null +++ b/tools/testing/selftests/mm/cow.c @@ -0,0 +1,1819 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * COW (Copy On Write) tests. + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@redhat.com> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <assert.h> +#include <linux/mman.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/wait.h> +#include <linux/memfd.h> + +#include "local_config.h" +#ifdef LOCAL_CONFIG_HAVE_LIBURING +#include <liburing.h> +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +#include "../../../../mm/gup_test.h" +#include "../kselftest.h" +#include "vm_util.h" +#include "thp_settings.h" + +static size_t pagesize; +static int pagemap_fd; +static size_t pmdsize; +static int nr_thpsizes; +static size_t thpsizes[20]; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; +static int gup_fd; +static bool has_huge_zeropage; + +static int sz2ord(size_t size) +{ + return __builtin_ctzll(size / pagesize); +} + +static int detect_thp_sizes(size_t sizes[], int max) +{ + int count = 0; + unsigned long orders; + size_t kb; + int i; + + /* thp not supported at all. */ + if (!pmdsize) + return 0; + + orders = 1UL << sz2ord(pmdsize); + orders |= thp_supported_orders(); + + for (i = 0; orders && count < max; i++) { + if (!(orders & (1UL << i))) + continue; + orders &= ~(1UL << i); + kb = (pagesize >> 10) << i; + sizes[count++] = kb * 1024; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb); + } + + return count; +} + +static void detect_huge_zeropage(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", + O_RDONLY); + size_t enabled = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + enabled = strtoul(buf, NULL, 10); + if (enabled == 1) { + has_huge_zeropage = true; + ksft_print_msg("[INFO] huge zeropage is enabled\n"); + } + } + + close(fd); +} + +static bool range_is_swapped(void *addr, size_t size) +{ + for (; size; addr += pagesize, size -= pagesize) + if (!pagemap_is_swapped(pagemap_fd, addr)) + return false; + return true; +} + +struct comm_pipes { + int child_ready[2]; + int parent_ready[2]; +}; + +static int setup_comm_pipes(struct comm_pipes *comm_pipes) +{ + if (pipe(comm_pipes->child_ready) < 0) + return -errno; + if (pipe(comm_pipes->parent_ready) < 0) { + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + return -errno; + } + + return 0; +} + +static void close_comm_pipes(struct comm_pipes *comm_pipes) +{ + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + close(comm_pipes->parent_ready[0]); + close(comm_pipes->parent_ready[1]); +} + +static int child_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + char *old = malloc(size); + char buf; + + /* Backup the original content. */ + memcpy(old, mem, size); + + /* Wait until the parent modified the page. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values. */ + return memcmp(old, mem, size); +} + +static int child_vmsplice_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + char *old, *new; + int fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + /* Backup the original content. */ + memcpy(old, mem, size); + + if (pipe(fds) < 0) + return -errno; + + /* Trigger a read-only pin. */ + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred < 0) + return -errno; + if (transferred == 0) + return -EINVAL; + + /* Unmap it from our page tables. */ + if (munmap(mem, size) < 0) + return -errno; + + /* Wait until the parent modified it. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values via the pipe. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) + return -errno; + } + + return memcmp(old, new, transferred); +} + +typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); + +static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, + child_fn fn, bool xfail) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + exit(fn(mem, size, &comm_pipes)); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (do_mprotect) { + /* + * mprotect() optimizations might try avoiding + * write-faults by directly mapping pages writable. + */ + ret = mprotect(mem, size, PROT_READ); + ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + if (!ret) { + ksft_test_result_pass("No leak from parent into child\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from parent into child\n"); + } else { + ksft_test_result_fail("Leak from parent into child\n"); + } +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb) +{ + do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false); +} + +static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb) +{ + do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false); +} + +static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb) +{ + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn, + is_hugetlb); +} + +static void test_vmsplice_in_child_mprotect(char *mem, size_t size, + bool is_hugetlb) +{ + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn, + is_hugetlb); +} + +static void do_test_vmsplice_in_parent(char *mem, size_t size, + bool before_fork, bool xfail) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + struct comm_pipes comm_pipes; + char *old, *new; + int ret, fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + memcpy(old, mem, size); + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free; + } + + if (pipe(fds) < 0) { + ksft_test_result_fail("pipe() failed\n"); + goto close_comm_pipes; + } + + if (before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + goto close_pipe; + } + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_pipe; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + /* Modify page content in the child. */ + memset(mem, 0xff, size); + exit(0); + } + + if (!before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + wait(&ret); + goto close_pipe; + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + if (munmap(mem, size) < 0) { + ksft_test_result_fail("munmap() failed\n"); + goto close_pipe; + } + write(comm_pipes.parent_ready[1], "0", 1); + + /* Wait until the child is done writing. */ + wait(&ret); + if (!WIFEXITED(ret)) { + ksft_test_result_fail("wait() failed\n"); + goto close_pipe; + } + + /* See if we still read the old values. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) { + ksft_test_result_fail("read() failed\n"); + goto close_pipe; + } + } + + if (!memcmp(old, new, transferred)) { + ksft_test_result_pass("No leak from child into parent\n"); + } else if (xfail) { + /* + * With hugetlb, some vmsplice() tests are currently expected to + * fail because (a) harder to fix and (b) nobody really cares. + * Flag them as expected failure for now. + */ + ksft_test_result_xfail("Leak from child into parent\n"); + } else { + ksft_test_result_fail("Leak from child into parent\n"); + } +close_pipe: + close(fds[0]); + close(fds[1]); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free: + free(old); + free(new); +} + +static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb) +{ + do_test_vmsplice_in_parent(mem, size, true, is_hugetlb); +} + +static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb) +{ + do_test_vmsplice_in_parent(mem, size, false, is_hugetlb); +} + +#ifdef LOCAL_CONFIG_HAVE_LIBURING +static void do_test_iouring(char *mem, size_t size, bool use_fork) +{ + struct comm_pipes comm_pipes; + struct io_uring_cqe *cqe; + struct io_uring_sqe *sqe; + struct io_uring ring; + ssize_t cur, total; + struct iovec iov; + char *buf, *tmp; + int ret, fd; + FILE *file; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + goto close_comm_pipes; + } + fd = fileno(file); + assert(fd); + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + goto close_file; + } + + /* Skip on errors, as we might just lack kernel support. */ + ret = io_uring_queue_init(1, &ring, 0); + if (ret < 0) { + ksft_test_result_skip("io_uring_queue_init() failed\n"); + goto free_tmp; + } + + /* + * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN + * | FOLL_LONGTERM the range. + * + * Skip on errors, as we might just lack kernel support or might not + * have sufficient MEMLOCK permissions. + */ + iov.iov_base = mem; + iov.iov_len = size; + ret = io_uring_register_buffers(&ring, &iov, 1); + if (ret) { + ksft_test_result_skip("io_uring_register_buffers() failed\n"); + goto queue_exit; + } + + if (use_fork) { + /* + * fork() and keep the child alive until we're done. Note that + * we expect the pinned page to not get shared with the child. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto unregister_buffers; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + } else { + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto unregister_buffers; + } + } + + /* + * Modify the page and write page content as observed by the fixed + * buffer pin to the file so we can verify it. + */ + memset(mem, 0xff, size); + sqe = io_uring_get_sqe(&ring); + if (!sqe) { + ksft_test_result_fail("io_uring_get_sqe() failed\n"); + goto quit_child; + } + io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); + + ret = io_uring_submit(&ring); + if (ret < 0) { + ksft_test_result_fail("io_uring_submit() failed\n"); + goto quit_child; + } + + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret < 0) { + ksft_test_result_fail("io_uring_wait_cqe() failed\n"); + goto quit_child; + } + + if (cqe->res != size) { + ksft_test_result_fail("write_fixed failed\n"); + goto quit_child; + } + io_uring_cqe_seen(&ring, cqe); + + /* Read back the file content to the temporary buffer. */ + total = 0; + while (total < size) { + cur = pread(fd, tmp + total, size - total, total); + if (cur < 0) { + ksft_test_result_fail("pread() failed\n"); + goto quit_child; + } + total += cur; + } + + /* Finally, check if we read what we expected. */ + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/W pin is reliable\n"); + +quit_child: + if (use_fork) { + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + } +unregister_buffers: + io_uring_unregister_buffers(&ring); +queue_exit: + io_uring_queue_exit(&ring); +free_tmp: + free(tmp); +close_file: + fclose(file); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb) +{ + do_test_iouring(mem, size, false); +} + +static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb) +{ + do_test_iouring(mem, size, true); +} + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +enum ro_pin_test { + RO_PIN_TEST, + RO_PIN_TEST_SHARED, + RO_PIN_TEST_PREVIOUSLY_SHARED, + RO_PIN_TEST_RO_EXCLUSIVE, +}; + +static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, + bool fast) +{ + struct pin_longterm_test args; + struct comm_pipes comm_pipes; + char *tmp, buf; + __u64 tmp_val; + int ret; + + if (gup_fd < 0) { + ksft_test_result_skip("gup_test not available\n"); + return; + } + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + return; + } + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free_tmp; + } + + switch (test) { + case RO_PIN_TEST: + break; + case RO_PIN_TEST_SHARED: + case RO_PIN_TEST_PREVIOUSLY_SHARED: + /* + * Share the pages with our child. As the pages are not pinned, + * this should just work. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + /* Wait until our child is ready. */ + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { + /* + * Tell the child to quit now and wait until it quit. + * The pages should now be mapped R/O into our page + * tables, but they are no longer shared. + */ + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + } + break; + case RO_PIN_TEST_RO_EXCLUSIVE: + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Take a R/O pin. This should trigger unsharing. */ + args.addr = (__u64)(uintptr_t)mem; + args.size = size; + args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); + if (ret) { + if (errno == EINVAL) + ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); + else + ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); + goto wait; + } + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* + * Read back the content via the pin to the temporary buffer and + * test if we observed the modification. + */ + tmp_val = (__u64)(uintptr_t)tmp; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); + if (ret) + ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); + else + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/O pin is reliable\n"); + + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); + if (ret) + ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); +wait: + switch (test) { + case RO_PIN_TEST_SHARED: + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + break; + default: + break; + } +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free_tmp: + free(tmp); +} + +static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); +} + +static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); +} + +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); +} + +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size, + bool is_hugetlb) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); +} + +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); +} + +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size, + bool is_hugetlb) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); +} + +typedef void (*test_fn)(char *mem, size_t size, bool hugetlb); + +static void do_run_with_base_page(test_fn fn, bool swapout) +{ + char *mem; + int ret; + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); + /* Ignore if not around on a kernel. */ + if (ret && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto munmap; + } + + /* Populate a base page. */ + memset(mem, 0, pagesize); + + if (swapout) { + madvise(mem, pagesize, MADV_PAGEOUT); + if (!pagemap_is_swapped(pagemap_fd, mem)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + } + + fn(mem, pagesize, false); +munmap: + munmap(mem, pagesize); +} + +static void run_with_base_page(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with base page\n", desc); + do_run_with_base_page(fn, false); +} + +static void run_with_base_page_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc); + do_run_with_base_page(fn, true); +} + +enum thp_run { + THP_RUN_PMD, + THP_RUN_PMD_SWAPOUT, + THP_RUN_PTE, + THP_RUN_PTE_SWAPOUT, + THP_RUN_SINGLE_PTE, + THP_RUN_SINGLE_PTE_SWAPOUT, + THP_RUN_PARTIAL_MREMAP, + THP_RUN_PARTIAL_SHARED, +}; + +static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) +{ + char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; + size_t size, mmap_size, mremap_size; + int ret; + + /* For alignment purposes, we need twice the thp size. */ + mmap_size = 2 * thpsize; + mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* We need a THP-aligned memory area. */ + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + + ret = madvise(mem, thpsize, MADV_HUGEPAGE); + if (ret) { + ksft_test_result_fail("MADV_HUGEPAGE failed\n"); + goto munmap; + } + + /* + * Try to populate a THP. Touch the first sub-page and test if + * we get the last sub-page populated automatically. + */ + mem[0] = 0; + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + memset(mem, 0, thpsize); + + size = thpsize; + switch (thp_run) { + case THP_RUN_PMD: + case THP_RUN_PMD_SWAPOUT: + assert(thpsize == pmdsize); + break; + case THP_RUN_PTE: + case THP_RUN_PTE_SWAPOUT: + /* + * Trigger PTE-mapping the THP by temporarily mapping a single + * subpage R/O. This is a noop if the THP is not pmdsize (and + * therefore already PTE-mapped). + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + break; + case THP_RUN_SINGLE_PTE: + case THP_RUN_SINGLE_PTE_SWAPOUT: + /* + * Discard all but a single subpage of that PTE-mapped THP. What + * remains is a single PTE mapping a single subpage. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); + if (ret) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto munmap; + } + size = pagesize; + break; + case THP_RUN_PARTIAL_MREMAP: + /* + * Remap half of the THP. We need some new memory location + * for that. + */ + mremap_size = thpsize / 2; + mremap_mem = mmap(NULL, mremap_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + tmp = mremap(mem + mremap_size, mremap_size, mremap_size, + MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); + if (tmp != mremap_mem) { + ksft_test_result_fail("mremap() failed\n"); + goto munmap; + } + size = mremap_size; + break; + case THP_RUN_PARTIAL_SHARED: + /* + * Share the first page of the THP with a child and quit the + * child. This will result in some parts of the THP never + * have been shared. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto munmap; + } + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto munmap; + } else if (!ret) { + exit(0); + } + wait(&ret); + /* Allow for sharing all pages again. */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + goto munmap; + } + break; + default: + assert(false); + } + + switch (thp_run) { + case THP_RUN_PMD_SWAPOUT: + case THP_RUN_PTE_SWAPOUT: + case THP_RUN_SINGLE_PTE_SWAPOUT: + madvise(mem, size, MADV_PAGEOUT); + if (!range_is_swapped(mem, size)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + break; + default: + break; + } + + fn(mem, size, false); +munmap: + munmap(mmap_mem, mmap_size); + if (mremap_mem != MAP_FAILED) + munmap(mremap_mem, mremap_size); +} + +static void run_with_thp(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PMD, size); +} + +static void run_with_thp_swap(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size); +} + +static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PTE, size); +} + +static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size); +} + +static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size); +} + +static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size); +} + +static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size); +} + +static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size) +{ + ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size); +} + +static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) +{ + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; + char *mem, *dummy; + + ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; + + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + return; + } + + /* Populate an huge page. */ + memset(mem, 0, hugetlbsize); + + /* + * We need a total of two hugetlb pages to handle COW/unsharing + * properly, otherwise we might get zapped by a SIGBUS. + */ + dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (dummy == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto munmap; + } + munmap(dummy, hugetlbsize); + + fn(mem, hugetlbsize, true); +munmap: + munmap(mem, hugetlbsize); +} + +struct test_case { + const char *desc; + test_fn fn; +}; + +/* + * Test cases that are specific to anonymous pages: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_test_cases[] = { + /* + * Basic COW tests for fork() without any GUP. If we miss to break COW, + * either the child can observe modifications by the parent or the + * other way around. + */ + { + "Basic COW after fork()", + test_cow_in_parent, + }, + /* + * Basic test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "Basic COW after fork() with mprotect() optimization", + test_cow_in_parent_mprotect, + }, + /* + * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If + * we miss to break COW, the child observes modifications by the parent. + * This is CVE-2020-29374 reported by Jann Horn. + */ + { + "vmsplice() + unmap in child", + test_vmsplice_in_child, + }, + /* + * vmsplice() test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "vmsplice() + unmap in child with mprotect() optimization", + test_vmsplice_in_child_mprotect, + }, + /* + * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after + * fork(); modify in the child. If we miss to break COW, the parent + * observes modifications by the child. + */ + { + "vmsplice() before fork(), unmap in parent after fork()", + test_vmsplice_before_fork, + }, + /* + * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the + * child. If we miss to break COW, the parent observes modifications by + * the child. + */ + { + "vmsplice() + unmap in parent after fork()", + test_vmsplice_after_fork, + }, +#ifdef LOCAL_CONFIG_HAVE_LIBURING + /* + * Take a R/W longterm pin and then map the page R/O into the page + * table to trigger a write fault on next access. When modifying the + * page, the page content must be visible via the pin. + */ + { + "R/O-mapping a page registered as iouring fixed buffer", + test_iouring_ro, + }, + /* + * Take a R/W longterm pin and then fork() a child. When modifying the + * page, the page content must be visible via the pin. We expect the + * pinned page to not get shared with the child. + */ + { + "fork() with an iouring fixed buffer", + test_iouring_fork, + }, + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + /* + * Take a R/O longterm pin on a R/O-mapped shared anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped shared page", + test_ro_pin_on_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped shared page", + test_ro_fast_pin_on_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that + * was previously shared. When modifying the page via the page table, + * the page content change must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped previously-shared page", + test_ro_pin_on_ro_previously_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped previously-shared page", + test_ro_fast_pin_on_ro_previously_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped exclusive page", + test_ro_pin_on_ro_exclusive, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped exclusive page", + test_ro_fast_pin_on_ro_exclusive, + }, +}; + +static void run_anon_test_case(struct test_case const *test_case) +{ + int i; + + run_with_base_page(test_case->fn, test_case->desc); + run_with_base_page_swap(test_case->fn, test_case->desc); + for (i = 0; i < nr_thpsizes; i++) { + size_t size = thpsizes[i]; + struct thp_settings settings = *thp_current_settings(); + + settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER; + settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; + thp_push_settings(&settings); + + if (size == pmdsize) { + run_with_thp(test_case->fn, test_case->desc, size); + run_with_thp_swap(test_case->fn, test_case->desc, size); + } + + run_with_pte_mapped_thp(test_case->fn, test_case->desc, size); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size); + run_with_single_pte_of_thp(test_case->fn, test_case->desc, size); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size); + run_with_partial_mremap_thp(test_case->fn, test_case->desc, size); + run_with_partial_shared_thp(test_case->fn, test_case->desc, size); + + thp_pop_settings(); + } + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static void run_anon_test_cases(void) +{ + int i; + + ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) + run_anon_test_case(&anon_test_cases[i]); +} + +static int tests_per_anon_test_case(void) +{ + int tests = 2 + nr_hugetlbsizes; + + tests += 6 * nr_thpsizes; + if (pmdsize) + tests += 2; + return tests; +} + +enum anon_thp_collapse_test { + ANON_THP_COLLAPSE_UNSHARED, + ANON_THP_COLLAPSE_FULLY_SHARED, + ANON_THP_COLLAPSE_LOWER_SHARED, + ANON_THP_COLLAPSE_UPPER_SHARED, +}; + +static void do_test_anon_thp_collapse(char *mem, size_t size, + enum anon_thp_collapse_test test) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + /* + * Trigger PTE-mapping the THP by temporarily mapping a single subpage + * R/O, such that we can try collapsing it later. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + /* Collapse before actually COW-sharing the page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* COW-share the full PTE-mapped THP. */ + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* Don't COW-share the upper part of the THP. */ + ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + /* Don't COW-share the lower part of the THP. */ + ret = madvise(mem, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + case ANON_THP_COLLAPSE_FULLY_SHARED: + exit(child_memcmp_fn(mem, size, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + exit(child_memcmp_fn(mem + size / 2, size / 2, + &comm_pipes)); + break; + default: + assert(false); + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* + * Revert MADV_DONTFORK such that we merge the VMAs and are + * able to actually collapse. + */ + ret = madvise(mem, size, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + /* FALLTHROUGH */ + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* Collapse before anyone modified the COW-shared page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_anon_thp_collapse_unshared(char *mem, size_t size, + bool is_hugetlb) +{ + assert(!is_hugetlb); + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); +} + +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size, + bool is_hugetlb) +{ + assert(!is_hugetlb); + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); +} + +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size, + bool is_hugetlb) +{ + assert(!is_hugetlb); + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); +} + +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size, + bool is_hugetlb) +{ + assert(!is_hugetlb); + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); +} + +/* + * Test cases that are specific to anonymous THP: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_thp_test_cases[] = { + /* + * Basic COW test for fork() without any GUP when collapsing a THP + * before fork(). + * + * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place + * collapse") might easily get COW handling wrong when not collapsing + * exclusivity information properly. + */ + { + "Basic COW after fork() when collapsing before fork()", + test_anon_thp_collapse_unshared, + }, + /* Basic COW test, but collapse after COW-sharing a full THP. */ + { + "Basic COW after fork() when collapsing after fork() (fully shared)", + test_anon_thp_collapse_fully_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the lower half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (lower shared)", + test_anon_thp_collapse_lower_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the upper half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (upper shared)", + test_anon_thp_collapse_upper_shared, + }, +}; + +static void run_anon_thp_test_cases(void) +{ + int i; + + if (!pmdsize) + return; + + ksft_print_msg("[INFO] Anonymous THP tests\n"); + + for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { + struct test_case const *test_case = &anon_thp_test_cases[i]; + + ksft_print_msg("[RUN] %s\n", test_case->desc); + do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize); + } +} + +static int tests_per_anon_thp_test_case(void) +{ + return pmdsize ? 1 : 0; +} + +typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); + +static void test_cow(char *mem, const char *smem, size_t size) +{ + char *old = malloc(size); + + /* Backup the original content. */ + memcpy(old, smem, size); + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* See if we still read the old values via the other mapping. */ + ksft_test_result(!memcmp(smem, old, size), + "Other mapping not modified\n"); + free(old); +} + +static void test_ro_pin(char *mem, const char *smem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST, false); +} + +static void test_ro_fast_pin(char *mem, const char *smem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST, true); +} + +static void run_with_zeropage(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + + ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Read from the page to populate the shared zeropage. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +} + +static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, *mmap_mem, *mmap_smem, tmp; + size_t mmap_size; + int ret; + + ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); + + if (!has_huge_zeropage) { + ksft_test_result_skip("Huge zeropage not enabled\n"); + return; + } + + /* For alignment purposes, we need twice the thp size. */ + mmap_size = 2 * pmdsize; + mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + mmap_smem = mmap(NULL, mmap_size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_smem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* We need a THP-aligned memory area. */ + mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1)); + smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1)); + + ret = madvise(mem, pmdsize, MADV_HUGEPAGE); + ret |= madvise(smem, pmdsize, MADV_HUGEPAGE); + if (ret) { + ksft_test_result_fail("MADV_HUGEPAGE failed\n"); + goto munmap; + } + + /* + * Read from the memory to populate the huge shared zeropage. Read from + * the first sub-page and test if we get another sub-page populated + * automatically. + */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || + !pagemap_is_populated(pagemap_fd, smem + pagesize)) { + ksft_test_result_skip("Did not get THPs populated\n"); + goto munmap; + } + + fn(mem, smem, pmdsize); +munmap: + munmap(mmap_mem, mmap_size); + if (mmap_smem != MAP_FAILED) + munmap(mmap_smem, mmap_size); +} + +static void run_with_memfd(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + int fd; + + ksft_print_msg("[RUN] %s ... with memfd\n", desc); + + fd = memfd_create("test", 0); + if (fd < 0) { + ksft_test_result_fail("memfd_create() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, pagesize)) { + ksft_test_result_fail("fallocate() failed\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto close; + } + smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +close: + close(fd); +} + +static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + FILE *file; + int fd; + + ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + return; + } + + fd = fileno(file); + if (fd < 0) { + ksft_test_result_skip("fileno() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, pagesize)) { + ksft_test_result_fail("fallocate() failed\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto close; + } + smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +close: + fclose(file); +} + +static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, + size_t hugetlbsize) +{ + int flags = MFD_HUGETLB; + char *mem, *smem, tmp; + int fd; + + ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; + + fd = memfd_create("test", flags); + if (fd < 0) { + ksft_test_result_skip("memfd_create() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, hugetlbsize)) { + ksft_test_result_skip("need more free huge pages\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, + 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto close; + } + smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, hugetlbsize); +munmap: + munmap(mem, hugetlbsize); + if (mem != MAP_FAILED) + munmap(smem, hugetlbsize); +close: + close(fd); +} + +struct non_anon_test_case { + const char *desc; + non_anon_test_fn fn; +}; + +/* + * Test cases that target any pages in private mappings that are not anonymous: + * pages that may get shared via COW ndependent of fork(). This includes + * the shared zeropage(s), pagecache pages, ... + */ +static const struct non_anon_test_case non_anon_test_cases[] = { + /* + * Basic COW test without any GUP. If we miss to break COW, changes are + * visible via other private/shared mappings. + */ + { + "Basic COW", + test_cow, + }, + /* + * Take a R/O longterm pin. When modifying the page via the page table, + * the page content change must be visible via the pin. + */ + { + "R/O longterm GUP pin", + test_ro_pin, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O longterm GUP-fast pin", + test_ro_fast_pin, + }, +}; + +static void run_non_anon_test_case(struct non_anon_test_case const *test_case) +{ + int i; + + run_with_zeropage(test_case->fn, test_case->desc); + run_with_memfd(test_case->fn, test_case->desc); + run_with_tmpfile(test_case->fn, test_case->desc); + if (pmdsize) + run_with_huge_zeropage(test_case->fn, test_case->desc); + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_memfd_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static void run_non_anon_test_cases(void) +{ + int i; + + ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++) + run_non_anon_test_case(&non_anon_test_cases[i]); +} + +static int tests_per_non_anon_test_case(void) +{ + int tests = 3 + nr_hugetlbsizes; + + if (pmdsize) + tests += 1; + return tests; +} + +int main(int argc, char **argv) +{ + int err; + struct thp_settings default_settings; + + ksft_print_header(); + + pagesize = getpagesize(); + pmdsize = read_pmd_pagesize(); + if (pmdsize) { + /* Only if THP is supported. */ + thp_read_settings(&default_settings); + default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT; + thp_save_settings(); + thp_push_settings(&default_settings); + + ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", + pmdsize / 1024); + nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); + } + nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, + ARRAY_SIZE(hugetlbsizes)); + detect_huge_zeropage(); + + ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + + ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() + + ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + + run_anon_test_cases(); + run_anon_thp_test_cases(); + run_non_anon_test_cases(); + + if (pmdsize) { + /* Only if THP is supported. */ + thp_restore_settings(); + } + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c new file mode 100644 index 000000000000..9423ad439a61 --- /dev/null +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GUP long-term page pinning tests. + * + * Copyright 2023, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@redhat.com> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <assert.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/vfs.h> +#include <linux/magic.h> +#include <linux/memfd.h> + +#include "local_config.h" +#ifdef LOCAL_CONFIG_HAVE_LIBURING +#include <liburing.h> +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +#include "../../../../mm/gup_test.h" +#include "../kselftest.h" +#include "vm_util.h" + +static size_t pagesize; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; +static int gup_fd; + +static __fsword_t get_fs_type(int fd) +{ + struct statfs fs; + int ret; + + do { + ret = fstatfs(fd, &fs); + } while (ret && errno == EINTR); + + return ret ? 0 : fs.f_type; +} + +static bool fs_is_unknown(__fsword_t fs_type) +{ + /* + * We only support some filesystems in our tests when dealing with + * R/W long-term pinning. For these filesystems, we can be fairly sure + * whether they support it or not. + */ + switch (fs_type) { + case TMPFS_MAGIC: + case HUGETLBFS_MAGIC: + case BTRFS_SUPER_MAGIC: + case EXT4_SUPER_MAGIC: + case XFS_SUPER_MAGIC: + return false; + default: + return true; + } +} + +static bool fs_supports_writable_longterm_pinning(__fsword_t fs_type) +{ + assert(!fs_is_unknown(fs_type)); + switch (fs_type) { + case TMPFS_MAGIC: + case HUGETLBFS_MAGIC: + return true; + default: + return false; + } +} + +enum test_type { + TEST_TYPE_RO, + TEST_TYPE_RO_FAST, + TEST_TYPE_RW, + TEST_TYPE_RW_FAST, +#ifdef LOCAL_CONFIG_HAVE_LIBURING + TEST_TYPE_IOURING, +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ +}; + +static void do_test(int fd, size_t size, enum test_type type, bool shared) +{ + __fsword_t fs_type = get_fs_type(fd); + bool should_work; + char *mem; + int ret; + + if (ftruncate(fd, size)) { + ksft_test_result_fail("ftruncate() failed\n"); + return; + } + + if (fallocate(fd, 0, 0, size)) { + if (size == pagesize) + ksft_test_result_fail("fallocate() failed\n"); + else + ksft_test_result_skip("need more free huge pages\n"); + return; + } + + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, + shared ? MAP_SHARED : MAP_PRIVATE, fd, 0); + if (mem == MAP_FAILED) { + if (size == pagesize || shared) + ksft_test_result_fail("mmap() failed\n"); + else + ksft_test_result_skip("need more free huge pages\n"); + return; + } + + /* Fault in the page such that GUP-fast can pin it directly. */ + memset(mem, 0, size); + + switch (type) { + case TEST_TYPE_RO: + case TEST_TYPE_RO_FAST: + /* + * Cover more cases regarding unsharing decisions when + * long-term R/O pinning by mapping the page R/O. + */ + ret = mprotect(mem, size, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + /* FALLTHROUGH */ + case TEST_TYPE_RW: + case TEST_TYPE_RW_FAST: { + struct pin_longterm_test args; + const bool fast = type == TEST_TYPE_RO_FAST || + type == TEST_TYPE_RW_FAST; + const bool rw = type == TEST_TYPE_RW || + type == TEST_TYPE_RW_FAST; + + if (gup_fd < 0) { + ksft_test_result_skip("gup_test not available\n"); + break; + } + + if (rw && shared && fs_is_unknown(fs_type)) { + ksft_test_result_skip("Unknown filesystem\n"); + return; + } + /* + * R/O pinning or pinning in a private mapping is always + * expected to work. Otherwise, we expect long-term R/W pinning + * to only succeed for special fielesystems. + */ + should_work = !shared || !rw || + fs_supports_writable_longterm_pinning(fs_type); + + args.addr = (__u64)(uintptr_t)mem; + args.size = size; + args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; + args.flags |= rw ? PIN_LONGTERM_TEST_FLAG_USE_WRITE : 0; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); + if (ret && errno == EINVAL) { + ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); + break; + } else if (ret && errno == EFAULT) { + ksft_test_result(!should_work, "Should have failed\n"); + break; + } else if (ret) { + ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); + break; + } + + if (ioctl(gup_fd, PIN_LONGTERM_TEST_STOP)) + ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); + + /* + * TODO: if the kernel ever supports long-term R/W pinning on + * some previously unsupported filesystems, we might want to + * perform some additional tests for possible data corruptions. + */ + ksft_test_result(should_work, "Should have worked\n"); + break; + } +#ifdef LOCAL_CONFIG_HAVE_LIBURING + case TEST_TYPE_IOURING: { + struct io_uring ring; + struct iovec iov; + + /* io_uring always pins pages writable. */ + if (shared && fs_is_unknown(fs_type)) { + ksft_test_result_skip("Unknown filesystem\n"); + return; + } + should_work = !shared || + fs_supports_writable_longterm_pinning(fs_type); + + /* Skip on errors, as we might just lack kernel support. */ + ret = io_uring_queue_init(1, &ring, 0); + if (ret < 0) { + ksft_test_result_skip("io_uring_queue_init() failed\n"); + break; + } + /* + * Register the range as a fixed buffer. This will FOLL_WRITE | + * FOLL_PIN | FOLL_LONGTERM the range. + */ + iov.iov_base = mem; + iov.iov_len = size; + ret = io_uring_register_buffers(&ring, &iov, 1); + /* Only new kernels return EFAULT. */ + if (ret && (errno == ENOSPC || errno == EOPNOTSUPP || + errno == EFAULT)) { + ksft_test_result(!should_work, "Should have failed\n"); + } else if (ret) { + /* + * We might just lack support or have insufficient + * MEMLOCK limits. + */ + ksft_test_result_skip("io_uring_register_buffers() failed\n"); + } else { + ksft_test_result(should_work, "Should have worked\n"); + io_uring_unregister_buffers(&ring); + } + + io_uring_queue_exit(&ring); + break; + } +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + default: + assert(false); + } + +munmap: + munmap(mem, size); +} + +typedef void (*test_fn)(int fd, size_t size); + +static void run_with_memfd(test_fn fn, const char *desc) +{ + int fd; + + ksft_print_msg("[RUN] %s ... with memfd\n", desc); + + fd = memfd_create("test", 0); + if (fd < 0) { + ksft_test_result_fail("memfd_create() failed\n"); + return; + } + + fn(fd, pagesize); + close(fd); +} + +static void run_with_tmpfile(test_fn fn, const char *desc) +{ + FILE *file; + int fd; + + ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + return; + } + + fd = fileno(file); + if (fd < 0) { + ksft_test_result_fail("fileno() failed\n"); + goto close; + } + + fn(fd, pagesize); +close: + fclose(file); +} + +static void run_with_local_tmpfile(test_fn fn, const char *desc) +{ + char filename[] = __FILE__"_tmpfile_XXXXXX"; + int fd; + + ksft_print_msg("[RUN] %s ... with local tmpfile\n", desc); + + fd = mkstemp(filename); + if (fd < 0) { + ksft_test_result_fail("mkstemp() failed\n"); + return; + } + + if (unlink(filename)) { + ksft_test_result_fail("unlink() failed\n"); + goto close; + } + + fn(fd, pagesize); +close: + close(fd); +} + +static void run_with_memfd_hugetlb(test_fn fn, const char *desc, + size_t hugetlbsize) +{ + int flags = MFD_HUGETLB; + int fd; + + ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; + + fd = memfd_create("test", flags); + if (fd < 0) { + ksft_test_result_skip("memfd_create() failed\n"); + return; + } + + fn(fd, hugetlbsize); + close(fd); +} + +struct test_case { + const char *desc; + test_fn fn; +}; + +static void test_shared_rw_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RW, true); +} + +static void test_shared_rw_fast_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RW_FAST, true); +} + +static void test_shared_ro_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RO, true); +} + +static void test_shared_ro_fast_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RO_FAST, true); +} + +static void test_private_rw_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RW, false); +} + +static void test_private_rw_fast_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RW_FAST, false); +} + +static void test_private_ro_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RO, false); +} + +static void test_private_ro_fast_pin(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_RO_FAST, false); +} + +#ifdef LOCAL_CONFIG_HAVE_LIBURING +static void test_shared_iouring(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_IOURING, true); +} + +static void test_private_iouring(int fd, size_t size) +{ + do_test(fd, size, TEST_TYPE_IOURING, false); +} +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +static const struct test_case test_cases[] = { + { + "R/W longterm GUP pin in MAP_SHARED file mapping", + test_shared_rw_pin, + }, + { + "R/W longterm GUP-fast pin in MAP_SHARED file mapping", + test_shared_rw_fast_pin, + }, + { + "R/O longterm GUP pin in MAP_SHARED file mapping", + test_shared_ro_pin, + }, + { + "R/O longterm GUP-fast pin in MAP_SHARED file mapping", + test_shared_ro_fast_pin, + }, + { + "R/W longterm GUP pin in MAP_PRIVATE file mapping", + test_private_rw_pin, + }, + { + "R/W longterm GUP-fast pin in MAP_PRIVATE file mapping", + test_private_rw_fast_pin, + }, + { + "R/O longterm GUP pin in MAP_PRIVATE file mapping", + test_private_ro_pin, + }, + { + "R/O longterm GUP-fast pin in MAP_PRIVATE file mapping", + test_private_ro_fast_pin, + }, +#ifdef LOCAL_CONFIG_HAVE_LIBURING + { + "io_uring fixed buffer with MAP_SHARED file mapping", + test_shared_iouring, + }, + { + "io_uring fixed buffer with MAP_PRIVATE file mapping", + test_private_iouring, + }, +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ +}; + +static void run_test_case(struct test_case const *test_case) +{ + int i; + + run_with_memfd(test_case->fn, test_case->desc); + run_with_tmpfile(test_case->fn, test_case->desc); + run_with_local_tmpfile(test_case->fn, test_case->desc); + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_memfd_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static int tests_per_test_case(void) +{ + return 3 + nr_hugetlbsizes; +} + +int main(int argc, char **argv) +{ + int i, err; + + pagesize = getpagesize(); + nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, + ARRAY_SIZE(hugetlbsizes)); + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(test_cases) * tests_per_test_case()); + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + run_test_case(&test_cases[i]); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c new file mode 100644 index 000000000000..bdeaac67ff9a --- /dev/null +++ b/tools/testing/selftests/mm/gup_test.c @@ -0,0 +1,272 @@ +#define __SANE_USERSPACE_TYPES__ // Use ll64 +#include <fcntl.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <dirent.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <pthread.h> +#include <assert.h> +#include <mm/gup_test.h> +#include "../kselftest.h" +#include "vm_util.h" + +#define MB (1UL << 20) + +/* Just the flags we need, copied from mm.h: */ +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ + +#define GUP_TEST_FILE "/sys/kernel/debug/gup_test" + +static unsigned long cmd = GUP_FAST_BENCHMARK; +static int gup_fd, repeats = 1; +static unsigned long size = 128 * MB; +/* Serialize prints */ +static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; + +static char *cmd_to_str(unsigned long cmd) +{ + switch (cmd) { + case GUP_FAST_BENCHMARK: + return "GUP_FAST_BENCHMARK"; + case PIN_FAST_BENCHMARK: + return "PIN_FAST_BENCHMARK"; + case PIN_LONGTERM_BENCHMARK: + return "PIN_LONGTERM_BENCHMARK"; + case GUP_BASIC_TEST: + return "GUP_BASIC_TEST"; + case PIN_BASIC_TEST: + return "PIN_BASIC_TEST"; + case DUMP_USER_PAGES_TEST: + return "DUMP_USER_PAGES_TEST"; + } + return "Unknown command"; +} + +void *gup_thread(void *data) +{ + struct gup_test gup = *(struct gup_test *)data; + int i, status; + + /* Only report timing information on the *_BENCHMARK commands: */ + if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || + (cmd == PIN_LONGTERM_BENCHMARK)) { + for (i = 0; i < repeats; i++) { + gup.size = size; + status = ioctl(gup_fd, cmd, &gup); + if (status) + break; + + pthread_mutex_lock(&print_mutex); + ksft_print_msg("%s: Time: get:%lld put:%lld us", + cmd_to_str(cmd), gup.get_delta_usec, + gup.put_delta_usec); + if (gup.size != size) + ksft_print_msg(", truncated (size: %lld)", gup.size); + ksft_print_msg("\n"); + pthread_mutex_unlock(&print_mutex); + } + } else { + gup.size = size; + status = ioctl(gup_fd, cmd, &gup); + if (status) + goto return_; + + pthread_mutex_lock(&print_mutex); + ksft_print_msg("%s: done\n", cmd_to_str(cmd)); + if (gup.size != size) + ksft_print_msg("Truncated (size: %lld)\n", gup.size); + pthread_mutex_unlock(&print_mutex); + } + +return_: + ksft_test_result(!status, "ioctl status %d\n", status); + return NULL; +} + +int main(int argc, char **argv) +{ + struct gup_test gup = { 0 }; + int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; + int flags = MAP_PRIVATE, touch = 0; + char *file = "/dev/zero"; + pthread_t *tid; + char *p; + + while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { + switch (opt) { + case 'a': + cmd = PIN_FAST_BENCHMARK; + break; + case 'b': + cmd = PIN_BASIC_TEST; + break; + case 'L': + cmd = PIN_LONGTERM_BENCHMARK; + break; + case 'c': + cmd = DUMP_USER_PAGES_TEST; + /* + * Dump page 0 (index 1). May be overridden later, by + * user's non-option arguments. + * + * .which_pages is zero-based, so that zero can mean "do + * nothing". + */ + gup.which_pages[0] = 1; + break; + case 'p': + /* works only with DUMP_USER_PAGES_TEST */ + gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; + break; + case 'F': + /* strtol, so you can pass flags in hex form */ + gup.gup_flags = strtol(optarg, 0, 0); + break; + case 'j': + nthreads = atoi(optarg); + break; + case 'm': + size = atoi(optarg) * MB; + break; + case 'r': + repeats = atoi(optarg); + break; + case 'n': + nr_pages = atoi(optarg); + break; + case 't': + thp = 1; + break; + case 'T': + thp = 0; + break; + case 'U': + cmd = GUP_BASIC_TEST; + break; + case 'u': + cmd = GUP_FAST_BENCHMARK; + break; + case 'w': + write = 1; + break; + case 'W': + write = 0; + break; + case 'f': + file = optarg; + break; + case 'S': + flags &= ~MAP_PRIVATE; + flags |= MAP_SHARED; + break; + case 'H': + flags |= (MAP_HUGETLB | MAP_ANONYMOUS); + break; + case 'z': + /* fault pages in gup, do not fault in userland */ + touch = 1; + break; + default: + ksft_exit_fail_msg("Wrong argument\n"); + } + } + + if (optind < argc) { + int extra_arg_count = 0; + /* + * For example: + * + * ./gup_test -c 0 1 0x1001 + * + * ...to dump pages 0, 1, and 4097 + */ + + while ((optind < argc) && + (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) { + /* + * Do the 1-based indexing here, so that the user can + * use normal 0-based indexing on the command line. + */ + long page_index = strtol(argv[optind], 0, 0) + 1; + + gup.which_pages[extra_arg_count] = page_index; + extra_arg_count++; + optind++; + } + } + + ksft_print_header(); + ksft_set_plan(nthreads); + + filed = open(file, O_RDWR|O_CREAT, 0664); + if (filed < 0) + ksft_exit_fail_msg("Unable to open %s: %s\n", file, strerror(errno)); + + gup.nr_pages_per_call = nr_pages; + if (write) + gup.gup_flags |= FOLL_WRITE; + + gup_fd = open(GUP_TEST_FILE, O_RDWR); + if (gup_fd == -1) { + switch (errno) { + case EACCES: + if (getuid()) + ksft_print_msg("Please run this test as root\n"); + break; + case ENOENT: + if (opendir("/sys/kernel/debug") == NULL) + ksft_print_msg("mount debugfs at /sys/kernel/debug\n"); + ksft_print_msg("check if CONFIG_GUP_TEST is enabled in kernel config\n"); + break; + default: + ksft_print_msg("failed to open %s: %s\n", GUP_TEST_FILE, strerror(errno)); + break; + } + ksft_test_result_skip("Please run this test as root\n"); + ksft_exit_pass(); + } + + p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); + if (p == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); + gup.addr = (unsigned long)p; + + if (thp == 1) + madvise(p, size, MADV_HUGEPAGE); + else if (thp == 0) + madvise(p, size, MADV_NOHUGEPAGE); + + /* + * FOLL_TOUCH, in gup_test, is used as an either/or case: either + * fault pages in from the kernel via FOLL_TOUCH, or fault them + * in here, from user space. This allows comparison of performance + * between those two cases. + */ + if (touch) { + gup.gup_flags |= FOLL_TOUCH; + } else { + for (; (unsigned long)p < gup.addr + size; p += psize()) + p[0] = 0; + } + + tid = malloc(sizeof(pthread_t) * nthreads); + assert(tid); + for (i = 0; i < nthreads; i++) { + ret = pthread_create(&tid[i], NULL, gup_thread, &gup); + assert(ret == 0); + } + for (i = 0; i < nthreads; i++) { + ret = pthread_join(tid[i], NULL); + assert(ret == 0); + } + + free(tid); + + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c new file mode 100644 index 000000000000..d2cfc9b494a0 --- /dev/null +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -0,0 +1,2059 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HMM stands for Heterogeneous Memory Management, it is a helper layer inside + * the linux kernel to help device drivers mirror a process address space in + * the device. This allows the device to use the same address space which + * makes communication and data exchange a lot easier. + * + * This framework's sole purpose is to exercise various code paths inside + * the kernel to make sure that HMM performs as expected and to flush out any + * bugs. + */ + +#include "../kselftest_harness.h" + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <strings.h> +#include <time.h> +#include <pthread.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/ioctl.h> + + +/* + * This is a private UAPI to the kernel test module so it isn't exported + * in the usual include/uapi/... directory. + */ +#include <lib/test_hmm_uapi.h> +#include <mm/gup_test.h> + +struct hmm_buffer { + void *ptr; + void *mirror; + unsigned long size; + int fd; + uint64_t cpages; + uint64_t faults; +}; + +enum { + HMM_PRIVATE_DEVICE_ONE, + HMM_PRIVATE_DEVICE_TWO, + HMM_COHERENCE_DEVICE_ONE, + HMM_COHERENCE_DEVICE_TWO, +}; + +#define TWOMEG (1 << 21) +#define HMM_BUFFER_SIZE (1024 << 12) +#define HMM_PATH_MAX 64 +#define NTIMES 10 + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +/* Just the flags we need, copied from mm.h: */ + +#ifndef FOLL_WRITE +#define FOLL_WRITE 0x01 /* check pte is writable */ +#endif + +#ifndef FOLL_LONGTERM +#define FOLL_LONGTERM 0x100 /* mapping lifetime is indefinite */ +#endif +FIXTURE(hmm) +{ + int fd; + unsigned int page_size; + unsigned int page_shift; +}; + +FIXTURE_VARIANT(hmm) +{ + int device_number; +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_private) +{ + .device_number = HMM_PRIVATE_DEVICE_ONE, +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) +{ + .device_number = HMM_COHERENCE_DEVICE_ONE, +}; + +FIXTURE(hmm2) +{ + int fd0; + int fd1; + unsigned int page_size; + unsigned int page_shift; +}; + +FIXTURE_VARIANT(hmm2) +{ + int device_number0; + int device_number1; +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) +{ + .device_number0 = HMM_PRIVATE_DEVICE_ONE, + .device_number1 = HMM_PRIVATE_DEVICE_TWO, +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) +{ + .device_number0 = HMM_COHERENCE_DEVICE_ONE, + .device_number1 = HMM_COHERENCE_DEVICE_TWO, +}; + +static int hmm_open(int unit) +{ + char pathname[HMM_PATH_MAX]; + int fd; + + snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit); + fd = open(pathname, O_RDWR, 0); + if (fd < 0) + fprintf(stderr, "could not open hmm dmirror driver (%s)\n", + pathname); + return fd; +} + +static bool hmm_is_coherent_type(int dev_num) +{ + return (dev_num >= HMM_COHERENCE_DEVICE_ONE); +} + +FIXTURE_SETUP(hmm) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd = hmm_open(variant->device_number); + if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) + SKIP(return, "DEVICE_COHERENT not available"); + ASSERT_GE(self->fd, 0); +} + +FIXTURE_SETUP(hmm2) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd0 = hmm_open(variant->device_number0); + if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) + SKIP(return, "DEVICE_COHERENT not available"); + ASSERT_GE(self->fd0, 0); + self->fd1 = hmm_open(variant->device_number1); + ASSERT_GE(self->fd1, 0); +} + +FIXTURE_TEARDOWN(hmm) +{ + int ret = close(self->fd); + + ASSERT_EQ(ret, 0); + self->fd = -1; +} + +FIXTURE_TEARDOWN(hmm2) +{ + int ret = close(self->fd0); + + ASSERT_EQ(ret, 0); + self->fd0 = -1; + + ret = close(self->fd1); + ASSERT_EQ(ret, 0); + self->fd1 = -1; +} + +static int hmm_dmirror_cmd(int fd, + unsigned long request, + struct hmm_buffer *buffer, + unsigned long npages) +{ + struct hmm_dmirror_cmd cmd; + int ret; + + /* Simulate a device reading system memory. */ + cmd.addr = (__u64)buffer->ptr; + cmd.ptr = (__u64)buffer->mirror; + cmd.npages = npages; + + for (;;) { + ret = ioctl(fd, request, &cmd); + if (ret == 0) + break; + if (errno == EINTR) + continue; + return -errno; + } + buffer->cpages = cmd.cpages; + buffer->faults = cmd.faults; + + return 0; +} + +static void hmm_buffer_free(struct hmm_buffer *buffer) +{ + if (buffer == NULL) + return; + + if (buffer->ptr) + munmap(buffer->ptr, buffer->size); + free(buffer->mirror); + free(buffer); +} + +/* + * Create a temporary file that will be deleted on close. + */ +static int hmm_create_file(unsigned long size) +{ + char path[HMM_PATH_MAX]; + int fd; + + strcpy(path, "/tmp"); + fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); + if (fd >= 0) { + int r; + + do { + r = ftruncate(fd, size); + } while (r == -1 && errno == EINTR); + if (!r) + return fd; + close(fd); + } + return -1; +} + +/* + * Return a random unsigned number. + */ +static unsigned int hmm_random(void) +{ + static int fd = -1; + unsigned int r; + + if (fd < 0) { + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + fprintf(stderr, "%s:%d failed to open /dev/urandom\n", + __FILE__, __LINE__); + return ~0U; + } + } + read(fd, &r, sizeof(r)); + return r; +} + +static void hmm_nanosleep(unsigned int n) +{ + struct timespec t; + + t.tv_sec = 0; + t.tv_nsec = n; + nanosleep(&t, NULL); +} + +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + +/* + * Simple NULL test of device open/close. + */ +TEST_F(hmm, open_close) +{ +} + +/* + * Read private anonymous memory. + */ +TEST_F(hmm, anon_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int val; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* + * Initialize buffer in system memory but leave the first two pages + * zero (pte_none and pfn_zero). + */ + i = 2 * self->page_size / sizeof(*ptr); + for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Set buffer permission to read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Populate the CPU page table with a special zero page. */ + val = *(int *)(buffer->ptr + self->page_size); + ASSERT_EQ(val, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + ptr = buffer->mirror; + for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + for (; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Read private anonymous memory which has been protected with + * mprotect() PROT_NONE. + */ +TEST_F(hmm, anon_read_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize mirror buffer so we can verify it isn't written. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + /* Protect buffer from reading. */ + ret = mprotect(buffer->ptr, size, PROT_NONE); + ASSERT_EQ(ret, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, -EFAULT); + + /* Allow CPU to read the buffer so we can check it. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory. + */ +TEST_F(hmm, anon_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory which has been protected with + * mprotect() PROT_READ. + */ +TEST_F(hmm, anon_write_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading a zero page of memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + ASSERT_EQ(buffer->faults, 1); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Now allow writing and see that the zero page is replaced. */ + ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Check that a device writing an anonymous private mapping + * will copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Check that a device writing an anonymous shared mapping + * will not copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Write private anonymous huge page. + */ +TEST_F(hmm, anon_write_huge) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = 2 * TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + size = TWOMEG; + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). + */ +static long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[2048]; + int len; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + +/* + * Write huge TLBFS page. + */ +TEST_F(hmm, anon_write_hugetlbfs) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + unsigned long i; + int *ptr; + int ret; + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + SKIP(return, "Huge page could not be allocated"); + } + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Read mmap'ed file memory. + */ +TEST_F(hmm, file_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Write initial contents of the file. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + len = pwrite(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + memset(buffer->mirror, 0, size); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write mmap'ed file memory. + */ +TEST_F(hmm, file_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check that the device also wrote the file. */ + len = pread(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory. + */ +TEST_F(hmm, migrate) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory and fault some of it back + * to system memory, then try migrating the resulting mix of system and device + * private memory to the device. + */ +TEST_F(hmm, migrate_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault half the pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate memory to the device again. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous shared memory to device private memory. + */ +TEST_F(hmm, migrate_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, -ENOENT); + + hmm_buffer_free(buffer); +} + +/* + * Try to migrate various memory types to device private memory. + */ +TEST_F(hmm2, migrate_mixed) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + int ret; + int val; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Migrating a protected area should be an error. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); + ASSERT_EQ(ret, -EINVAL); + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* We expect an error if the vma doesn't cover the range. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); + ASSERT_EQ(ret, -EINVAL); + + /* Page 2 will be a read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-5 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + ptr = (int *)(buffer->ptr + 5 * self->page_size); + *ptr = val; + + /* Now try to migrate pages 2-5 to device 1. */ + buffer->ptr = p + 2 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 4); + + /* Page 5 won't be migrated to device 0 because it's on device 1. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); + ASSERT_EQ(ret, -ENOENT); + buffer->ptr = p; + + buffer->ptr = p; + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device memory and back to system memory + * multiple times. In case of private zone configuration, this is done + * through fault pages accessed by CPU. In case of coherent zone configuration, + * the pages from the device should be explicitly migrated back to system memory. + * The reason is Coherent device zone has coherent access by CPU, therefore + * it will not generate any page fault. + */ +TEST_F(hmm, migrate_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate back to system memory and check them. */ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); + } +} + +/* + * Read anonymous memory multiple times. + */ +TEST_F(hmm, anon_read_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i + c); + + hmm_buffer_free(buffer); + } +} + +void *unmap_buffer(void *p) +{ + struct hmm_buffer *buffer = p; + + /* Delay for a bit and then unmap buffer while it is being read. */ + hmm_nanosleep(hmm_random() % 32000); + munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); + buffer->ptr = NULL; + + return NULL; +} + +/* + * Try reading anonymous memory while it is being unmapped. + */ +TEST_F(hmm, anon_teardown) +{ + unsigned long npages; + unsigned long size; + unsigned long c; + void *ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; ++c) { + pthread_t thread; + struct hmm_buffer *buffer; + unsigned long i; + int *ptr; + int rc; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + rc = pthread_create(&thread, NULL, unmap_buffer, buffer); + ASSERT_EQ(rc, 0); + + /* Simulate a device reading system memory. */ + rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + if (rc == 0) { + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); + ++i) + ASSERT_EQ(ptr[i], i + c); + } + + pthread_join(thread, &ret); + hmm_buffer_free(buffer); + } +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm, mixedmap) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned char *m; + int ret; + + npages = 1; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + self->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); + + hmm_buffer_free(buffer); +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm2, snapshot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + unsigned char *m; + int ret; + int val; + + npages = 7; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* Page 2 will be read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. */ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-6 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + + /* Page 5 will be migrated to device 0. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Page 6 will be migrated to device 1. */ + buffer->ptr = p + 6 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Simulate a device snapshotting CPU pagetables. */ + buffer->ptr = p; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } + + hmm_buffer_free(buffer); +} + +/* + * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that + * should be mapped by a large page table entry. + */ +TEST_F(hmm, compound) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + int *ptr; + unsigned char *m; + int ret; + unsigned long i; + + /* Skip test if we can't allocate a hugetlbfs page. */ + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + return; + } + + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize the pages the device will snapshot in buffer->ptr. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | + HMM_DMIRROR_PROT_PMD); + + /* Make the region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | + HMM_DMIRROR_PROT_PMD); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Test two devices reading the same memory (double mapped). + */ +TEST_F(hmm2, double_map) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Make region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate device 0 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Simulate device 1 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate pages to device 1 and try to read from device 0. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what device 0 read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Basic check of exclusive faulting. + */ +TEST_F(hmm, exclusive) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + /* Check atomic access revoked */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, exclusive_mprotect) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + hmm_buffer_free(buffer); +} + +/* + * Check copy-on-write works. + */ +TEST_F(hmm, exclusive_cow) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + fork(); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i+1); + + hmm_buffer_free(buffer); +} + +static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, + int npages, int size, int flags) +{ + struct gup_test gup = { + .nr_pages_per_call = npages, + .addr = addr, + .gup_flags = FOLL_WRITE | flags, + .size = size, + }; + + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl on error\n"); + return errno; + } + + return 0; +} + +/* + * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. + * This should trigger a migration back to system memory for both, private + * and coherent type pages. + * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added + * to your configuration before you run it. + */ +TEST_F(hmm, hmm_gup_test) +{ + struct hmm_buffer *buffer; + int gup_fd; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) + SKIP(return, "Skipping test, could not find gup_test driver"); + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr, + GUP_BASIC_TEST, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 1 * self->page_size, + GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 2 * self->page_size, + PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 3 * self->page_size, + PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + if (hmm_is_coherent_type(variant->device_number)) { + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]); + } else { + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]); + } + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]); + /* + * Check again the content on the pages. Make sure there's no + * corrupted data. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + close(gup_fd); + hmm_buffer_free(buffer); +} + +/* + * Test copy-on-write in device pages. + * In case of writing to COW private page(s), a page fault will migrate pages + * back to system memory first. Then, these pages will be duplicated. In case + * of COW device coherent type, pages are duplicated directly from device + * memory. + */ +TEST_F(hmm, hmm_cow_in_device) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + pid_t pid; + int status; + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (!pid) { + /* Child process waitd for SIGTERM from the parent. */ + while (1) { + } + perror("Should not reach this\n"); + exit(0); + } + /* Parent process writes to COW pages(s) and gets a + * new copy in system. In case of device private pages, + * this write causes a migration to system mem first. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Terminate child and wait */ + EXPECT_EQ(0, kill(pid, SIGTERM)); + EXPECT_EQ(pid, waitpid(pid, &status, 0)); + EXPECT_NE(0, WIFSIGNALED(status)); + EXPECT_EQ(SIGTERM, WTERMSIG(status)); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + for (i = 0; i < npages; i++) + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); + + hmm_buffer_free(buffer); +} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c index 93f9e7b81331..267eea2e0e0b 100644 --- a/tools/testing/selftests/vm/hugepage-mmap.c +++ b/tools/testing/selftests/mm/hugepage-mmap.c @@ -16,14 +16,14 @@ * range. * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. */ - +#define _GNU_SOURCE #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <sys/mman.h> #include <fcntl.h> +#include "../kselftest.h" -#define FILE_NAME "huge/hugepagefile" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) @@ -38,7 +38,7 @@ static void check_bytes(char *addr) { - printf("First hex is %x\n", *((unsigned int *)addr)); + ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); } static void write_bytes(char *addr) @@ -56,7 +56,7 @@ static int read_bytes(char *addr) check_bytes(addr); for (i = 0; i < LENGTH; i++) if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); + ksft_print_msg("Error: Mismatch at %lu\n", i); return 1; } return 0; @@ -67,27 +67,28 @@ int main(void) void *addr; int fd, ret; - fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); - if (fd < 0) { - perror("Open failed"); - exit(1); - } + ksft_print_header(); + ksft_set_plan(1); + + fd = memfd_create("hugepage-mmap", MFD_HUGETLB); + if (fd < 0) + ksft_exit_fail_msg("memfd_create() failed: %s\n", strerror(errno)); addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); if (addr == MAP_FAILED) { - perror("mmap"); - unlink(FILE_NAME); - exit(1); + close(fd); + ksft_exit_fail_msg("mmap(): %s\n", strerror(errno)); } - printf("Returned address is %p\n", addr); + ksft_print_msg("Returned address is %p\n", addr); check_bytes(addr); write_bytes(addr); ret = read_bytes(addr); munmap(addr, LENGTH); close(fd); - unlink(FILE_NAME); - return ret; + ksft_test_result(!ret, "Read same data\n"); + + ksft_exit(!ret); } diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c new file mode 100644 index 000000000000..c463d1c09c9b --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-mremap: + * + * Example of remapping huge page memory in a user application using the + * mremap system call. The path to a file in a hugetlbfs filesystem must + * be passed as the last argument to this test. The amount of memory used + * by this test in MBs can optionally be passed as an argument. If no memory + * amount is passed, the default amount is 10MB. + * + * To make sure the test triggers pmd sharing and goes through the 'unshare' + * path in the mremap code use 1GB (1024) or more. + */ + +#define _GNU_SOURCE +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <errno.h> +#include <fcntl.h> /* Definition of O_* constants */ +#include <sys/syscall.h> /* Definition of SYS_* constants */ +#include <linux/userfaultfd.h> +#include <sys/ioctl.h> +#include <string.h> +#include <stdbool.h> +#include "../kselftest.h" +#include "vm_util.h" + +#define DEFAULT_LENGTH_MB 10UL +#define MB_TO_BYTES(x) (x * 1024 * 1024) + +#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) +#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) + +static void check_bytes(char *addr) +{ + ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t len) +{ + unsigned long i; + + for (i = 0; i < len; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr, size_t len) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < len; i++) + if (*(addr + i) != (char)i) { + ksft_print_msg("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +static void register_region_with_uffd(char *addr, size_t len) +{ + long uffd; /* userfaultfd file descriptor */ + struct uffdio_api uffdio_api; + + /* Create and enable userfaultfd object. */ + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) + ksft_exit_fail_msg("userfaultfd: %s\n", strerror(errno)); + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) + ksft_exit_fail_msg("ioctl-UFFDIO_API: %s\n", strerror(errno)); + + /* Create a private anonymous mapping. The memory will be + * demand-zero paged--that is, not yet allocated. When we + * actually touch the memory, it will be allocated via + * the userfaultfd. + */ + + addr = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); + + ksft_print_msg("Address returned by mmap() = %p\n", addr); + + /* Register the memory range of the mapping we just created for + * handling by the userfaultfd object. In mode, we request to track + * missing pages (i.e., pages that have not yet been faulted in). + */ + if (uffd_register(uffd, addr, len, true, false, false)) + ksft_exit_fail_msg("ioctl-UFFDIO_REGISTER: %s\n", strerror(errno)); +} + +int main(int argc, char *argv[]) +{ + size_t length = 0; + int ret = 0, fd; + + ksft_print_header(); + ksft_set_plan(1); + + if (argc >= 2 && !strcmp(argv[1], "-h")) + ksft_exit_fail_msg("Usage: %s [length_in_MB]\n", argv[0]); + + /* Read memory length as the first arg if valid, otherwise fallback to + * the default length. + */ + if (argc >= 2) + length = (size_t)atoi(argv[1]); + else + length = DEFAULT_LENGTH_MB; + + length = MB_TO_BYTES(length); + fd = memfd_create(argv[0], MFD_HUGETLB); + if (fd < 0) + ksft_exit_fail_msg("Open failed: %s\n", strerror(errno)); + + /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ + unsigned long suggested_addr = 0x7eaa40000000; + void *haddr = mmap((void *)suggested_addr, length, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + ksft_print_msg("Map haddr: Returned address is %p\n", haddr); + if (haddr == MAP_FAILED) + ksft_exit_fail_msg("mmap1: %s\n", strerror(errno)); + + /* mmap again to a dummy address to hopefully trigger pmd sharing. */ + suggested_addr = 0x7daa40000000; + void *daddr = mmap((void *)suggested_addr, length, PROTECTION, + MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + ksft_print_msg("Map daddr: Returned address is %p\n", daddr); + if (daddr == MAP_FAILED) + ksft_exit_fail_msg("mmap3: %s\n", strerror(errno)); + + suggested_addr = 0x7faa40000000; + void *vaddr = + mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); + ksft_print_msg("Map vaddr: Returned address is %p\n", vaddr); + if (vaddr == MAP_FAILED) + ksft_exit_fail_msg("mmap2: %s\n", strerror(errno)); + + register_region_with_uffd(haddr, length); + + void *addr = mremap(haddr, length, length, + MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mremap: %s\n", strerror(errno)); + + ksft_print_msg("Mremap: Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); + + munmap(addr, length); + + addr = mremap(addr, length, length, 0); + if (addr != MAP_FAILED) + ksft_exit_fail_msg("mremap: Expected failure, but call succeeded\n"); + + close(fd); + + ksft_test_result(!ret, "Read same data\n"); + ksft_exit(!ret); +} diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c index e2527f32005b..478bb1e989e9 100644 --- a/tools/testing/selftests/vm/hugepage-shm.c +++ b/tools/testing/selftests/mm/hugepage-shm.c @@ -35,10 +35,6 @@ #include <sys/shm.h> #include <sys/mman.h> -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - #define LENGTH (256UL*1024*1024) #define dprintf(x) printf(x) diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c new file mode 100644 index 000000000000..894d28c3dd47 --- /dev/null +++ b/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> +#include "vm_util.h" + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static size_t pagesize; +static size_t maplength; + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / pagesize * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < maplength / pagesize; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + pagesize = psize(); + maplength = default_huge_page_size(); + if (!maplength) { + printf("Unable to determine huge page size\n"); + exit(1); + } + + addr = mmap(MAP_ADDR, maplength, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, maplength); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, maplength); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, maplength); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, maplength)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c new file mode 100644 index 000000000000..e74107185324 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugepage-madvise: + * + * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE + * on hugetlb mappings. + * + * Before running this test, make sure the administrator has pre-allocated + * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, + * the test takes an argument that is the path to a file in a hugetlbfs + * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some + * directory. + */ + +#define _GNU_SOURCE +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> +#include "vm_util.h" +#include "../kselftest.h" + +#define MIN_FREE_PAGES 20 +#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ + +#define validate_free_pages(exp_free) \ + do { \ + int fhp = get_free_hugepages(); \ + if (fhp != (exp_free)) { \ + printf("Unexpected number of free huge " \ + "pages line %d\n", __LINE__); \ + exit(1); \ + } \ + } while (0) + +unsigned long huge_page_size; +unsigned long base_page_size; + +void write_fault_pages(void *addr, unsigned long nr_pages) +{ + unsigned long i; + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(addr + (i * huge_page_size))) = i; +} + +void read_fault_pages(void *addr, unsigned long nr_pages) +{ + volatile unsigned long dummy = 0; + unsigned long i; + + for (i = 0; i < nr_pages; i++) { + dummy += *((unsigned long *)(addr + (i * huge_page_size))); + + /* Prevent the compiler from optimizing out the entire loop: */ + asm volatile("" : "+r" (dummy)); + } +} + +int main(int argc, char **argv) +{ + unsigned long free_hugepages; + void *addr, *addr2; + int fd; + int ret; + + huge_page_size = default_huge_page_size(); + if (!huge_page_size) { + printf("Unable to determine huge page size, exiting!\n"); + exit(1); + } + base_page_size = sysconf(_SC_PAGE_SIZE); + if (!huge_page_size) { + printf("Unable to determine base page size, exiting!\n"); + exit(1); + } + + free_hugepages = get_free_hugepages(); + if (free_hugepages < MIN_FREE_PAGES) { + printf("Not enough free huge pages to test, exiting!\n"); + exit(KSFT_SKIP); + } + + fd = memfd_create(argv[0], MFD_HUGETLB); + if (fd < 0) { + perror("memfd_create() failed"); + exit(1); + } + + /* + * Test validity of MADV_DONTNEED addr and length arguments. mmap + * size is NR_HUGE_PAGES + 2. One page at the beginning and end of + * the mapping will be unmapped so we KNOW there is nothing mapped + * there. + */ + addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + if (munmap(addr, huge_page_size) || + munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, + huge_page_size)) { + perror("munmap"); + exit(1); + } + addr = addr + huge_page_size; + + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr before mapping should fail */ + ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid addr line %d\n", + __LINE__); + exit(1); + } + + /* addr + length after mapping should fail */ + ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid length line %d\n", + __LINE__); + exit(1); + } + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test alignment of MADV_DONTNEED addr and length arguments + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr is not huge page size aligned and should fail */ + ret = madvise(addr + base_page_size, + NR_HUGE_PAGES * huge_page_size - base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with unaligned start address %d\n", + __LINE__); + exit(1); + } + + /* addr + length should be aligned down to huge page size */ + if (madvise(addr, + ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, + MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all but last page in mapping */ + validate_free_pages(free_hugepages - 1); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + validate_free_pages(free_hugepages); + + /* + * Test MADV_DONTNEED on anonymous private mapping + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* read should not consume any pages */ + read_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise should free private pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * The fallocate below certainly should free the pages associated + * with the file. However, pages in the private mapping are also + * freed. This is not the 'correct' behavior, but is expected + * because this is how it has worked since the initial hugetlb + * implementation. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on shared mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* write should not consume any pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* + * Test MADV_REMOVE on shared mapping of hugetlb file + * + * madvise is same as hole punch and should free all pages. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_REMOVE on shared and private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* shared write should not consume any additional pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr2 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* private read should not consume any pages */ + read_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of shared mapping should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of private mapping should free private pages */ + if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages again */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * madvise should free both file and private pages although this is + * not correct. private pages should not be freed, but this is + * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); + + close(fd); + return 0; +} diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c new file mode 100644 index 000000000000..ba6cc6f9cabc --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <linux/magic.h> +#include <sys/mman.h> +#include <sys/statfs.h> +#include <errno.h> +#include <stdbool.h> + +#include "../kselftest.h" + +#define PREFIX " ... " +#define ERROR_PREFIX " !!! " + +#define MAX_WRITE_READ_CHUNK_SIZE (getpagesize() * 16) +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) + +enum test_status { + TEST_PASSED = 0, + TEST_FAILED = 1, + TEST_SKIPPED = 2, +}; + +static char *status_to_str(enum test_status status) +{ + switch (status) { + case TEST_PASSED: + return "TEST_PASSED"; + case TEST_FAILED: + return "TEST_FAILED"; + case TEST_SKIPPED: + return "TEST_SKIPPED"; + default: + return "TEST_???"; + } +} + +static int setup_filemap(char *filemap, size_t len, size_t wr_chunk_size) +{ + char iter = 0; + + for (size_t offset = 0; offset < len; + offset += wr_chunk_size) { + iter++; + memset(filemap + offset, iter, wr_chunk_size); + } + + return 0; +} + +static bool verify_chunk(char *buf, size_t len, char val) +{ + size_t i; + + for (i = 0; i < len; ++i) { + if (buf[i] != val) { + printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n", + i, buf[i], val); + return false; + } + } + + return true; +} + +static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, + off_t offset, size_t expected) +{ + char buf[MAX_WRITE_READ_CHUNK_SIZE]; + ssize_t ret_count = 0; + ssize_t total_ret_count = 0; + char val = offset / wr_chunk_size + offset % wr_chunk_size; + + printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset); + printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", + expected); + if (lseek(fd, offset, SEEK_SET) < 0) { + perror(PREFIX ERROR_PREFIX "seek failed"); + return false; + } + + while (offset + total_ret_count < len) { + ret_count = read(fd, buf, wr_chunk_size); + if (ret_count == 0) { + printf(PREFIX PREFIX "read reach end of the file\n"); + break; + } else if (ret_count < 0) { + perror(PREFIX ERROR_PREFIX "read failed"); + break; + } + ++val; + if (!verify_chunk(buf, ret_count, val)) + return false; + + total_ret_count += ret_count; + } + printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", + total_ret_count); + + return total_ret_count == expected; +} + +static bool read_hugepage_filemap(int fd, size_t len, + size_t wr_chunk_size, size_t expected) +{ + char buf[MAX_WRITE_READ_CHUNK_SIZE]; + ssize_t ret_count = 0; + ssize_t total_ret_count = 0; + char val = 0; + + printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", + expected); + while (total_ret_count < len) { + ret_count = read(fd, buf, wr_chunk_size); + if (ret_count == 0) { + printf(PREFIX PREFIX "read reach end of the file\n"); + break; + } else if (ret_count < 0) { + perror(PREFIX ERROR_PREFIX "read failed"); + break; + } + ++val; + if (!verify_chunk(buf, ret_count, val)) + return false; + + total_ret_count += ret_count; + } + printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", + total_ret_count); + + return total_ret_count == expected; +} + +static enum test_status +test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size) +{ + enum test_status status = TEST_SKIPPED; + char *filemap = NULL; + + if (ftruncate(fd, len) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate failed"); + return status; + } + + filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (filemap == MAP_FAILED) { + perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); + goto done; + } + + setup_filemap(filemap, len, wr_chunk_size); + status = TEST_FAILED; + + if (read_hugepage_filemap(fd, len, wr_chunk_size, len)) + status = TEST_PASSED; + + munmap(filemap, len); +done: + if (ftruncate(fd, 0) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + status = TEST_FAILED; + } + + return status; +} + +static enum test_status +test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size, + bool skip_hwpoison_page) +{ + enum test_status status = TEST_SKIPPED; + char *filemap = NULL; + char *hwp_addr = NULL; + const unsigned long pagesize = getpagesize(); + + if (ftruncate(fd, len) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate failed"); + return status; + } + + filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (filemap == MAP_FAILED) { + perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); + goto done; + } + + setup_filemap(filemap, len, wr_chunk_size); + status = TEST_FAILED; + + /* + * Poisoned hugetlb page layout (assume hugepagesize=2MB): + * |<---------------------- 1MB ---------------------->| + * |<---- healthy page ---->|<---- HWPOISON page ----->| + * |<------------------- (1MB - 8KB) ----------------->| + */ + hwp_addr = filemap + len / 2 + pagesize; + if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) { + perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed"); + goto unmap; + } + + if (!skip_hwpoison_page) { + /* + * Userspace should be able to read (1MB + 1 page) from + * the beginning of the HWPOISONed hugepage. + */ + if (read_hugepage_filemap(fd, len, wr_chunk_size, + len / 2 + pagesize)) + status = TEST_PASSED; + } else { + /* + * Userspace should be able to read (1MB - 2 pages) from + * HWPOISONed hugepage. + */ + if (seek_read_hugepage_filemap(fd, len, wr_chunk_size, + len / 2 + MAX(2 * pagesize, wr_chunk_size), + len / 2 - MAX(2 * pagesize, wr_chunk_size))) + status = TEST_PASSED; + } + +unmap: + munmap(filemap, len); +done: + if (ftruncate(fd, 0) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + status = TEST_FAILED; + } + + return status; +} + +static int create_hugetlbfs_file(struct statfs *file_stat) +{ + int fd; + + fd = memfd_create("hugetlb_tmp", MFD_HUGETLB); + if (fd < 0) { + perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file"); + return -1; + } + + memset(file_stat, 0, sizeof(*file_stat)); + if (fstatfs(fd, file_stat)) { + perror(PREFIX ERROR_PREFIX "fstatfs failed"); + goto close; + } + if (file_stat->f_type != HUGETLBFS_MAGIC) { + printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n"); + goto close; + } + + return fd; +close: + close(fd); + return -1; +} + +int main(void) +{ + int fd; + struct statfs file_stat; + enum test_status status; + /* Test read() in different granularity. */ + size_t wr_chunk_sizes[] = { + getpagesize() / 2, getpagesize(), + getpagesize() * 2, getpagesize() * 4 + }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) { + printf("Write/read chunk size=0x%lx\n", + wr_chunk_sizes[i]); + + fd = create_hugetlbfs_file(&file_stat); + if (fd < 0) + goto create_failure; + printf(PREFIX "HugeTLB read regression test...\n"); + status = test_hugetlb_read(fd, file_stat.f_bsize, + wr_chunk_sizes[i]); + printf(PREFIX "HugeTLB read regression test...%s\n", + status_to_str(status)); + close(fd); + if (status == TEST_FAILED) + return -1; + + fd = create_hugetlbfs_file(&file_stat); + if (fd < 0) + goto create_failure; + printf(PREFIX "HugeTLB read HWPOISON test...\n"); + status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, + wr_chunk_sizes[i], false); + printf(PREFIX "HugeTLB read HWPOISON test...%s\n", + status_to_str(status)); + close(fd); + if (status == TEST_FAILED) + return -1; + + fd = create_hugetlbfs_file(&file_stat); + if (fd < 0) + goto create_failure; + printf(PREFIX "HugeTLB seek then read HWPOISON test...\n"); + status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, + wr_chunk_sizes[i], true); + printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n", + status_to_str(status)); + close(fd); + if (status == TEST_FAILED) + return -1; + } + + return 0; + +create_failure: + printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n"); + return -1; +} diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c new file mode 100644 index 000000000000..73b81c632366 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +#include "vm_util.h" +#include "../kselftest.h" + +#define MMAP_SIZE (1 << 21) +#define INLOOP_ITER 100 + +char *huge_ptr; + +/* Touch the memory while it is being madvised() */ +void *touch(void *unused) +{ + char *ptr = (char *)huge_ptr; + + for (int i = 0; i < INLOOP_ITER; i++) + ptr[0] = '.'; + + return NULL; +} + +void *madv(void *unused) +{ + usleep(rand() % 10); + + for (int i = 0; i < INLOOP_ITER; i++) + madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + + return NULL; +} + +int main(void) +{ + unsigned long free_hugepages; + pthread_t thread1, thread2; + /* + * On kernel 6.4, we are able to reproduce the problem with ~1000 + * interactions + */ + int max = 10000; + + srand(getpid()); + + free_hugepages = get_free_hugepages(); + if (free_hugepages != 1) { + ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", + free_hugepages); + } + + while (max--) { + huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((unsigned long)huge_ptr == -1) + ksft_exit_skip("Failed to allocated huge page\n"); + + pthread_create(&thread1, NULL, madv, NULL); + pthread_create(&thread2, NULL, touch, NULL); + + pthread_join(thread1, NULL); + pthread_join(thread2, NULL); + munmap(huge_ptr, MMAP_SIZE); + } + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c new file mode 100644 index 000000000000..8f122a0f0828 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case that must run on a system with one and only one huge page available. + * # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * + * During setup, the test allocates the only available page, and starts three threads: + * - thread1: + * * madvise(MADV_DONTNEED) on the allocated huge page + * - thread 2: + * * Write to the allocated huge page + * - thread 3: + * * Try to allocated an extra huge page (which must not available) + * + * The test fails if thread3 is able to allocate a page. + * + * Touching the first page after thread3's allocation will raise a SIGBUS + * + * Author: Breno Leitao <leitao@debian.org> + */ +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <unistd.h> + +#include "vm_util.h" +#include "../kselftest.h" + +#define INLOOP_ITER 100 + +size_t mmap_size; +char *huge_ptr; + +/* Touch the memory while it is being madvised() */ +void *touch(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + huge_ptr[0] = '.'; + + return NULL; +} + +void *madv(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + madvise(huge_ptr, mmap_size, MADV_DONTNEED); + + return NULL; +} + +/* + * We got here, and there must be no huge page available for mapping + * The other hugepage should be flipping from used <-> reserved, because + * of madvise(DONTNEED). + */ +void *map_extra(void *unused) +{ + void *ptr; + + for (int i = 0; i < INLOOP_ITER; i++) { + ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((long)ptr != -1) { + /* Touching the other page now will cause a SIGBUG + * huge_ptr[0] = '1'; + */ + return ptr; + } + } + + return NULL; +} + +int main(void) +{ + pthread_t thread1, thread2, thread3; + unsigned long free_hugepages; + void *ret; + + /* + * On kernel 6.7, we are able to reproduce the problem with ~10 + * interactions + */ + int max = 10; + + free_hugepages = get_free_hugepages(); + + if (free_hugepages != 1) { + ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", + free_hugepages); + } + + mmap_size = default_huge_page_size(); + + while (max--) { + huge_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((unsigned long)huge_ptr == -1) { + ksft_test_result_fail("Failed to allocate huge page\n"); + return KSFT_FAIL; + } + + pthread_create(&thread1, NULL, madv, NULL); + pthread_create(&thread2, NULL, touch, NULL); + pthread_create(&thread3, NULL, map_extra, NULL); + + pthread_join(thread1, NULL); + pthread_join(thread2, NULL); + pthread_join(thread3, &ret); + + if (ret) { + ksft_test_result_fail("Unexpected huge page allocation\n"); + return KSFT_FAIL; + } + + /* Unmap and restart */ + munmap(huge_ptr, mmap_size); + } + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index d11d1febccc3..11f9bbe7dc22 100644..100755 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -1,13 +1,17 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi +nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) usage_file=usage_in_bytes if [[ "$1" == "-cgroup-v2" ]]; then @@ -15,19 +19,24 @@ if [[ "$1" == "-cgroup-v2" ]]; then usage_file=current fi -CGROUP_ROOT='/dev/cgroup/memory' -MNT='/mnt/huge/' -if [[ ! -e $CGROUP_ROOT ]]; then - mkdir -p $CGROUP_ROOT - if [[ $cgroup2 ]]; then +if [[ $cgroup2 ]]; then + CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory mount -t cgroup2 none $CGROUP_ROOT - sleep 1 - echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control - else + do_umount=1 + fi + echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control +else + CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory mount -t cgroup memory,hugetlb $CGROUP_ROOT + do_umount=1 fi fi +MNT='/mnt/huge/' function get_machine_hugepage_size() { hpz=$(grep -i hugepagesize /proc/meminfo) @@ -240,5 +249,9 @@ cleanup echo ALL PASS -umount $CGROUP_ROOT -rm -rf $CGROUP_ROOT +if [[ $do_umount ]]; then + umount $CGROUP_ROOT + rm -rf $CGROUP_ROOT +fi + +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c new file mode 100644 index 000000000000..829320a519e7 --- /dev/null +++ b/tools/testing/selftests/mm/khugepaged.c @@ -0,0 +1,1285 @@ +#define _GNU_SOURCE +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <dirent.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> + +#include <linux/mman.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> + +#include "linux/magic.h" + +#include "vm_util.h" +#include "thp_settings.h" + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; +static int anon_order; + +#define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; + +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); + const char *name; +}; + +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; + +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); + bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; +}; + +static struct file_info finfo; +static bool skip_settings_restore; +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + +static void restore_settings_atexit(void) +{ + if (skip_settings_restore) + return; + + printf("Restore THP and khugepaged settings..."); + thp_restore_settings(); + success("OK"); + + skip_settings_restore = true; +} + +static void restore_settings(int sig) +{ + /* exit() will invoke the restore_settings_atexit handler. */ + exit(sig ? EXIT_FAILURE : exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + if (file_ops && finfo.type == VMA_FILE) + thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path); + thp_save_settings(); + + success("OK"); + + atexit(restore_settings_atexit); + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void get_finfo(const char *dir) +{ + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; + + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory (%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); + } + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (statfs(finfo.dir, &fs)) { + perror("statfs()"); + exit(EXIT_FAILURE); + } + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + /* + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. + */ + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); +} + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected number of hugeepages next. + */ + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(int nr) +{ + void *p; + + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(struct mem_ops *ops) +{ + void *p = ops->setup_area(1); + + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. + */ + printf("Allocate huge page..."); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); + return p; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops __anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = &anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + int ret; + struct thp_settings settings = *thp_current_settings(); + + printf("%s...", msg); + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; + thp_push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (!ops->check_huge(p, expect ? nr_hpages : 0)) + fail("Fail: check_huge()"); + else + success("OK"); + + thp_pop_settings(); +} + +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan completed */ + full_scans = thp_read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (ops->check_huge(p, nr_hpages)) + break; + if (thp_read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); + + return timeout == -1; +} + +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages, ops)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } + + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + + if (ops->check_huge(p, expect ? nr_hpages : 0)) + success("OK"); + else + fail("Fail"); +} + +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + +static bool is_anon(struct mem_ops *ops) +{ + return ops == &__anon_ops; +} + +static void alloc_at_fault(void) +{ + struct thp_settings settings = *thp_current_settings(); + char *p; + + settings.thp_enabled = THP_ALWAYS; + thp_push_settings(&settings); + + p = alloc_mapping(1); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge_anon(p, 1, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + + thp_pop_settings(); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (check_huge_anon(p, 0, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + ops, true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, page_size); + c->collapse("Collapse PTE table with single PTE entry present", p, + 1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct thp_settings settings = *thp_current_settings(); + void *p; + int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1; + + settings.khugepaged.max_ptes_none = max_ptes_none; + thp_push_settings(&settings); + + p = ops->setup_area(1); + + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + ops, !c->enforce_pte_scan_limits); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } +skip: + ops->cleanup_area(p, hpage_pmd_size); + thp_pop_settings(); +} + +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"); + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, + !c->enforce_pte_scan_limits); + validate_memory(p, 0, hpage_pmd_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + } +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, ops, true); + validate_memory(p, 0, page_size); +skip: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int i; + + p = ops->setup_area(1); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of different compound pages", p, 1, + ops, true); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = ops->setup_area(1); + + printf("Allocate small page..."); + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + ops->fault(p, page_size, 2 * page_size); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, ops, true); + + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + ops->fault(p, 0, page_size); + + thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, ops, true); + thp_write_num("khugepaged/max_ptes_shared", + thp_current_settings()->khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, ops, !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, ops, true); + } + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. + */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n"); + fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n"); + fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + fprintf(stderr, "\n\tSupported Options:\n"); + fprintf(stderr, "\t\t-h: This help message.\n"); + fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); + fprintf(stderr, "\t\t Defaults to 0. Use this size for anon allocations.\n"); + exit(1); +} + +static void parse_test_type(int argc, char **argv) +{ + int opt; + char *buf; + const char *token; + + while ((opt = getopt(argc, argv, "s:h")) != -1) { + switch (opt) { + case 's': + anon_order = atoi(optarg); + break; + case 'h': + default: + usage(); + } + } + + argv += optind; + argc -= optind; + + if (argc == 0) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[0]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 2) + usage(); + + get_finfo(argv[1]); +} + +int main(int argc, char **argv) +{ + int hpage_pmd_order; + struct thp_settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_ADVISE, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. + */ + .read_ahead_kb = 0, + }; + + parse_test_type(argc, argv); + + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_pmd_pagesize(); + if (!hpage_pmd_size) { + printf("Reading PMD pagesize failed"); + exit(EXIT_FAILURE); + } + hpage_pmd_nr = hpage_pmd_size / page_size; + hpage_pmd_order = __builtin_ctz(hpage_pmd_nr); + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; + default_settings.hugepages[anon_order].enabled = THP_ALWAYS; + + save_settings(); + thp_push_settings(&default_settings); + + alloc_at_fault(); + +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); + + restore_settings(0); +} diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c new file mode 100644 index 000000000000..37de82da9be7 --- /dev/null +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -0,0 +1,713 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KSM functional tests + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@redhat.com> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/wait.h> +#include <linux/userfaultfd.h> + +#include "../kselftest.h" +#include "vm_util.h" + +#define KiB 1024u +#define MiB (1024 * KiB) +#define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child" + +#define MAP_MERGE_FAIL ((void *)-1) +#define MAP_MERGE_SKIP ((void *)-2) + +enum ksm_merge_mode { + KSM_MERGE_PRCTL, + KSM_MERGE_MADVISE, + KSM_MERGE_NONE, /* PRCTL already set */ +}; + +static int mem_fd; +static int ksm_fd; +static int ksm_full_scans_fd; +static int proc_self_ksm_stat_fd; +static int proc_self_ksm_merging_pages_fd; +static int ksm_use_zero_pages_fd; +static int pagemap_fd; +static size_t pagesize; + +static bool range_maps_duplicates(char *addr, unsigned long size) +{ + unsigned long offs_a, offs_b, pfn_a, pfn_b; + + /* + * There is no easy way to check if there are KSM pages mapped into + * this range. We only check that the range does not map the same PFN + * twice by comparing each pair of mapped pages. + */ + for (offs_a = 0; offs_a < size; offs_a += pagesize) { + pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); + /* Page not present or PFN not exposed by the kernel. */ + if (pfn_a == -1ul || !pfn_a) + continue; + + for (offs_b = offs_a + pagesize; offs_b < size; + offs_b += pagesize) { + pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); + if (pfn_b == -1ul || !pfn_b) + continue; + if (pfn_a == pfn_b) + return true; + } + } + return false; +} + +static long get_my_ksm_zero_pages(void) +{ + char buf[200]; + char *substr_ksm_zero; + size_t value_pos; + ssize_t read_size; + unsigned long my_ksm_zero_pages; + + if (!proc_self_ksm_stat_fd) + return 0; + + read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0); + if (read_size < 0) + return -errno; + + buf[read_size] = 0; + + substr_ksm_zero = strstr(buf, "ksm_zero_pages"); + if (!substr_ksm_zero) + return 0; + + value_pos = strcspn(substr_ksm_zero, "0123456789"); + my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10); + + return my_ksm_zero_pages; +} + +static long get_my_merging_pages(void) +{ + char buf[10]; + ssize_t ret; + + if (proc_self_ksm_merging_pages_fd < 0) + return proc_self_ksm_merging_pages_fd; + + ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +static long ksm_get_full_scans(void) +{ + char buf[10]; + ssize_t ret; + + ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +static int ksm_merge(void) +{ + long start_scans, end_scans; + + /* Wait for two full scans such that any possible merging happened. */ + start_scans = ksm_get_full_scans(); + if (start_scans < 0) + return start_scans; + if (write(ksm_fd, "1", 1) != 1) + return -errno; + do { + end_scans = ksm_get_full_scans(); + if (end_scans < 0) + return end_scans; + } while (end_scans < start_scans + 2); + + return 0; +} + +static int ksm_unmerge(void) +{ + if (write(ksm_fd, "2", 1) != 1) + return -errno; + return 0; +} + +static char *__mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) +{ + char *map; + char *err_map = MAP_MERGE_FAIL; + int ret; + + /* Stabilize accounting by disabling KSM completely. */ + if (ksm_unmerge()) { + ksft_print_msg("Disabling (unmerging) KSM failed\n"); + return err_map; + } + + if (get_my_merging_pages() > 0) { + ksft_print_msg("Still pages merged\n"); + return err_map; + } + + map = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (map == MAP_FAILED) { + ksft_print_msg("mmap() failed\n"); + return err_map; + } + + /* Don't use THP. Ignore if THP are not around on a kernel. */ + if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { + ksft_print_msg("MADV_NOHUGEPAGE failed\n"); + goto unmap; + } + + /* Make sure each page contains the same values to merge them. */ + memset(map, val, size); + + if (mprotect(map, size, prot)) { + ksft_print_msg("mprotect() failed\n"); + err_map = MAP_MERGE_SKIP; + goto unmap; + } + + switch (mode) { + case KSM_MERGE_PRCTL: + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) { + ksft_print_msg("PR_SET_MEMORY_MERGE not supported\n"); + err_map = MAP_MERGE_SKIP; + goto unmap; + } else if (ret) { + ksft_print_msg("PR_SET_MEMORY_MERGE=1 failed\n"); + goto unmap; + } + break; + case KSM_MERGE_MADVISE: + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_print_msg("MADV_MERGEABLE failed\n"); + goto unmap; + } + break; + case KSM_MERGE_NONE: + break; + } + + /* Run KSM to trigger merging and wait. */ + if (ksm_merge()) { + ksft_print_msg("Running KSM failed\n"); + goto unmap; + } + + /* + * Check if anything was merged at all. Ignore the zero page that is + * accounted differently (depending on kernel support). + */ + if (val && !get_my_merging_pages()) { + ksft_print_msg("No pages got merged\n"); + goto unmap; + } + + return map; +unmap: + munmap(map, size); + return err_map; +} + +static char *mmap_and_merge_range(char val, unsigned long size, int prot, + enum ksm_merge_mode mode) +{ + char *map; + char *ret = MAP_FAILED; + + map = __mmap_and_merge_range(val, size, prot, mode); + if (map == MAP_MERGE_FAIL) + ksft_test_result_fail("Merging memory failed"); + else if (map == MAP_MERGE_SKIP) + ksft_test_result_skip("Merging memory skipped"); + else + ret = map; + + return ret; +} + +static void test_unmerge(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); + if (map == MAP_FAILED) + return; + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +static void test_unmerge_zero_pages(void) +{ + const unsigned int size = 2 * MiB; + char *map; + unsigned int offs; + unsigned long pages_expected; + + ksft_print_msg("[RUN] %s\n", __func__); + + if (proc_self_ksm_stat_fd < 0) { + ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n"); + return; + } + if (ksm_use_zero_pages_fd < 0) { + ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { + ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + + /* Let KSM deduplicate zero pages. */ + map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); + if (map == MAP_FAILED) + return; + + /* Check if ksm_zero_pages is updated correctly after KSM merging */ + pages_expected = size / pagesize; + if (pages_expected != get_my_ksm_zero_pages()) { + ksft_test_result_fail("'ksm_zero_pages' updated after merging\n"); + goto unmap; + } + + /* Try to unmerge half of the region */ + if (madvise(map, size / 2, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + /* Check if ksm_zero_pages is updated correctly after unmerging */ + pages_expected /= 2; + if (pages_expected != get_my_ksm_zero_pages()) { + ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n"); + goto unmap; + } + + /* Trigger unmerging of the other half by writing to the pages. */ + for (offs = size / 2; offs < size; offs += pagesize) + *((unsigned int *)&map[offs]) = offs; + + /* Now we should have no zeropages remaining. */ + if (get_my_ksm_zero_pages()) { + ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n"); + goto unmap; + } + + /* Check if ksm zero pages are really unmerged */ + ksft_test_result(!range_maps_duplicates(map, size), + "KSM zero pages were unmerged\n"); +unmap: + munmap(map, size); +} + +static void test_unmerge_discarded(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); + if (map == MAP_FAILED) + return; + + /* Discard half of all mapped pages so we have pte_none() entries. */ + if (madvise(map, size / 2, MADV_DONTNEED)) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto unmap; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +#ifdef __NR_userfaultfd +static void test_unmerge_uffd_wp(void) +{ + struct uffdio_writeprotect uffd_writeprotect; + const unsigned int size = 2 * MiB; + struct uffdio_api uffdio_api; + char *map; + int uffd; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); + if (map == MAP_FAILED) + return; + + /* See if UFFD is around. */ + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + ksft_test_result_skip("__NR_userfaultfd failed\n"); + goto unmap; + } + + /* See if UFFD-WP is around. */ + uffdio_api.api = UFFD_API; + uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { + ksft_test_result_fail("UFFDIO_API failed\n"); + goto close_uffd; + } + if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { + ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n"); + goto close_uffd; + } + + /* Register UFFD-WP, no need for an actual handler. */ + if (uffd_register(uffd, map, size, false, true, false)) { + ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); + goto close_uffd; + } + + /* Write-protect the range using UFFD-WP. */ + uffd_writeprotect.range.start = (unsigned long) map; + uffd_writeprotect.range.len = size; + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n"); + goto close_uffd; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto close_uffd; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +close_uffd: + close(uffd); +unmap: + munmap(map, size); +} +#endif + +/* Verify that KSM can be enabled / queried with prctl. */ +static void test_prctl(void) +{ + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) { + ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + return; + } else if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + return; + } + + ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0); + if (ret < 0) { + ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n"); + return; + } else if (ret != 1) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 not effective\n"); + return; + } + + ret = prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); + if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n"); + return; + } + + ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0); + if (ret < 0) { + ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n"); + return; + } else if (ret != 0) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 not effective\n"); + return; + } + + ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n"); +} + +static int test_child_ksm(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + /* Test if KSM is enabled for the process. */ + if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1) + return -1; + + /* Test if merge could really happen. */ + map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE); + if (map == MAP_MERGE_FAIL) + return -2; + else if (map == MAP_MERGE_SKIP) + return -3; + + munmap(map, size); + return 0; +} + +static void test_child_ksm_err(int status) +{ + if (status == -1) + ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); + else if (status == -2) + ksft_test_result_fail("Merge in child failed\n"); + else if (status == -3) + ksft_test_result_skip("Merge in child skipped\n"); +} + +/* Verify that prctl ksm flag is inherited. */ +static void test_prctl_fork(void) +{ + int ret, status; + pid_t child_pid; + + ksft_print_msg("[RUN] %s\n", __func__); + + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) { + ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + return; + } else if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + return; + } + + child_pid = fork(); + if (!child_pid) { + exit(test_child_ksm()); + } else if (child_pid < 0) { + ksft_test_result_fail("fork() failed\n"); + return; + } + + if (waitpid(child_pid, &status, 0) < 0) { + ksft_test_result_fail("waitpid() failed\n"); + return; + } + + status = WEXITSTATUS(status); + if (status) { + test_child_ksm_err(status); + return; + } + + if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n"); + return; + } + + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); +} + +static void test_prctl_fork_exec(void) +{ + int ret, status; + pid_t child_pid; + + ksft_print_msg("[RUN] %s\n", __func__); + + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) { + ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + return; + } else if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + return; + } + + child_pid = fork(); + if (child_pid == -1) { + ksft_test_result_skip("fork() failed\n"); + return; + } else if (child_pid == 0) { + char *prg_name = "./ksm_functional_tests"; + char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME }; + + execv(prg_name, argv_for_program); + return; + } + + if (waitpid(child_pid, &status, 0) > 0) { + if (WIFEXITED(status)) { + status = WEXITSTATUS(status); + if (status) { + test_child_ksm_err(status); + return; + } + } else { + ksft_test_result_fail("program didn't terminate normally\n"); + return; + } + } else { + ksft_test_result_fail("waitpid() failed\n"); + return; + } + + if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n"); + return; + } + + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); +} + +static void test_prctl_unmerge(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_PRCTL); + if (map == MAP_FAILED) + return; + + if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +static void test_prot_none(void) +{ + const unsigned int size = 2 * MiB; + char *map; + int i; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0x11, size, PROT_NONE, KSM_MERGE_MADVISE); + if (map == MAP_FAILED) + goto unmap; + + /* Store a unique value in each page on one half using ptrace */ + for (i = 0; i < size / 2; i += pagesize) { + lseek(mem_fd, (uintptr_t) map + i, SEEK_SET); + if (write(mem_fd, &i, sizeof(i)) != sizeof(i)) { + ksft_test_result_fail("ptrace write failed\n"); + goto unmap; + } + } + + /* Trigger unsharing on the other half. */ + if (madvise(map + size / 2, size / 2, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +int main(int argc, char **argv) +{ + unsigned int tests = 8; + int err; + + if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { + exit(test_child_ksm()); + } + +#ifdef __NR_userfaultfd + tests++; +#endif + + ksft_print_header(); + ksft_set_plan(tests); + + pagesize = getpagesize(); + + mem_fd = open("/proc/self/mem", O_RDWR); + if (mem_fd < 0) + ksft_exit_fail_msg("opening /proc/self/mem failed\n"); + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); + proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", + O_RDONLY); + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + + test_unmerge(); + test_unmerge_zero_pages(); + test_unmerge_discarded(); +#ifdef __NR_userfaultfd + test_unmerge_uffd_wp(); +#endif + + test_prot_none(); + + test_prctl(); + test_prctl_fork(); + test_prctl_fork_exec(); + test_prctl_unmerge(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c new file mode 100644 index 000000000000..b748c48908d9 --- /dev/null +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -0,0 +1,948 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <stdbool.h> +#include <time.h> +#include <string.h> +#include <numa.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdint.h> +#include <err.h> + +#include "../kselftest.h" +#include <include/vdso/time64.h> +#include "vm_util.h" + +#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" +#define KSM_FP(s) (KSM_SYSFS_PATH s) +#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 +#define KSM_PAGE_COUNT_DEFAULT 10l +#define KSM_PROT_STR_DEFAULT "rw" +#define KSM_USE_ZERO_PAGES_DEFAULT false +#define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define KSM_MERGE_TYPE_DEFAULT 0 +#define MB (1ul << 20) + +struct ksm_sysfs { + unsigned long max_page_sharing; + unsigned long merge_across_nodes; + unsigned long pages_to_scan; + unsigned long run; + unsigned long sleep_millisecs; + unsigned long stable_node_chains_prune_millisecs; + unsigned long use_zero_pages; +}; + +enum ksm_merge_type { + KSM_MERGE_MADVISE, + KSM_MERGE_PRCTL, + KSM_MERGE_LAST = KSM_MERGE_PRCTL +}; + +enum ksm_test_name { + CHECK_KSM_MERGE, + CHECK_KSM_UNMERGE, + CHECK_KSM_GET_MERGE_TYPE, + CHECK_KSM_ZERO_PAGE_MERGE, + CHECK_KSM_NUMA_MERGE, + KSM_MERGE_TIME, + KSM_MERGE_TIME_HUGE_PAGES, + KSM_UNMERGE_TIME, + KSM_COW_TIME +}; + +int debug; + +static int ksm_write_sysfs(const char *file_path, unsigned long val) +{ + FILE *f = fopen(file_path, "w"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fprintf(f, "%lu", val) < 0) { + perror("fprintf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static int ksm_read_sysfs(const char *file_path, unsigned long *val) +{ + FILE *f = fopen(file_path, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fscanf(f, "%lu", val) != 1) { + perror("fscanf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static void ksm_print_sysfs(void) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + unsigned long full_scans, pages_unshared, pages_volatile; + unsigned long stable_node_chains, stable_node_dups; + long general_profit; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing) || + ksm_read_sysfs(KSM_FP("full_scans"), &full_scans) || + ksm_read_sysfs(KSM_FP("pages_unshared"), &pages_unshared) || + ksm_read_sysfs(KSM_FP("pages_volatile"), &pages_volatile) || + ksm_read_sysfs(KSM_FP("stable_node_chains"), &stable_node_chains) || + ksm_read_sysfs(KSM_FP("stable_node_dups"), &stable_node_dups) || + ksm_read_sysfs(KSM_FP("general_profit"), (unsigned long *)&general_profit)) + return; + + printf("pages_shared : %lu\n", pages_shared); + printf("pages_sharing : %lu\n", pages_sharing); + printf("max_page_sharing : %lu\n", max_page_sharing); + printf("full_scans : %lu\n", full_scans); + printf("pages_unshared : %lu\n", pages_unshared); + printf("pages_volatile : %lu\n", pages_volatile); + printf("stable_node_chains: %lu\n", stable_node_chains); + printf("stable_node_dups : %lu\n", stable_node_dups); + printf("general_profit : %ld\n", general_profit); +} + +static void ksm_print_procfs(void) +{ + const char *file_name = "/proc/self/ksm_stat"; + char buffer[512]; + FILE *f = fopen(file_name, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_name); + perror("fopen"); + return; + } + + while (fgets(buffer, sizeof(buffer), f)) + printf("%s", buffer); + + fclose(f); +} + +static int str_to_prot(char *prot_str) +{ + int prot = 0; + + if ((strchr(prot_str, 'r')) != NULL) + prot |= PROT_READ; + if ((strchr(prot_str, 'w')) != NULL) + prot |= PROT_WRITE; + if ((strchr(prot_str, 'x')) != NULL) + prot |= PROT_EXEC; + + return prot; +} + +static void print_help(void) +{ + printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n" + "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); + + printf("Supported <test type>:\n" + " -M (page merging)\n" + " -Z (zero pages merging)\n" + " -N (merging of pages in different NUMA nodes)\n" + " -U (page unmerging)\n" + " -P evaluate merging time and speed.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -D evaluate unmerging time and speed when disabling KSM.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -C evaluate the time required to break COW of merged pages.\n\n"); + + printf(" -a: specify the access protections of pages.\n" + " <prot> must be of the form [rwx].\n" + " Default: %s\n", KSM_PROT_STR_DEFAULT); + printf(" -p: specify the number of pages to test.\n" + " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); + printf(" -l: limit the maximum running time (in seconds) for a test.\n" + " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + printf(" -z: change use_zero_pages tunable\n" + " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); + printf(" -m: change merge_across_nodes tunable\n" + " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -d: turn debugging output on\n"); + printf(" -s: the size of duplicated memory area (in MiB)\n"); + printf(" -t: KSM merge type\n" + " Default: 0\n" + " 0: madvise merging\n" + " 1: prctl merging\n"); + + exit(0); +} + +static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) +{ + void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); + + if (!map_ptr) { + perror("mmap"); + return NULL; + } + memset(map_ptr, data, map_size); + if (mprotect(map_ptr, map_size, prot)) { + perror("mprotect"); + munmap(map_ptr, map_size); + return NULL; + } + + return map_ptr; +} + +static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) +{ + struct timespec cur_time; + unsigned long cur_scan, init_scan; + + if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) + return 1; + cur_scan = init_scan; + + while (cur_scan < init_scan + scan_count) { + if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) + return 1; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { + perror("clock_gettime"); + return 1; + } + if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { + printf("Scan time limit exceeded\n"); + return 1; + } + } + + return 0; +} + +static int ksm_merge_pages(int merge_type, void *addr, size_t size, + struct timespec start_time, int timeout) +{ + if (merge_type == KSM_MERGE_MADVISE) { + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + } else if (merge_type == KSM_MERGE_PRCTL) { + if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0)) { + perror("prctl"); + return 1; + } + } + + if (ksm_write_sysfs(KSM_FP("run"), 1)) + return 1; + + /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ + if (ksm_do_scan(2, start_time, timeout)) + return 1; + + return 0; +} + +static int ksm_unmerge_pages(void *addr, size_t size, + struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_UNMERGEABLE)) { + perror("madvise"); + return 1; + } + return 0; +} + +static bool assert_ksm_pages_count(long dupl_page_count) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) + return false; + + if (debug) { + ksm_print_sysfs(); + ksm_print_procfs(); + } + + /* + * Since there must be at least 2 pages for merging and 1 page can be + * shared with the limited number of pages (max_page_sharing), sometimes + * there are 'leftover' pages that cannot be merged. For example, if there + * are 11 pages and max_page_sharing = 10, then only 10 pages will be + * merged and the 11th page won't be affected. As a result, when the number + * of duplicate pages is divided by max_page_sharing and the remainder is 1, + * pages_shared and pages_sharing values will be equal between dupl_page_count + * and dupl_page_count - 1. + */ + if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { + if (pages_shared == dupl_page_count / max_page_sharing && + pages_sharing == pages_shared * (max_page_sharing - 1)) + return true; + } else { + if (pages_shared == (dupl_page_count / max_page_sharing + 1) && + pages_sharing == dupl_page_count - pages_shared) + return true; + } + + return false; +} + +static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || + numa_available() ? 0 : + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || + ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || + ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || + ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + &ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int ksm_restore(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || + numa_available() ? 0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || + ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || + ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int check_ksm_merge(int merge_type, int mapping, int prot, + long page_count, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* verify that the right number of pages are merged */ + if (assert_ksm_pages_count(page_count)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + if (merge_type == KSM_MERGE_PRCTL) + prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_unmerge(int merge_type, int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ + memset(map_ptr, '-', 1); + memset(map_ptr + page_size, '+', 1); + + /* get at least 1 scan, so KSM can detect that the pages were modified */ + if (ksm_do_scan(1, start_time, timeout)) + goto err_out; + + /* check that unmerging was successful and 0 pages are currently merged */ + if (assert_ksm_pages_count(0)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_zero_page_merge(int merge_type, int mapping, int prot, long page_count, + int timeout, bool use_zero_pages, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) + return KSFT_FAIL; + + /* fill pages with zero and try to merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if use_zero_pages is set to 1, empty pages are merged + * with the kernel zero page instead of with each other; + * 2) if use_zero_pages is set to 0, empty pages are not treated specially + * and merged as usual. + */ + if (use_zero_pages && !assert_ksm_pages_count(0)) + goto err_out; + else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) + goto err_out; + + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int get_next_mem_node(int node) +{ + + long node_size; + int mem_node = 0; + int i, max_node = numa_max_node(); + + for (i = node + 1; i <= max_node + node; i++) { + mem_node = i % (max_node + 1); + node_size = numa_node_size(mem_node, NULL); + if (node_size > 0) + break; + } + return mem_node; +} + +static int get_first_mem_node(void) +{ + return get_next_mem_node(numa_max_node()); +} + +static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeout, + bool merge_across_nodes, size_t page_size) +{ + void *numa1_map_ptr, *numa2_map_ptr; + struct timespec start_time; + int page_count = 2; + int first_node; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (numa_available() < 0) { + perror("NUMA support not enabled"); + return KSFT_SKIP; + } + if (numa_num_configured_nodes() <= 1) { + printf("At least 2 NUMA nodes must be available\n"); + return KSFT_SKIP; + } + if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) + return KSFT_FAIL; + + /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ + first_node = get_first_mem_node(); + numa1_map_ptr = numa_alloc_onnode(page_size, first_node); + numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); + if (!numa1_map_ptr || !numa2_map_ptr) { + perror("numa_alloc_onnode"); + return KSFT_FAIL; + } + + memset(numa1_map_ptr, '*', page_size); + memset(numa2_map_ptr, '*', page_size); + + /* try to merge the pages */ + if (ksm_merge_pages(merge_type, numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(merge_type, numa2_map_ptr, page_size, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; + * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is + * only 1 unique page in each node and they can't be shared. + */ + if (merge_across_nodes && !assert_ksm_pages_count(page_count)) + goto err_out; + else if (!merge_across_nodes && !assert_ksm_pages_count(0)) + goto err_out; + + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("OK\n"); + return KSFT_PASS; + +err_out: + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("Not OK\n"); + return KSFT_FAIL; +} + +static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, + int timeout, size_t map_size) +{ + void *map_ptr, *map_ptr_orig; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + int pagemap_fd, n_normal_pages, n_huge_pages; + + map_size *= MB; + size_t len = map_size; + + len -= len % HPAGE_SIZE; + map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); + map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; + + if (map_ptr_orig == MAP_FAILED) + err(2, "initial mmap"); + + if (madvise(map_ptr, len, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + n_normal_pages = 0; + n_huge_pages = 0; + for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { + if (allocate_transhuge(p, pagemap_fd) < 0) + n_normal_pages++; + else + n_huge_pages++; + } + printf("Number of normal pages: %d\n", n_normal_pages); + printf("Number of huge pages: %d\n", n_huge_pages); + + memset(map_ptr, '*', len); + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_FAIL; +} + +static int ksm_merge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_unmerge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_cow_time(int merge_type, int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long cow_time_ns; + + /* page_count must be less than 2*page_size */ + size_t page_count = 4000; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + printf("Not merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + /* Create 2000 pairs of duplicate pages */ + for (size_t i = 0; i < page_count - 1; i = i + 2) { + memset(map_ptr + page_size * i, '+', i / 2 + 1); + memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); + } + if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +int main(int argc, char *argv[]) +{ + int ret, opt; + int prot = 0; + int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + int merge_type = KSM_MERGE_TYPE_DEFAULT; + long page_count = KSM_PAGE_COUNT_DEFAULT; + size_t page_size = sysconf(_SC_PAGESIZE); + struct ksm_sysfs ksm_sysfs_old; + int test_name = CHECK_KSM_MERGE; + bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; + bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; + long size_MB = 0; + + while ((opt = getopt(argc, argv, "dha:p:l:z:m:s:t:MUZNPCHD")) != -1) { + switch (opt) { + case 'a': + prot = str_to_prot(optarg); + break; + case 'p': + page_count = atol(optarg); + if (page_count <= 0) { + printf("The number of pages must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'l': + ksm_scan_limit_sec = atoi(optarg); + if (ksm_scan_limit_sec <= 0) { + printf("Timeout value must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'h': + print_help(); + break; + case 'z': + if (strcmp(optarg, "0") == 0) + use_zero_pages = 0; + else + use_zero_pages = 1; + break; + case 'm': + if (strcmp(optarg, "0") == 0) + merge_across_nodes = 0; + else + merge_across_nodes = 1; + break; + case 'd': + debug = 1; + break; + case 's': + size_MB = atoi(optarg); + if (size_MB <= 0) { + printf("Size must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 't': + { + int tmp = atoi(optarg); + + if (tmp < 0 || tmp > KSM_MERGE_LAST) { + printf("Invalid merge type\n"); + return KSFT_FAIL; + } + merge_type = tmp; + } + break; + case 'M': + break; + case 'U': + test_name = CHECK_KSM_UNMERGE; + break; + case 'Z': + test_name = CHECK_KSM_ZERO_PAGE_MERGE; + break; + case 'N': + test_name = CHECK_KSM_NUMA_MERGE; + break; + case 'P': + test_name = KSM_MERGE_TIME; + break; + case 'H': + test_name = KSM_MERGE_TIME_HUGE_PAGES; + break; + case 'D': + test_name = KSM_UNMERGE_TIME; + break; + case 'C': + test_name = KSM_COW_TIME; + break; + default: + return KSFT_FAIL; + } + } + + if (prot == 0) + prot = str_to_prot(KSM_PROT_STR_DEFAULT); + + if (access(KSM_SYSFS_PATH, F_OK)) { + printf("Config KSM not enabled\n"); + return KSFT_SKIP; + } + + if (ksm_save_def(&ksm_sysfs_old)) { + printf("Cannot save default tunables\n"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("run"), 2) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || + numa_available() ? 0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) + return KSFT_FAIL; + + switch (test_name) { + case CHECK_KSM_MERGE: + ret = check_ksm_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_UNMERGE: + ret = check_ksm_unmerge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_ZERO_PAGE_MERGE: + ret = check_ksm_zero_page_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + page_count, ksm_scan_limit_sec, use_zero_pages, + page_size); + break; + case CHECK_KSM_NUMA_MERGE: + ret = check_ksm_numa_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, merge_across_nodes, page_size); + break; + case KSM_MERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_MERGE_TIME_HUGE_PAGES: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_hugepages_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_UNMERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_unmerge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_COW_TIME: + ret = ksm_cow_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, page_size); + break; + } + + if (ksm_restore(&ksm_sysfs_old)) { + printf("Cannot restore default tunables\n"); + return KSFT_FAIL; + } + + return ret; +} diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c new file mode 100644 index 000000000000..ef7d911da13e --- /dev/null +++ b/tools/testing/selftests/mm/madv_populate.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests + * + * Copyright 2021, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@redhat.com> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/mman.h> +#include <sys/mman.h> + +#include "../kselftest.h" +#include "vm_util.h" + +/* + * For now, we're using 2 MiB of private anonymous memory for all tests. + */ +#define SIZE (2 * 1024 * 1024) + +static size_t pagesize; + +static void sense_support(void) +{ + char *addr; + int ret; + + addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (!addr) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_READ); + if (ret) + ksft_exit_skip("MADV_POPULATE_READ is not available\n"); + + ret = madvise(addr, pagesize, MADV_POPULATE_WRITE); + if (ret) + ksft_exit_skip("MADV_POPULATE_WRITE is not available\n"); + + munmap(addr, pagesize); +} + +static void test_prot_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_WRITE with PROT_READ\n"); + + munmap(addr, SIZE); +} + +static void test_prot_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == EINVAL, + "MADV_POPULATE_READ with PROT_WRITE\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n"); + + munmap(addr, SIZE); +} + +static void test_holes(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ret = munmap(addr + pagesize, pagesize); + if (ret) + ksft_exit_fail_msg("munmap failed\n"); + + /* Hole in the middle */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes in the middle\n"); + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes in the middle\n"); + + /* Hole at end */ + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the end\n"); + ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the end\n"); + + /* Hole at beginning */ + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_READ with holes at the beginning\n"); + ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE); + ksft_test_result(ret == -1 && errno == ENOMEM, + "MADV_POPULATE_WRITE with holes at the beginning\n"); + + munmap(addr, SIZE); +} + +static bool range_is_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_populated(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_populated(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void test_populate_read(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static void test_populate_write(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + ksft_test_result(range_is_not_populated(addr, SIZE), + "range initially not populated\n"); + + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_populated(addr, SIZE), + "range is populated\n"); + + munmap(addr, SIZE); +} + +static bool range_is_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (!pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static bool range_is_not_softdirty(char *start, ssize_t size) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + bool ret = true; + + if (fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + for (; size > 0 && ret; size -= pagesize, start += pagesize) + if (pagemap_is_softdirty(fd, start)) + ret = false; + close(fd); + return ret; +} + +static void test_softdirty(void) +{ + char *addr; + int ret; + + ksft_print_msg("[RUN] %s\n", __func__); + + addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear any softdirty bits. */ + clear_softdirty(); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating READ should set softdirty. */ + ret = madvise(addr, SIZE, MADV_POPULATE_READ); + ksft_test_result(!ret, "MADV_POPULATE_READ\n"); + ksft_test_result(range_is_not_softdirty(addr, SIZE), + "range is not softdirty\n"); + + /* Populating WRITE should set softdirty. */ + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_softdirty(addr, SIZE), + "range is softdirty\n"); + + munmap(addr, SIZE); +} + +static int system_has_softdirty(void) +{ + /* + * There is no way to check if the kernel supports soft-dirty, other + * than by writing to a page and seeing if the bit was set. But the + * tests are intended to check that the bit gets set when it should, so + * doing that check would turn a potentially legitimate fail into a + * skip. Fortunately, we know for sure that arm64 does not support + * soft-dirty. So for now, let's just use the arch as a corse guide. + */ +#if defined(__aarch64__) + return 0; +#else + return 1; +#endif +} + +int main(int argc, char **argv) +{ + int nr_tests = 16; + int err; + + pagesize = getpagesize(); + + if (system_has_softdirty()) + nr_tests += 5; + + ksft_print_header(); + ksft_set_plan(nr_tests); + + sense_support(); + test_prot_read(); + test_prot_write(); + test_holes(); + test_populate_read(); + test_populate_write(); + if (system_has_softdirty()) + test_softdirty(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c new file mode 100644 index 000000000000..b74813fdc951 --- /dev/null +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test that MAP_FIXED_NOREPLACE works. + * + * Copyright 2018, Jann Horn <jannh@google.com> + * Copyright 2018, Michael Ellerman, IBM Corporation. + */ + +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include "../kselftest.h" + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +static unsigned long find_base_addr(unsigned long size) +{ + void *addr; + unsigned long flags; + + flags = MAP_PRIVATE | MAP_ANONYMOUS; + addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n"); + + if (munmap(addr, size) != 0) + ksft_exit_fail_msg("Error: munmap failed\n"); + + return (unsigned long)addr; +} + +int main(void) +{ + unsigned long base_addr; + unsigned long flags, addr, size, page_size; + char *p; + + ksft_print_header(); + ksft_set_plan(9); + + page_size = sysconf(_SC_PAGE_SIZE); + + /* let's find a base addr that is free before we start the tests */ + size = 5 * page_size; + base_addr = find_base_addr(size); + + flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; + + /* Check we can map all the areas we need below */ + addr = base_addr; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p == MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n"); + } + if (munmap((void *)addr, 5 * page_size) != 0) { + dump_maps(); + ksft_exit_fail_msg("Error: munmap failed!?\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + addr = base_addr + page_size; + size = 3 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p == MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + /* + * Exact same mapping again: + * base | free | new + * +1 | mapped | new + * +2 | mapped | new + * +3 | mapped | new + * +4 | free | new + */ + addr = base_addr; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p != MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + /* + * Second mapping contained within first: + * + * base | free | + * +1 | mapped | + * +2 | mapped | new + * +3 | mapped | + * +4 | free | + */ + addr = base_addr + (2 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p != MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + /* + * Overlap end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | new + * +4 | free | new + */ + addr = base_addr + (3 * page_size); + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p != MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + /* + * Overlap start of existing mapping: + * base | free | new + * +1 | mapped | new + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + addr = base_addr; + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p != MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + /* + * Adjacent to start of existing mapping: + * base | free | new + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + addr = base_addr; + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p == MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + /* + * Adjacent to end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | new + */ + addr = base_addr + (4 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + if (p == MAP_FAILED) { + dump_maps(); + ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n"); + } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + addr = base_addr; + size = 5 * page_size; + if (munmap((void *)addr, size) != 0) { + dump_maps(); + ksft_exit_fail_msg("Error: munmap failed!?\n"); + } + ksft_test_result_pass("Base Address unmap() successful\n"); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c new file mode 100644 index 000000000000..a1f005a90a4f --- /dev/null +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Example of using hugepage memory in a user application using the mmap + * system call with MAP_HUGETLB flag. Before running this program make + * sure the administrator has allocated enough default sized huge pages + * to cover the 256 MB allocation. + * + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> +#include "vm_util.h" +#include "../kselftest.h" + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void check_bytes(char *addr) +{ + ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static void read_bytes(char *addr, size_t length) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < length; i++) + if (*(addr + i) != (char)i) + ksft_exit_fail_msg("Mismatch at %lu\n", i); + + ksft_test_result_pass("Read correct data\n"); +} + +int main(int argc, char **argv) +{ + void *addr; + size_t hugepage_size; + size_t length = LENGTH; + int flags = FLAGS; + int shift = 0; + + hugepage_size = default_huge_page_size(); + /* munmap with fail if the length is not page aligned */ + if (hugepage_size > length) + length = hugepage_size; + + ksft_print_header(); + ksft_set_plan(1); + + if (argc > 1) + length = atol(argv[1]) << 20; + if (argc > 2) { + shift = atoi(argv[2]); + if (shift) + flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } + + if (shift) + ksft_print_msg("%u kB hugepages\n", 1 << (shift - 10)); + else + ksft_print_msg("Default size hugepages\n"); + ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + + addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); + + ksft_print_msg("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + read_bytes(addr, length); + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, length)) + ksft_exit_fail_msg("munmap: %s\n", strerror(errno)); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c new file mode 100644 index 000000000000..5c8a53869b1b --- /dev/null +++ b/tools/testing/selftests/mm/map_populate.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Dmitry Safonov, Arista Networks + * + * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include "../kselftest.h" + +#define MMAP_SZ 4096 + +#define BUG_ON(condition, description) \ + do { \ + if (condition) \ + ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \ + __func__, __LINE__, (description), \ + strerror(errno)); \ + } while (0) + +#define TESTS_IN_CHILD 2 + +static void parent_f(int sock, unsigned long *smap, int child) +{ + int status, ret; + + ret = read(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + *smap = 0x22222BAD; + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = write(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + waitpid(child, &status, 0); + + /* The ksft macros don't keep counters between processes */ + ksft_cnt.ksft_pass = WEXITSTATUS(status); + ksft_cnt.ksft_fail = TESTS_IN_CHILD - WEXITSTATUS(status); +} + +static int child_f(int sock, unsigned long *smap, int fd) +{ + int ret, buf = 0; + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_POPULATE, fd, 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); + + ret = write(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + ret = read(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + ksft_test_result(*smap != 0x22222BAD, "MAP_POPULATE COW private page\n"); + ksft_test_result(*smap == 0xdeadbabe, "The mapping state\n"); + + /* The ksft macros don't keep counters between processes */ + return ksft_cnt.ksft_pass; +} + +int main(int argc, char **argv) +{ + int sock[2], child, ret; + FILE *ftmp; + unsigned long *smap; + + ksft_print_header(); + ksft_set_plan(TESTS_IN_CHILD); + + ftmp = tmpfile(); + BUG_ON(!ftmp, "tmpfile()"); + + ret = ftruncate(fileno(ftmp), MMAP_SZ); + BUG_ON(ret, "ftruncate()"); + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_SHARED, fileno(ftmp), 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + *smap = 0xdeadbabe; + /* Probably unnecessary, but let it be. */ + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); + BUG_ON(ret, "socketpair()"); + + child = fork(); + BUG_ON(child == -1, "fork()"); + + if (child) { + ret = close(sock[0]); + BUG_ON(ret, "close()"); + + parent_f(sock[1], smap, child); + + ksft_finished(); + } + + ret = close(sock[1]); + BUG_ON(ret, "close()"); + + return child_f(sock[0], smap, fileno(ftmp)); +} diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c new file mode 100644 index 000000000000..200bedcdc32e --- /dev/null +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifdef __aarch64__ +#include <asm/hwcap.h> +#endif + +#include <linux/mman.h> +#include <linux/prctl.h> + +#include <stdio.h> +#include <stdlib.h> +#include <sys/auxv.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../kselftest_harness.h" + +#ifndef __aarch64__ +# define PROT_BTI 0 +#endif + +TEST(prctl_flags) +{ + EXPECT_LT(prctl(PR_SET_MDWE, PR_MDWE_NO_INHERIT, 0L, 0L, 7L), 0); + EXPECT_EQ(errno, EINVAL); + + EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_EQ(errno, EINVAL); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0); + EXPECT_EQ(errno, EINVAL); + + EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_EQ(errno, EINVAL); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0); + EXPECT_EQ(errno, EINVAL); +} + +FIXTURE(consecutive_prctl_flags) {}; +FIXTURE_SETUP(consecutive_prctl_flags) {} +FIXTURE_TEARDOWN(consecutive_prctl_flags) {} + +FIXTURE_VARIANT(consecutive_prctl_flags) +{ + unsigned long first_flags; + unsigned long second_flags; + bool should_work; +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_no_flags) +{ + .first_flags = 0, + .second_flags = 0, + .should_work = true, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_exec_gain) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .should_work = true, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_both_flags) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .should_work = true, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_mdwe) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .second_flags = 0, + .should_work = false, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_mdwe_no_inherit) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .second_flags = 0, + .should_work = false, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_no_inherit) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .should_work = false, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_enable_no_inherit) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .should_work = false, +}; + +TEST_F(consecutive_prctl_flags, two_prctls) +{ + int ret; + + EXPECT_EQ(prctl(PR_SET_MDWE, variant->first_flags, 0L, 0L, 0L), 0); + + ret = prctl(PR_SET_MDWE, variant->second_flags, 0L, 0L, 0L); + if (variant->should_work) { + EXPECT_EQ(ret, 0); + + ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); + ASSERT_EQ(ret, variant->second_flags); + } else { + EXPECT_NE(ret, 0); + ASSERT_EQ(errno, EPERM); + } +} + +FIXTURE(mdwe) +{ + void *p; + int flags; + size_t size; + pid_t pid; +}; + +FIXTURE_VARIANT(mdwe) +{ + bool enabled; + bool forked; + bool inherit; +}; + +FIXTURE_VARIANT_ADD(mdwe, stock) +{ + .enabled = false, + .forked = false, + .inherit = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, enabled) +{ + .enabled = true, + .forked = false, + .inherit = true, +}; + +FIXTURE_VARIANT_ADD(mdwe, inherited) +{ + .enabled = true, + .forked = true, + .inherit = true, +}; + +FIXTURE_VARIANT_ADD(mdwe, not_inherited) +{ + .enabled = true, + .forked = true, + .inherit = false, +}; + +static bool executable_map_should_fail(const FIXTURE_VARIANT(mdwe) *variant) +{ + return variant->enabled && (!variant->forked || variant->inherit); +} + +FIXTURE_SETUP(mdwe) +{ + unsigned long mdwe_flags; + int ret, status; + + self->p = NULL; + self->flags = MAP_SHARED | MAP_ANONYMOUS; + self->size = getpagesize(); + + if (!variant->enabled) + return; + + mdwe_flags = PR_MDWE_REFUSE_EXEC_GAIN; + if (!variant->inherit) + mdwe_flags |= PR_MDWE_NO_INHERIT; + + ret = prctl(PR_SET_MDWE, mdwe_flags, 0L, 0L, 0L); + ASSERT_EQ(ret, 0) { + TH_LOG("PR_SET_MDWE failed or unsupported"); + } + + ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); + ASSERT_EQ(ret, mdwe_flags); + + if (variant->forked) { + self->pid = fork(); + ASSERT_GE(self->pid, 0) { + TH_LOG("fork failed\n"); + } + + if (self->pid > 0) { + ret = waitpid(self->pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + exit(WEXITSTATUS(status)); + } + } +} + +FIXTURE_TEARDOWN(mdwe) +{ + if (self->p && self->p != MAP_FAILED) + munmap(self->p, self->size); +} + +TEST_F(mdwe, mmap_READ_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + EXPECT_NE(self->p, MAP_FAILED); +} + +TEST_F(mdwe, mmap_WRITE_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0); + if (executable_map_should_fail(variant)) { + EXPECT_EQ(self->p, MAP_FAILED); + } else { + EXPECT_NE(self->p, MAP_FAILED); + } +} + +TEST_F(mdwe, mprotect_stay_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + EXPECT_EQ(ret, 0); +} + +TEST_F(mdwe, mprotect_add_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + if (executable_map_should_fail(variant)) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mprotect_WRITE_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_WRITE, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC); + if (executable_map_should_fail(variant)) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mmap_FIXED) +{ + void *p; + + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + /* MAP_FIXED unmaps the existing page before mapping which is allowed */ + p = mmap(self->p, self->size, PROT_READ | PROT_EXEC, + self->flags | MAP_FIXED, 0, 0); + EXPECT_EQ(p, self->p); +} + +TEST_F(mdwe, arm64_BTI) +{ + int ret; + +#ifdef __aarch64__ + if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI)) +#endif + SKIP(return, "HWCAP2_BTI not supported"); + + self->p = mmap(NULL, self->size, PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_EXEC | PROT_BTI); + EXPECT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c new file mode 100644 index 000000000000..9a0597310a76 --- /dev/null +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport <rppt@linux.ibm.com> + */ + +#define _GNU_SOURCE +#include <sys/uio.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <sys/resource.h> +#include <sys/capability.h> + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> + +#include "../kselftest.h" + +#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) +#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) +#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) + +#ifdef __NR_memfd_secret + +#define PATTERN 0x55 + +static const int prot = PROT_READ | PROT_WRITE; +static const int mode = MAP_SHARED; + +static unsigned long page_size; +static unsigned long mlock_limit_cur; +static unsigned long mlock_limit_max; + +static int memfd_secret(unsigned int flags) +{ + return syscall(__NR_memfd_secret, flags); +} + +static void test_file_apis(int fd) +{ + char buf[64]; + + if ((read(fd, buf, sizeof(buf)) >= 0) || + (write(fd, buf, sizeof(buf)) >= 0) || + (pread(fd, buf, sizeof(buf), 0) >= 0) || + (pwrite(fd, buf, sizeof(buf), 0) >= 0)) + fail("unexpected file IO\n"); + else + pass("file IO is blocked as expected\n"); +} + +static void test_mlock_limit(int fd) +{ + size_t len; + char *mem; + + len = mlock_limit_cur; + if (len % page_size != 0) + len = (len/page_size) * page_size; + + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); + return; + } + munmap(mem, len); + + len = mlock_limit_max * 2; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem != MAP_FAILED) { + fail("unexpected mlock limit violation\n"); + munmap(mem, len); + return; + } + + pass("mlock limit is respected\n"); +} + +static void test_vmsplice(int fd, const char *desc) +{ + ssize_t transferred; + struct iovec iov; + int pipefd[2]; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + goto close_pipe; + } + + /* + * vmsplice() may use GUP-fast, which must also fail. Prefault the + * page table, so GUP-fast could find it. + */ + memset(mem, PATTERN, page_size); + + iov.iov_base = mem; + iov.iov_len = page_size; + transferred = vmsplice(pipefd[1], &iov, 1, 0); + + if (transferred < 0 && errno == EFAULT) + pass("vmsplice is blocked as expected with %s\n", desc); + else + fail("vmsplice: unexpected memory access with %s\n", desc); + + munmap(mem, page_size); +close_pipe: + close(pipefd[0]); + close(pipefd[1]); +} + +static void try_process_vm_read(int fd, int pipefd[2]) +{ + struct iovec liov, riov; + char buf[64]; + char *mem; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + liov.iov_len = riov.iov_len = sizeof(buf); + liov.iov_base = buf; + riov.iov_base = mem; + + if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { + if (errno == ENOSYS) + exit(KSFT_SKIP); + exit(KSFT_PASS); + } + + exit(KSFT_FAIL); +} + +static void try_ptrace(int fd, int pipefd[2]) +{ + pid_t ppid = getppid(); + int status; + char *mem; + long ret; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + perror("pipe write"); + exit(KSFT_FAIL); + } + + ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); + if (ret) { + perror("ptrace_attach"); + exit(KSFT_FAIL); + } + + ret = waitpid(ppid, &status, WUNTRACED); + if ((ret != ppid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitppid result %ld stat %x\n", + ret, status); + exit(KSFT_FAIL); + } + + if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) + exit(KSFT_PASS); + + exit(KSFT_FAIL); +} + +static void check_child_status(pid_t pid, const char *name) +{ + int status; + + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { + skip("%s is not supported\n", name); + return; + } + + if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || + WIFSIGNALED(status)) { + pass("%s is blocked as expected\n", name); + return; + } + + fail("%s: unexpected memory access\n", name); +} + +static void test_remote_access(int fd, const char *name, + void (*func)(int fd, int pipefd[2])) +{ + int pipefd[2]; + pid_t pid; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + pid = fork(); + if (pid < 0) { + fail("fork failed: %s\n", strerror(errno)); + return; + } + + if (pid == 0) { + func(fd, pipefd); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + return; + } + + memset(mem, PATTERN, page_size); + + if (write(pipefd[1], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + return; + } + + check_child_status(pid, name); +} + +static void test_process_vm_read(int fd) +{ + test_remote_access(fd, "process_vm_read", try_process_vm_read); +} + +static void test_ptrace(int fd) +{ + test_remote_access(fd, "ptrace", try_ptrace); +} + +static int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error"); + return -2; + } + + return 0; +} + +static void prepare(void) +{ + struct rlimit rlim; + + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) + ksft_exit_fail_msg("Failed to get page size %s\n", + strerror(errno)); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) + ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", + strerror(errno)); + + mlock_limit_cur = rlim.rlim_cur; + mlock_limit_max = rlim.rlim_max; + + printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", + page_size, mlock_limit_cur, mlock_limit_max); + + if (page_size > mlock_limit_cur) + mlock_limit_cur = page_size; + if (page_size > mlock_limit_max) + mlock_limit_max = page_size; + + if (set_cap_limits(mlock_limit_max)) + ksft_exit_fail_msg("Unable to set mlock limit: %s\n", + strerror(errno)); +} + +#define NUM_TESTS 6 + +int main(int argc, char *argv[]) +{ + int fd; + + prepare(); + + ksft_print_header(); + ksft_set_plan(NUM_TESTS); + + fd = memfd_secret(0); + if (fd < 0) { + if (errno == ENOSYS) + ksft_exit_skip("memfd_secret is not supported\n"); + else + ksft_exit_fail_msg("memfd_secret failed: %s\n", + strerror(errno)); + } + if (ftruncate(fd, page_size)) + ksft_exit_fail_msg("ftruncate failed: %s\n", strerror(errno)); + + test_mlock_limit(fd); + test_file_apis(fd); + /* + * We have to run the first vmsplice test before any secretmem page was + * allocated for this fd. + */ + test_vmsplice(fd, "fresh page"); + test_vmsplice(fd, "existing page"); + test_process_vm_read(fd); + test_ptrace(fd); + + close(fd); + + ksft_finished(); +} + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c new file mode 100644 index 000000000000..6908569ef406 --- /dev/null +++ b/tools/testing/selftests/mm/migration.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The main purpose of the tests here is to exercise the migration entry code + * paths in the kernel. + */ + +#include "../kselftest_harness.h" +#include <strings.h> +#include <pthread.h> +#include <numa.h> +#include <numaif.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <signal.h> +#include <time.h> + +#define TWOMEG (2<<20) +#define RUNTIME (20) + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) + +FIXTURE(migration) +{ + pthread_t *threads; + pid_t *pids; + int nthreads; + int n1; + int n2; +}; + +FIXTURE_SETUP(migration) +{ + int n; + + ASSERT_EQ(numa_available(), 0); + self->nthreads = numa_num_task_cpus() - 1; + self->n1 = -1; + self->n2 = -1; + + for (n = 0; n < numa_max_possible_node(); n++) + if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { + if (self->n1 == -1) { + self->n1 = n; + } else { + self->n2 = n; + break; + } + } + + self->threads = malloc(self->nthreads * sizeof(*self->threads)); + ASSERT_NE(self->threads, NULL); + self->pids = malloc(self->nthreads * sizeof(*self->pids)); + ASSERT_NE(self->pids, NULL); +}; + +FIXTURE_TEARDOWN(migration) +{ + free(self->threads); + free(self->pids); +} + +int migrate(uint64_t *ptr, int n1, int n2) +{ + int ret, tmp; + int status = 0; + struct timespec ts1, ts2; + + if (clock_gettime(CLOCK_MONOTONIC, &ts1)) + return -1; + + while (1) { + if (clock_gettime(CLOCK_MONOTONIC, &ts2)) + return -1; + + if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) + return 0; + + ret = move_pages(0, 1, (void **) &ptr, &n2, &status, + MPOL_MF_MOVE_ALL); + if (ret) { + if (ret > 0) + printf("Didn't migrate %d pages\n", ret); + else + perror("Couldn't migrate pages"); + return -2; + } + + tmp = n2; + n2 = n1; + n1 = tmp; + } + + return 0; +} + +void *access_mem(void *ptr) +{ + volatile uint64_t y = 0; + volatile uint64_t *x = ptr; + + while (1) { + pthread_testcancel(); + y += *x; + + /* Prevent the compiler from optimizing out the writes to y: */ + asm volatile("" : "+r" (y)); + } + + return NULL; +} + +/* + * Basic migration entry testing. One thread will move pages back and forth + * between nodes whilst other threads try and access them triggering the + * migration entry wait paths in the kernel. + */ +TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +/* + * Same as the previous test but with shared memory. + */ +TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGHUP); + /* Parent may have died before prctl so check now. */ + if (getppid() == 1) + kill(getpid(), SIGHUP); + access_mem(ptr); + } else { + self->pids[i] = pid; + } + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + +/* + * Tests the pmd migration entry paths. + */ +TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); + ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c new file mode 100644 index 000000000000..b8a7efe9204e --- /dev/null +++ b/tools/testing/selftests/mm/mkdirty.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test handling of code that might set PTE/PMD dirty in read-only VMAs. + * Setting a PTE/PMD dirty must not accidentally set the PTE/PMD writable. + * + * Copyright 2023, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@redhat.com> + */ +#include <fcntl.h> +#include <signal.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <sys/mman.h> +#include <setjmp.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <linux/userfaultfd.h> +#include <linux/mempolicy.h> + +#include "../kselftest.h" +#include "vm_util.h" + +static size_t pagesize; +static size_t thpsize; +static int mem_fd; +static int pagemap_fd; +static sigjmp_buf env; + +static void signal_handler(int sig) +{ + if (sig == SIGSEGV) + siglongjmp(env, 1); + siglongjmp(env, 2); +} + +static void do_test_write_sigsegv(char *mem) +{ + char orig = *mem; + int ret; + + if (signal(SIGSEGV, signal_handler) == SIG_ERR) { + ksft_test_result_fail("signal() failed\n"); + return; + } + + ret = sigsetjmp(env, 1); + if (!ret) + *mem = orig + 1; + + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) + ksft_test_result_fail("signal() failed\n"); + + ksft_test_result(ret == 1 && *mem == orig, + "SIGSEGV generated, page not modified\n"); +} + +static char *mmap_thp_range(int prot, char **_mmap_mem, size_t *_mmap_size) +{ + const size_t mmap_size = 2 * thpsize; + char *mem, *mmap_mem; + + mmap_mem = mmap(NULL, mmap_size, prot, MAP_PRIVATE|MAP_ANON, + -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return MAP_FAILED; + } + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + + if (madvise(mem, thpsize, MADV_HUGEPAGE)) { + ksft_test_result_skip("MADV_HUGEPAGE failed\n"); + munmap(mmap_mem, mmap_size); + return MAP_FAILED; + } + + *_mmap_mem = mmap_mem; + *_mmap_size = mmap_size; + return mem; +} + +static void test_ptrace_write(void) +{ + char data = 1; + char *mem; + int ret; + + ksft_print_msg("[INFO] PTRACE write access\n"); + + mem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* Fault in the shared zeropage. */ + if (*mem != 0) { + ksft_test_result_fail("Memory not zero\n"); + goto munmap; + } + + /* + * Unshare the page (populating a fresh anon page that might be set + * dirty in the PTE) in the read-only VMA using ptrace (FOLL_FORCE). + */ + lseek(mem_fd, (uintptr_t) mem, SEEK_SET); + ret = write(mem_fd, &data, 1); + if (ret != 1 || *mem != data) { + ksft_test_result_fail("write() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mem, pagesize); +} + +static void test_ptrace_write_thp(void) +{ + char *mem, *mmap_mem; + size_t mmap_size; + char data = 1; + int ret; + + ksft_print_msg("[INFO] PTRACE write access to THP\n"); + + mem = mmap_thp_range(PROT_READ, &mmap_mem, &mmap_size); + if (mem == MAP_FAILED) + return; + + /* + * Write to the first subpage in the read-only VMA using + * ptrace(FOLL_FORCE), eventually placing a fresh THP that is marked + * dirty in the PMD. + */ + lseek(mem_fd, (uintptr_t) mem, SEEK_SET); + ret = write(mem_fd, &data, 1); + if (ret != 1 || *mem != data) { + ksft_test_result_fail("write() failed\n"); + goto munmap; + } + + /* MM populated a THP if we got the last subpage populated as well. */ + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mmap_mem, mmap_size); +} + +static void test_page_migration(void) +{ + char *mem; + + ksft_print_msg("[INFO] Page migration\n"); + + mem = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, + -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* Populate a fresh page and dirty it. */ + memset(mem, 1, pagesize); + if (mprotect(mem, pagesize, PROT_READ)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + /* Trigger page migration. Might not be available or fail. */ + if (syscall(__NR_mbind, mem, pagesize, MPOL_LOCAL, NULL, 0x7fful, + MPOL_MF_MOVE)) { + ksft_test_result_skip("mbind() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mem, pagesize); +} + +static void test_page_migration_thp(void) +{ + char *mem, *mmap_mem; + size_t mmap_size; + + ksft_print_msg("[INFO] Page migration of THP\n"); + + mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size); + if (mem == MAP_FAILED) + return; + + /* + * Write to the first page, which might populate a fresh anon THP + * and dirty it. + */ + memset(mem, 1, pagesize); + if (mprotect(mem, thpsize, PROT_READ)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + /* MM populated a THP if we got the last subpage populated as well. */ + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + + /* Trigger page migration. Might not be available or fail. */ + if (syscall(__NR_mbind, mem, thpsize, MPOL_LOCAL, NULL, 0x7fful, + MPOL_MF_MOVE)) { + ksft_test_result_skip("mbind() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mmap_mem, mmap_size); +} + +static void test_pte_mapped_thp(void) +{ + char *mem, *mmap_mem; + size_t mmap_size; + + ksft_print_msg("[INFO] PTE-mapping a THP\n"); + + mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size); + if (mem == MAP_FAILED) + return; + + /* + * Write to the first page, which might populate a fresh anon THP + * and dirty it. + */ + memset(mem, 1, pagesize); + if (mprotect(mem, thpsize, PROT_READ)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + /* MM populated a THP if we got the last subpage populated as well. */ + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + + /* Trigger PTE-mapping the THP by mprotect'ing the last subpage. */ + if (mprotect(mem + thpsize - pagesize, pagesize, + PROT_READ|PROT_WRITE)) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + + do_test_write_sigsegv(mem); +munmap: + munmap(mmap_mem, mmap_size); +} + +#ifdef __NR_userfaultfd +static void test_uffdio_copy(void) +{ + struct uffdio_register uffdio_register; + struct uffdio_copy uffdio_copy; + struct uffdio_api uffdio_api; + char *dst, *src; + int uffd; + + ksft_print_msg("[INFO] UFFDIO_COPY\n"); + + src = malloc(pagesize); + memset(src, 1, pagesize); + dst = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0); + if (dst == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + ksft_test_result_skip("__NR_userfaultfd failed\n"); + goto munmap; + } + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { + ksft_test_result_fail("UFFDIO_API failed\n"); + goto close_uffd; + } + + uffdio_register.range.start = (unsigned long) dst; + uffdio_register.range.len = pagesize; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + ksft_test_result_fail("UFFDIO_REGISTER failed\n"); + goto close_uffd; + } + + /* Place a page in a read-only VMA, which might set the PTE dirty. */ + uffdio_copy.dst = (unsigned long) dst; + uffdio_copy.src = (unsigned long) src; + uffdio_copy.len = pagesize; + uffdio_copy.mode = 0; + if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { + ksft_test_result_fail("UFFDIO_COPY failed\n"); + goto close_uffd; + } + + do_test_write_sigsegv(dst); +close_uffd: + close(uffd); +munmap: + munmap(dst, pagesize); + free(src); +} +#endif /* __NR_userfaultfd */ + +int main(void) +{ + int err, tests = 2; + + pagesize = getpagesize(); + thpsize = read_pmd_pagesize(); + if (thpsize) { + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", + thpsize / 1024); + tests += 3; + } +#ifdef __NR_userfaultfd + tests += 1; +#endif /* __NR_userfaultfd */ + + ksft_print_header(); + ksft_set_plan(tests); + + mem_fd = open("/proc/self/mem", O_RDWR); + if (mem_fd < 0) + ksft_exit_fail_msg("opening /proc/self/mem failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("opening /proc/self/pagemap failed\n"); + + /* + * On some ptrace(FOLL_FORCE) write access via /proc/self/mem in + * read-only VMAs, the kernel may set the PTE/PMD dirty. + */ + test_ptrace_write(); + if (thpsize) + test_ptrace_write_thp(); + /* + * On page migration, the kernel may set the PTE/PMD dirty when + * remapping the page. + */ + test_page_migration(); + if (thpsize) + test_page_migration_thp(); + /* PTE-mapping a THP might propagate the dirty PMD bit to the PTEs. */ + if (thpsize) + test_pte_mapped_thp(); + /* Placing a fresh page via userfaultfd may set the PTE dirty. */ +#ifdef __NR_userfaultfd + test_uffdio_copy(); +#endif /* __NR_userfaultfd */ + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c new file mode 100644 index 000000000000..1cd80b0f76c3 --- /dev/null +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * It tests the mlock/mlock2() when they are invoked + * on randomly memory region. + */ +#include <unistd.h> +#include <sys/resource.h> +#include <sys/capability.h> +#include <sys/mman.h> +#include <linux/mman.h> +#include <fcntl.h> +#include <string.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <time.h> +#include "../kselftest.h" +#include "mlock2.h" + +#define CHUNK_UNIT (128 * 1024) +#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2) +#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT +#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3) + +#define TEST_LOOP 100 +#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1)) + +int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + ksft_perror("setrlimit() returns error\n"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + ksft_perror("cap_set_proc() returns error\n"); + return -1; + } + + return 0; +} + +int get_proc_locked_vm_size(void) +{ + FILE *f; + int ret = -1; + char line[1024] = {0}; + unsigned long lock_size = 0; + + f = fopen("/proc/self/status", "r"); + if (!f) + ksft_exit_fail_msg("fopen: %s\n", strerror(errno)); + + while (fgets(line, 1024, f)) { + if (strstr(line, "VmLck")) { + ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); + if (ret <= 0) { + fclose(f); + ksft_exit_fail_msg("sscanf() on VmLck error: %s: %d\n", + line, ret); + } + fclose(f); + return (int)(lock_size << 10); + } + } + + fclose(f); + ksft_exit_fail_msg("cannot parse VmLck in /proc/self/status: %s\n", strerror(errno)); + return -1; +} + +/* + * Get the MMUPageSize of the memory region including input + * address from proc file. + * + * return value: on error case, 0 will be returned. + * Otherwise the page size(in bytes) is returned. + */ +int get_proc_page_size(unsigned long addr) +{ + FILE *smaps; + char *line; + unsigned long mmupage_size = 0; + size_t size; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) + ksft_exit_fail_msg("Unable to parse /proc/self/smaps\n"); + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, "MMUPageSize")) { + free(line); + line = NULL; + size = 0; + continue; + } + + /* found the MMUPageSize of this section */ + if (sscanf(line, "MMUPageSize: %8lu kB", &mmupage_size) < 1) + ksft_exit_fail_msg("Unable to parse smaps entry for Size:%s\n", + line); + + } + free(line); + if (smaps) + fclose(smaps); + return mmupage_size << 10; +} + +/* + * Test mlock/mlock2() on provided memory chunk. + * It expects the mlock/mlock2() to be successful (within rlimit) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will choose start/len randomly to perform mlock/mlock2 + * [start, start + len] memory range. The range is within range + * of the allocated chunk. + * + * The memory region size alloc_size is within the rlimit. + * So we always expect a success of mlock/mlock2. + * + * VmLck is assumed to be 0 before this test. + * + * return value: 0 - success + * else: failure + */ +static void test_mlock_within_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0; + struct rlimit cur; + int page_size = 0; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur < alloc_size) + ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n", + alloc_size, (unsigned int)cur.rlim_cur); + + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + /* + * - choose mlock/mlock2 randomly + * - choose lock_size randomly but lock_size < alloc_size + * - choose start_offset randomly but p+start_offset+lock_size + * < p+alloc_size + */ + int is_mlock = !!(rand() % 2); + int lock_size = rand() % alloc_size; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + + if (ret) + ksft_exit_fail_msg("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + } + + /* + * Check VmLck left by the tests. + */ + locked_vm_size = get_proc_locked_vm_size(); + page_size = get_proc_page_size((unsigned long)p); + + if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) + ksft_exit_fail_msg("%s left VmLck:%d on %d chunk\n", + __func__, locked_vm_size, alloc_size); + + ksft_test_result_pass("%s\n", __func__); +} + + +/* + * We expect the mlock/mlock2() to be fail (outof limitation) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will randomly choose start/len and perform mlock/mlock2 + * on [start, start+len] range. + * + * The memory region size alloc_size is above the rlimit. + * And the len to be locked is higher than rlimit. + * So we always expect a failure of mlock/mlock2. + * No locked page number should be increased as a side effect. + * + * return value: 0 - success + * else: failure + */ +static void test_mlock_outof_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0, old_locked_vm_size = 0; + struct rlimit cur; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur >= alloc_size) + ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n", + alloc_size, (unsigned int)cur.rlim_cur); + + old_locked_vm_size = get_proc_locked_vm_size(); + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + int is_mlock = !!(rand() % 2); + int lock_size = (rand() % (alloc_size - cur.rlim_cur)) + + cur.rlim_cur; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + if (ret == 0) + ksft_exit_fail_msg("%s() succeeds? on %p(%d) mlock%p(%d)\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, p + start_offset, lock_size); + } + + locked_vm_size = get_proc_locked_vm_size(); + if (locked_vm_size != old_locked_vm_size) + ksft_exit_fail_msg("tests leads to new mlocked page: old[%d], new[%d]\n", + old_locked_vm_size, + locked_vm_size); + + ksft_test_result_pass("%s\n", __func__); +} + +int main(int argc, char **argv) +{ + char *p = NULL; + + ksft_print_header(); + + if (set_cap_limits(MLOCK_RLIMIT_SIZE)) + ksft_finished(); + + ksft_set_plan(2); + + p = malloc(MLOCK_WITHIN_LIMIT_SIZE); + if (p == NULL) + ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno)); + + test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); + munlock(p, MLOCK_WITHIN_LIMIT_SIZE); + free(p); + + p = malloc(MLOCK_OUTOF_LIMIT_SIZE); + if (p == NULL) + ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno)); + + test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); + munlock(p, MLOCK_OUTOF_LIMIT_SIZE); + free(p); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c new file mode 100644 index 000000000000..7f0d50fa361d --- /dev/null +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <sys/mman.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <stdbool.h> +#include "../kselftest.h" +#include "mlock2.h" + +struct vm_boundaries { + unsigned long start; + unsigned long end; +}; + +static int get_vm_area(unsigned long addr, struct vm_boundaries *area) +{ + FILE *file; + int ret = 1; + char line[1024] = {0}; + unsigned long start; + unsigned long end; + + if (!area) + return ret; + + file = fopen("/proc/self/maps", "r"); + if (!file) { + perror("fopen"); + return ret; + } + + memset(area, 0, sizeof(struct vm_boundaries)); + + while(fgets(line, 1024, file)) { + if (sscanf(line, "%lx-%lx", &start, &end) != 2) { + ksft_print_msg("cannot parse /proc/self/maps\n"); + goto out; + } + + if (start <= addr && end > addr) { + area->start = start; + area->end = end; + ret = 0; + goto out; + } + } +out: + fclose(file); + return ret; +} + +#define VMFLAGS "VmFlags:" + +static bool is_vmflag_set(unsigned long addr, const char *vmflag) +{ + char *line = NULL; + char *flags; + size_t size = 0; + bool ret = false; + FILE *smaps; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + ksft_print_msg("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, VMFLAGS)) { + free(line); + line = NULL; + size = 0; + continue; + } + + flags = line + strlen(VMFLAGS); + ret = (strstr(flags, vmflag) != NULL); + goto out; + } + +out: + free(line); + fclose(smaps); + return ret; +} + +#define SIZE "Size:" +#define RSS "Rss:" +#define LOCKED "lo" + +static unsigned long get_value_for_name(unsigned long addr, const char *name) +{ + char *line = NULL; + size_t size = 0; + char *value_ptr; + FILE *smaps = NULL; + unsigned long value = -1UL; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + ksft_print_msg("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, name)) { + free(line); + line = NULL; + size = 0; + continue; + } + + value_ptr = line + strlen(name); + if (sscanf(value_ptr, "%lu kB", &value) < 1) { + ksft_print_msg("Unable to parse smaps entry for Size\n"); + goto out; + } + break; + } + +out: + if (smaps) + fclose(smaps); + free(line); + return value; +} + +static bool is_vma_lock_on_fault(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + /* only one page is faulted in */ + return (vma_rss < vma_size); +} + +#define PRESENT_BIT 0x8000000000000000ULL +#define PFN_MASK 0x007FFFFFFFFFFFFFULL +#define UNEVICTABLE_BIT (1UL << 18) + +static int lock_check(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + return (vma_rss == vma_size); +} + +static int unlock_lock_check(char *map) +{ + if (is_vmflag_set((unsigned long)map, LOCKED)) { + ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED); + return 1; + } + + return 0; +} + +static void test_mlock_lock(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + if (mlock2_(map, 2 * page_size, 0)) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlock2(0): %s\n", strerror(errno)); + } + + ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__); + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); + } + + ksft_test_result(!unlock_lock_check(map), "%s: Locked\n", __func__); + munmap(map, 2 * page_size); +} + +static int onfault_check(char *map) +{ + *map = 'a'; + if (!is_vma_lock_on_fault((unsigned long)map)) { + ksft_print_msg("VMA is not marked for lock on fault\n"); + return 1; + } + + return 0; +} + +static int unlock_onfault_check(char *map) +{ + unsigned long page_size = getpagesize(); + + if (is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size)) { + ksft_print_msg("VMA is still lock on fault after unlock\n"); + return 1; + } + + return 0; +} + +static void test_mlock_onfault(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlock2(MLOCK_ONFAULT): %s\n", strerror(errno)); + } + + ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__); + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); + } + + ksft_test_result(!unlock_onfault_check(map), "VMA open lock after fault\n"); + munmap(map, 2 * page_size); +} + +static void test_lock_onfault_of_present(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + *map = 'a'; + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + munmap(map, 2 * page_size); + ksft_test_result_fail("mlock2(MLOCK_ONFAULT) error: %s", strerror(errno)); + } + + ksft_test_result(is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size), + "VMA with present pages is not marked lock on fault\n"); + munmap(map, 2 * page_size); +} + +static void test_munlockall0(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s\n", strerror(errno)); + + if (mlockall(MCL_CURRENT)) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT): %s\n", strerror(errno)); + } + + ksft_test_result(lock_check((unsigned long)map), "%s: Locked memory area\n", __func__); + + if (munlockall()) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); + } + + ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); + munmap(map, 2 * page_size); +} + +static void test_munlockall1(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_ONFAULT): %s\n", strerror(errno)); + } + + ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__); + + if (munlockall()) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); + } + + ksft_test_result(!unlock_onfault_check(map), "%s: Unlocked\n", __func__); + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno)); + } + + ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__); + + if (munlockall()) { + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall() %s\n", strerror(errno)); + } + + ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); + munmap(map, 2 * page_size); +} + +static void test_vma_management(bool call_mlock) +{ + void *map; + unsigned long page_size = getpagesize(); + struct vm_boundaries page1; + struct vm_boundaries page2; + struct vm_boundaries page3; + + map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { + munmap(map, 3 * page_size); + ksft_test_result_fail("mlock error: %s", strerror(errno)); + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); + } + + /* + * Before we unlock a portion, we need to that all three pages are in + * the same VMA. If they are not we abort this test (Note that this is + * not a failure) + */ + if (page1.start != page2.start || page2.start != page3.start) { + munmap(map, 3 * page_size); + ksft_test_result_fail("VMAs are not merged to start, aborting test"); + } + + if (munlock(map + page_size, page_size)) { + munmap(map, 3 * page_size); + ksft_test_result_fail("munlock(): %s", strerror(errno)); + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); + } + + /* All three VMAs should be different */ + if (page1.start == page2.start || page2.start == page3.start) { + munmap(map, 3 * page_size); + ksft_test_result_fail("failed to split VMA for munlock"); + } + + /* Now unlock the first and third page and check the VMAs again */ + if (munlock(map, page_size * 3)) { + munmap(map, 3 * page_size); + ksft_test_result_fail("munlock(): %s", strerror(errno)); + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); + } + + /* Now all three VMAs should be the same */ + if (page1.start != page2.start || page2.start != page3.start) { + munmap(map, 3 * page_size); + ksft_test_result_fail("failed to merge VMAs after munlock"); + } + + ksft_test_result_pass("%s call_mlock %d\n", __func__, call_mlock); + munmap(map, 3 * page_size); +} + +static void test_mlockall(void) +{ + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) + ksft_exit_fail_msg("mlockall failed: %s\n", strerror(errno)); + + test_vma_management(false); + munlockall(); +} + +int main(int argc, char **argv) +{ + int ret, size = 3 * getpagesize(); + void *map; + + ksft_print_header(); + + map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + ret = mlock2_(map, size, MLOCK_ONFAULT); + if (ret && errno == ENOSYS) + ksft_finished(); + + munmap(map, size); + + ksft_set_plan(13); + + test_mlock_lock(); + test_mlock_onfault(); + test_munlockall0(); + test_munlockall1(); + test_lock_onfault_of_present(); + test_vma_management(true); + test_mlockall(); + + ksft_finished(); +} diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/mm/mlock2.h index 2a6e76c226bc..4417eaa5cfb7 100644 --- a/tools/testing/selftests/vm/mlock2.h +++ b/tools/testing/selftests/mm/mlock2.h @@ -4,22 +4,9 @@ #include <stdio.h> #include <stdlib.h> -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 1 -#endif - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - static int mlock2_(void *start, size_t len, int flags) { -#ifdef __NR_mlock2 return syscall(__NR_mlock2, start, len, flags); -#else - errno = ENOSYS; - return -1; -#endif } static FILE *seek_to_smaps_entry(unsigned long addr) @@ -35,10 +22,8 @@ static FILE *seek_to_smaps_entry(unsigned long addr) char path[BUFSIZ]; file = fopen("/proc/self/smaps", "r"); - if (!file) { - perror("fopen smaps"); - _exit(1); - } + if (!file) + ksft_exit_fail_msg("fopen smaps: %s\n", strerror(errno)); while (getline(&line, &size, file) > 0) { if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c new file mode 100644 index 000000000000..100370a7111d --- /dev/null +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2022 Google LLC + */ +#define _GNU_SOURCE +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <unistd.h> +#include <asm-generic/unistd.h> +#include "vm_util.h" +#include "../kselftest.h" + +#define MB(x) (x << 20) +#define MAX_SIZE_MB 1024 + +static int alloc_noexit(unsigned long nr_pages, int pipefd) +{ + int ppid = getppid(); + int timeout = 10; /* 10sec timeout to get killed */ + unsigned long i; + char *buf; + + buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, 0, 0); + if (buf == MAP_FAILED) + ksft_exit_fail_msg("mmap failed, halting the test: %s\n", strerror(errno)); + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(buf + (i * psize()))) = i; + + /* Signal the parent that the child is ready */ + if (write(pipefd, "", 1) < 0) + ksft_exit_fail_msg("write: %s\n", strerror(errno)); + + /* Wait to be killed (when reparenting happens) */ + while (getppid() == ppid && timeout > 0) { + sleep(1); + timeout--; + } + + munmap(buf, nr_pages * psize()); + + return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; +} + +/* The process_mrelease calls in this test are expected to fail */ +static void run_negative_tests(int pidfd) +{ + /* Test invalid flags. Expect to fail with EINVAL error code. */ + if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || + errno != EINVAL) { + ksft_exit_fail_msg("process_mrelease with wrong flags: %s\n", strerror(errno)); + } + /* + * Test reaping while process is alive with no pending SIGKILL. + * Expect to fail with EINVAL error code. + */ + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) + ksft_exit_fail_msg("process_mrelease on a live process: %s\n", strerror(errno)); +} + +static int child_main(int pipefd[], size_t size) +{ + int res; + + /* Allocate and fault-in memory and wait to be killed */ + close(pipefd[0]); + res = alloc_noexit(MB(size) / psize(), pipefd[1]); + close(pipefd[1]); + return res; +} + +int main(void) +{ + int pipefd[2], pidfd; + bool success, retry; + size_t size; + pid_t pid; + char byte; + int res; + + ksft_print_header(); + ksft_set_plan(1); + + /* Test a wrong pidfd */ + if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + if (errno == ENOSYS) { + ksft_test_result_skip("process_mrelease not implemented\n"); + ksft_finished(); + } else { + ksft_exit_fail_msg("process_mrelease with wrong pidfd: %s", + strerror(errno)); + } + } + + /* Start the test with 1MB child memory allocation */ + size = 1; +retry: + /* + * Pipe for the child to signal when it's done allocating + * memory + */ + if (pipe(pipefd)) + ksft_exit_fail_msg("pipe: %s\n", strerror(errno)); + + pid = fork(); + if (pid < 0) { + close(pipefd[0]); + close(pipefd[1]); + ksft_exit_fail_msg("fork: %s\n", strerror(errno)); + } + + if (pid == 0) { + /* Child main routine */ + res = child_main(pipefd, size); + exit(res); + } + + /* + * Parent main routine: + * Wait for the child to finish allocations, then kill and reap + */ + close(pipefd[1]); + /* Block until the child is ready */ + res = read(pipefd[0], &byte, 1); + close(pipefd[0]); + if (res < 0) { + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + ksft_exit_fail_msg("read: %s\n", strerror(errno)); + } + + pidfd = syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + ksft_exit_fail_msg("pidfd_open: %s\n", strerror(errno)); + } + + /* Run negative tests which require a live child */ + run_negative_tests(pidfd); + + if (kill(pid, SIGKILL)) + ksft_exit_fail_msg("kill: %s\n", strerror(errno)); + + success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); + if (!success) { + /* + * If we failed to reap because the child exited too soon, + * before we could call process_mrelease. Double child's memory + * which causes it to spend more time on cleanup and increases + * our chances of reaping its memory before it exits. + * Retry until we succeed or reach MAX_SIZE_MB. + */ + if (errno == ESRCH) { + retry = (size <= MAX_SIZE_MB); + } else { + waitpid(pid, NULL, 0); + ksft_exit_fail_msg("process_mrelease: %s\n", strerror(errno)); + } + } + + /* Cleanup to prevent zombies */ + if (waitpid(pid, NULL, 0) < 0) + ksft_exit_fail_msg("waitpid: %s\n", strerror(errno)); + + close(pidfd); + + if (!success) { + if (retry) { + size *= 2; + goto retry; + } + ksft_exit_fail_msg("All process_mrelease attempts failed!\n"); + } + + ksft_test_result_pass("Success reaping a child with %zuMB of memory allocations\n", + size); + ksft_finished(); +} diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c index 3a7b5ef0b0c6..1d75084b9ca5 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ b/tools/testing/selftests/mm/mremap_dontunmap.c @@ -7,6 +7,7 @@ */ #define _GNU_SOURCE #include <sys/mman.h> +#include <linux/mman.h> #include <errno.h> #include <stdio.h> #include <stdlib.h> @@ -15,10 +16,6 @@ #include "../kselftest.h" -#ifndef MREMAP_DONTUNMAP -#define MREMAP_DONTUNMAP 4 -#endif - unsigned long page_size; char *page_buffer; @@ -30,14 +27,14 @@ static void dump_maps(void) system(cmd); } -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - dump_maps(); \ - exit(1); \ - } \ +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + dump_maps(); \ + ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \ + __func__, __LINE__, (description), \ + strerror(errno)); \ + } \ } while (0) // Try a simple operation for to "test" for kernel support this prevents @@ -125,6 +122,59 @@ static void mremap_dontunmap_simple() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); +} + +// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. +static void mremap_dontunmap_simple_shmem() +{ + unsigned long num_pages = 5; + + int mem_fd = memfd_create("memfd", MFD_CLOEXEC); + BUG_ON(mem_fd < 0, "memfd_create"); + + BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, + "ftruncate"); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, mem_fd, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + BUG_ON(close(mem_fd) < 0, "close"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + if (dest_mapping == MAP_FAILED && errno == EINVAL) { + // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return; + } + + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + + // Because the region is backed by shmem, we will actually see the same + // memory at the source location still. + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 'a') != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates MREMAP_DONTUNMAP will move page tables to a specific @@ -171,6 +221,7 @@ static void mremap_dontunmap_simple_fixed() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that we can MREMAP_DONTUNMAP for a portion of an @@ -221,6 +272,7 @@ static void mremap_dontunmap_partial_mapping() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that we can remap over only a portion of a mapping. @@ -280,19 +332,24 @@ static void mremap_dontunmap_partial_mapping_overwrite(void) "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, 5 * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } int main(void) { + ksft_print_header(); + page_size = sysconf(_SC_PAGE_SIZE); // test for kernel support for MREMAP_DONTUNMAP skipping the test if // not. if (kernel_support_for_mremap_dontunmap() != 0) { - printf("No kernel support for MREMAP_DONTUNMAP\n"); - return KSFT_SKIP; + ksft_print_msg("No kernel support for MREMAP_DONTUNMAP\n"); + ksft_finished(); } + ksft_set_plan(5); + // Keep a page sized buffer around for when we need it. page_buffer = mmap(NULL, page_size, PROT_READ | PROT_WRITE, @@ -300,6 +357,7 @@ int main(void) BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); mremap_dontunmap_simple(); + mremap_dontunmap_simple_shmem(); mremap_dontunmap_simple_fixed(); mremap_dontunmap_partial_mapping(); mremap_dontunmap_partial_mapping_overwrite(); @@ -307,6 +365,5 @@ int main(void) BUG_ON(munmap(page_buffer, page_size) == -1, "unable to unmap page buffer"); - printf("OK\n"); - return 0; + ksft_finished(); } diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c new file mode 100644 index 000000000000..1b03bcfaefdf --- /dev/null +++ b/tools/testing/selftests/mm/mremap_test.c @@ -0,0 +1,864 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ +#define _GNU_SOURCE + +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <time.h> +#include <stdbool.h> + +#include "../kselftest.h" + +#define EXPECT_SUCCESS 0 +#define EXPECT_FAILURE 1 +#define NON_OVERLAPPING 0 +#define OVERLAPPING 1 +#define NS_PER_SEC 1000000000ULL +#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ +#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ + +#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) +#define SIZE_MB(m) ((size_t)m * (1024 * 1024)) +#define SIZE_KB(k) ((size_t)k * 1024) + +struct config { + unsigned long long src_alignment; + unsigned long long dest_alignment; + unsigned long long region_size; + int overlapping; + int dest_preamble_size; +}; + +struct test { + const char *name; + struct config config; + int expect_failure; +}; + +enum { + _1KB = 1ULL << 10, /* 1KB -> not page aligned */ + _4KB = 4ULL << 10, + _8KB = 8ULL << 10, + _1MB = 1ULL << 20, + _2MB = 2ULL << 20, + _4MB = 4ULL << 20, + _5MB = 5ULL << 20, + _1GB = 1ULL << 30, + _2GB = 2ULL << 30, + PMD = _2MB, + PUD = _1GB, +}; + +#define PTE page_size + +#define MAKE_TEST(source_align, destination_align, size, \ + overlaps, should_fail, test_name) \ +(struct test){ \ + .name = test_name, \ + .config = { \ + .src_alignment = source_align, \ + .dest_alignment = destination_align, \ + .region_size = size, \ + .overlapping = overlaps, \ + }, \ + .expect_failure = should_fail \ +} + +/* compute square root using binary search */ +static unsigned long get_sqrt(unsigned long val) +{ + unsigned long low = 1; + + /* assuming rand_size is less than 1TB */ + unsigned long high = (1UL << 20); + + while (low <= high) { + unsigned long mid = low + (high - low) / 2; + unsigned long temp = mid * mid; + + if (temp == val) + return mid; + if (temp < val) + low = mid + 1; + high = mid - 1; + } + return low; +} + +/* + * Returns false if the requested remap region overlaps with an + * existing mapping (e.g text, stack) else returns true. + */ +static bool is_remap_region_valid(void *addr, unsigned long long size) +{ + void *remap_addr = NULL; + bool ret = true; + + /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ + remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + + if (remap_addr == MAP_FAILED) { + if (errno == EEXIST) + ret = false; + } else { + munmap(remap_addr, size); + } + + return ret; +} + +/* Returns mmap_min_addr sysctl tunable from procfs */ +static unsigned long long get_mmap_min_addr(void) +{ + FILE *fp; + int n_matched; + static unsigned long long addr; + + if (addr) + return addr; + + fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); + if (fp == NULL) { + ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + exit(KSFT_SKIP); + } + + n_matched = fscanf(fp, "%llu", &addr); + if (n_matched != 1) { + ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + fclose(fp); + exit(KSFT_SKIP); + } + + fclose(fp); + return addr; +} + +/* + * Using /proc/self/maps, assert that the specified address range is contained + * within a single mapping. + */ +static bool is_range_mapped(FILE *maps_fp, unsigned long start, + unsigned long end) +{ + char *line = NULL; + size_t len = 0; + bool success = false; + unsigned long first_val, second_val; + + rewind(maps_fp); + + while (getline(&line, &len, maps_fp) != -1) { + if (sscanf(line, "%lx-%lx", &first_val, &second_val) != 2) { + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + break; + } + + if (first_val <= start && second_val >= end) { + success = true; + break; + } + } + + return success; +} + +/* + * Returns the start address of the mapping on success, else returns + * NULL on failure. + */ +static void *get_source_mapping(struct config c) +{ + unsigned long long addr = 0ULL; + void *src_addr = NULL; + unsigned long long mmap_min_addr; + + mmap_min_addr = get_mmap_min_addr(); + /* + * For some tests, we need to not have any mappings below the + * source mapping. Add some headroom to mmap_min_addr for this. + */ + mmap_min_addr += 10 * _4MB; + +retry: + addr += c.src_alignment; + if (addr < mmap_min_addr) + goto retry; + + src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + if (src_addr == MAP_FAILED) { + if (errno == EPERM || errno == EEXIST) + goto retry; + goto error; + } + /* + * Check that the address is aligned to the specified alignment. + * Addresses which have alignments that are multiples of that + * specified are not considered valid. For instance, 1GB address is + * 2MB-aligned, however it will not be considered valid for a + * requested alignment of 2MB. This is done to reduce coincidental + * alignment in the tests. + */ + if (((unsigned long long) src_addr & (c.src_alignment - 1)) || + !((unsigned long long) src_addr & c.src_alignment)) { + munmap(src_addr, c.region_size); + goto retry; + } + + if (!src_addr) + goto error; + + return src_addr; +error: + ksft_print_msg("Failed to map source region: %s\n", + strerror(errno)); + return NULL; +} + +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + bool success = false; + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } + + munmap(start + page_size, page_size); + remap = mremap(start, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + munmap(start, page_size); + munmap(start + 2 * page_size, page_size); + goto out; + } + + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); + munmap(start, 3 * page_size); + +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* + * Similar to mremap_expand_merge() except instead of removing the middle page, + * we remove the last then attempt to remap offset from the second page. This + * should result in the mapping being restored to its former state. + */ +static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size) +{ + + char *test_name = "mremap expand merge offset"; + bool success = false; + char *remap, *start; + + start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + ksft_print_msg("mmap failed: %s\n", strerror(errno)); + goto out; + } + + /* Unmap final page to ensure we have space to expand. */ + munmap(start + 2 * page_size, page_size); + remap = mremap(start + page_size, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + munmap(start, 2 * page_size); + goto out; + } + + success = is_range_mapped(maps_fp, (unsigned long)start, + (unsigned long)(start + 3 * page_size)); + munmap(start, 3 * page_size); + +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* + * Verify that an mremap within a range does not cause corruption + * of unrelated part of range. + * + * Consider the following range which is 2MB aligned and is + * a part of a larger 20MB range which is not shown. Each + * character is 256KB below making the source and destination + * 2MB each. The lower case letters are moved (s to d) and the + * upper case letters are not moved. The below test verifies + * that the upper case S letters are not corrupted by the + * adjacent mremap. + * + * |DDDDddddSSSSssss| + */ +static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr) +{ + char *test_name = "mremap mremap move within range"; + void *src, *dest; + int i, success = 1; + + size_t size = SIZE_MB(20); + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + success = 0; + goto out; + } + memset(ptr, 0, size); + + src = ptr + SIZE_MB(6); + src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1)); + + /* Set byte pattern for source block. */ + memcpy(src, rand_addr, SIZE_MB(2)); + + dest = src - SIZE_MB(2); + + void *new_ptr = mremap(src + SIZE_MB(1), SIZE_MB(1), SIZE_MB(1), + MREMAP_MAYMOVE | MREMAP_FIXED, dest + SIZE_MB(1)); + if (new_ptr == MAP_FAILED) { + perror("mremap"); + success = 0; + goto out; + } + + /* Verify byte pattern after remapping */ + srand(pattern_seed); + for (i = 0; i < SIZE_MB(1); i++) { + char c = (char) rand(); + + if (((char *)src)[i] != c) { + ksft_print_msg("Data at src at %d got corrupted due to unrelated mremap\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) src)[i] & 0xff); + success = 0; + } + } + +out: + if (munmap(ptr, size) == -1) + perror("munmap"); + + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* Returns the time taken for the remap on success else returns -1. */ +static long long remap_region(struct config c, unsigned int threshold_mb, + char *rand_addr) +{ + void *addr, *src_addr, *dest_addr, *dest_preamble_addr; + unsigned long long t, d; + struct timespec t_start = {0, 0}, t_end = {0, 0}; + long long start_ns, end_ns, align_mask, ret, offset; + unsigned long long threshold; + unsigned long num_chunks; + + if (threshold_mb == VALIDATION_NO_THRESHOLD) + threshold = c.region_size; + else + threshold = MIN(threshold_mb * _1MB, c.region_size); + + src_addr = get_source_mapping(c); + if (!src_addr) { + ret = -1; + goto out; + } + + /* Set byte pattern for source block. */ + memcpy(src_addr, rand_addr, threshold); + + /* Mask to zero out lower bits of address for alignment */ + align_mask = ~(c.dest_alignment - 1); + /* Offset of destination address from the end of the source region */ + offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment; + addr = (void *) (((unsigned long long) src_addr + c.region_size + + offset) & align_mask); + + /* Remap after the destination block preamble. */ + addr += c.dest_preamble_size; + + /* See comment in get_source_mapping() */ + if (!((unsigned long long) addr & c.dest_alignment)) + addr = (void *) ((unsigned long long) addr | c.dest_alignment); + + /* Don't destroy existing mappings unless expected to overlap */ + while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { + /* Check for unsigned overflow */ + if (addr + c.dest_alignment < addr) { + ksft_print_msg("Couldn't find a valid region to remap to\n"); + ret = -1; + goto clean_up_src; + } + addr += c.dest_alignment; + } + + if (c.dest_preamble_size) { + dest_preamble_addr = mmap((void *) addr - c.dest_preamble_size, c.dest_preamble_size, + PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + if (dest_preamble_addr == MAP_FAILED) { + ksft_print_msg("Failed to map dest preamble region: %s\n", + strerror(errno)); + ret = -1; + goto clean_up_src; + } + + /* Set byte pattern for the dest preamble block. */ + memcpy(dest_preamble_addr, rand_addr, c.dest_preamble_size); + } + + clock_gettime(CLOCK_MONOTONIC, &t_start); + dest_addr = mremap(src_addr, c.region_size, c.region_size, + MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + if (dest_addr == MAP_FAILED) { + ksft_print_msg("mremap failed: %s\n", strerror(errno)); + ret = -1; + goto clean_up_dest_preamble; + } + + /* + * Verify byte pattern after remapping. Employ an algorithm with a + * square root time complexity in threshold: divide the range into + * chunks, if memcmp() returns non-zero, only then perform an + * iteration in that chunk to find the mismatch index. + */ + num_chunks = get_sqrt(threshold); + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = threshold / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size)) + continue; + + /* brute force iteration only over mismatch segment */ + for (t = shift; t < shift + chunk_size; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { + ksft_print_msg("Data after remap doesn't match at offset %llu\n", + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, + ((char *) dest_addr)[t] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + } + + /* + * if threshold is not divisible by num_chunks, then check the + * last chunk + */ + for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) { + if (((char *) dest_addr)[t] != rand_addr[t]) { + ksft_print_msg("Data after remap doesn't match at offset %llu\n", + t); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, + ((char *) dest_addr)[t] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + + /* Verify the dest preamble byte pattern after remapping */ + if (!c.dest_preamble_size) + goto no_preamble; + + num_chunks = get_sqrt(c.dest_preamble_size); + + for (unsigned long i = 0; i < num_chunks; ++i) { + size_t chunk_size = c.dest_preamble_size / num_chunks; + unsigned long shift = i * chunk_size; + + if (!memcmp(dest_preamble_addr + shift, rand_addr + shift, + chunk_size)) + continue; + + /* brute force iteration only over mismatched segment */ + for (d = shift; d < shift + chunk_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + } + + for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) { + if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { + ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", + d); + ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, + ((char *) dest_preamble_addr)[d] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + +no_preamble: + start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; + end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; + ret = end_ns - start_ns; + +/* + * Since the destination address is specified using MREMAP_FIXED, subsequent + * mremap will unmap any previous mapping at the address range specified by + * dest_addr and region_size. This significantly affects the remap time of + * subsequent tests. So we clean up mappings after each test. + */ +clean_up_dest: + munmap(dest_addr, c.region_size); +clean_up_dest_preamble: + if (c.dest_preamble_size && dest_preamble_addr) + munmap(dest_preamble_addr, c.dest_preamble_size); +clean_up_src: + munmap(src_addr, c.region_size); +out: + return ret; +} + +/* + * Verify that an mremap aligning down does not destroy + * the beginning of the mapping just because the aligned + * down address landed on a mapping that maybe does not exist. + */ +static void mremap_move_1mb_from_start(unsigned int pattern_seed, + char *rand_addr) +{ + char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; + void *src = NULL, *dest = NULL; + int i, success = 1; + + /* Config to reuse get_source_mapping() to do an aligned mmap. */ + struct config c = { + .src_alignment = SIZE_MB(1) + SIZE_KB(256), + .region_size = SIZE_MB(6) + }; + + src = get_source_mapping(c); + if (!src) { + success = 0; + goto out; + } + + c.src_alignment = SIZE_MB(1) + SIZE_KB(256); + dest = get_source_mapping(c); + if (!dest) { + success = 0; + goto out; + } + + /* Set byte pattern for source block. */ + memcpy(src, rand_addr, SIZE_MB(2)); + + /* + * Unmap the beginning of dest so that the aligned address + * falls on no mapping. + */ + munmap(dest, SIZE_MB(1)); + + void *new_ptr = mremap(src + SIZE_MB(1), SIZE_MB(1), SIZE_MB(1), + MREMAP_MAYMOVE | MREMAP_FIXED, dest + SIZE_MB(1)); + if (new_ptr == MAP_FAILED) { + perror("mremap"); + success = 0; + goto out; + } + + /* Verify byte pattern after remapping */ + srand(pattern_seed); + for (i = 0; i < SIZE_MB(1); i++) { + char c = (char) rand(); + + if (((char *)src)[i] != c) { + ksft_print_msg("Data at src at %d got corrupted due to unrelated mremap\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) src)[i] & 0xff); + success = 0; + } + } + +out: + if (src && munmap(src, c.region_size) == -1) + perror("munmap src"); + + if (dest && munmap(dest, c.region_size) == -1) + perror("munmap dest"); + + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +static void run_mremap_test_case(struct test test_case, int *failures, + unsigned int threshold_mb, + unsigned int pattern_seed, char *rand_addr) +{ + long long remap_time = remap_region(test_case.config, threshold_mb, + rand_addr); + + if (remap_time < 0) { + if (test_case.expect_failure) + ksft_test_result_xfail("%s\n\tExpected mremap failure\n", + test_case.name); + else { + ksft_test_result_fail("%s\n", test_case.name); + *failures += 1; + } + } else { + /* + * Comparing mremap time is only applicable if entire region + * was faulted in. + */ + if (threshold_mb == VALIDATION_NO_THRESHOLD || + test_case.config.region_size <= threshold_mb * _1MB) + ksft_test_result_pass("%s\n\tmremap time: %12lldns\n", + test_case.name, remap_time); + else + ksft_test_result_pass("%s\n", test_case.name); + } +} + +static void usage(const char *cmd) +{ + fprintf(stderr, + "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n" + "-t\t only validate threshold_mb of the remapped region\n" + " \t if 0 is supplied no threshold is used; all tests\n" + " \t are run and remapped regions validated fully.\n" + " \t The default threshold used is 4MB.\n" + "-p\t provide a seed to generate the random pattern for\n" + " \t validating the remapped region.\n", cmd); +} + +static int parse_args(int argc, char **argv, unsigned int *threshold_mb, + unsigned int *pattern_seed) +{ + const char *optstr = "t:p:"; + int opt; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 't': + *threshold_mb = atoi(optarg); + break; + case 'p': + *pattern_seed = atoi(optarg); + break; + default: + usage(argv[0]); + return -1; + } + } + + if (optind < argc) { + usage(argv[0]); + return -1; + } + + return 0; +} + +#define MAX_TEST 15 +#define MAX_PERF_TEST 3 +int main(int argc, char **argv) +{ + int failures = 0; + int i, run_perf_tests; + unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + + /* hard-coded test configs */ + size_t max_test_variable_region_size = _2GB; + size_t max_test_constant_region_size = _2MB; + size_t dest_preamble_size = 10 * _4MB; + + unsigned int pattern_seed; + char *rand_addr; + size_t rand_size; + int num_expand_tests = 2; + int num_misc_tests = 2; + struct test test_cases[MAX_TEST] = {}; + struct test perf_test_cases[MAX_PERF_TEST]; + int page_size; + time_t t; + FILE *maps_fp; + + pattern_seed = (unsigned int) time(&t); + + if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0) + exit(EXIT_FAILURE); + + ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", + threshold_mb, pattern_seed); + + /* + * set preallocated random array according to test configs; see the + * functions for the logic of setting the size + */ + if (!threshold_mb) + rand_size = MAX(max_test_variable_region_size, + max_test_constant_region_size); + else + rand_size = MAX(MIN(threshold_mb * _1MB, + max_test_variable_region_size), + max_test_constant_region_size); + rand_size = MAX(dest_preamble_size, rand_size); + + rand_addr = (char *)mmap(NULL, rand_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (rand_addr == MAP_FAILED) { + perror("mmap"); + ksft_exit_fail_msg("cannot mmap rand_addr\n"); + } + + /* fill stream of random bytes */ + srand(pattern_seed); + for (unsigned long i = 0; i < rand_size; ++i) + rand_addr[i] = (char) rand(); + + page_size = sysconf(_SC_PAGESIZE); + + /* Expected mremap failures */ + test_cases[0] = MAKE_TEST(page_size, page_size, page_size, + OVERLAPPING, EXPECT_FAILURE, + "mremap - Source and Destination Regions Overlapping"); + + test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Destination Address Misaligned (1KB-aligned)"); + test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Source Address Misaligned (1KB-aligned)"); + + /* Src addr PTE aligned */ + test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2, + NON_OVERLAPPING, EXPECT_SUCCESS, + "8KB mremap - Source PTE-aligned, Destination PTE-aligned"); + + /* Src addr 1MB aligned */ + test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"); + test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + + /* Src addr PMD aligned */ + test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PTE-aligned"); + test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"); + test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PMD-aligned"); + + /* Src addr PUD aligned */ + test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PTE-aligned"); + test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"); + test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); + test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + /* Src and Dest addr 1MB aligned. 5MB mremap. */ + test_cases[13] = MAKE_TEST(_1MB, _1MB, _5MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "5MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + + /* Src and Dest addr 1MB aligned. 5MB mremap. */ + test_cases[14] = MAKE_TEST(_1MB, _1MB, _5MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "5MB mremap - Source 1MB-aligned, Dest 1MB-aligned with 40MB Preamble"); + test_cases[14].config.dest_preamble_size = 10 * _4MB; + + perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); + /* + * mremap 1GB region - Page table level aligned time + * comparison. + */ + perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); + perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || + (threshold_mb * _1MB >= _1GB); + + ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests + num_misc_tests); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + run_mremap_test_case(test_cases[i], &failures, threshold_mb, + pattern_seed, rand_addr); + + maps_fp = fopen("/proc/self/maps", "r"); + + if (maps_fp == NULL) { + munmap(rand_addr, rand_size); + ksft_exit_fail_msg("Failed to read /proc/self/maps: %s\n", strerror(errno)); + } + + mremap_expand_merge(maps_fp, page_size); + mremap_expand_merge_offset(maps_fp, page_size); + + fclose(maps_fp); + + mremap_move_within_range(pattern_seed, rand_addr); + mremap_move_1mb_from_start(pattern_seed, rand_addr); + + if (run_perf_tests) { + ksft_print_msg("\n%s\n", + "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); + for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) + run_mremap_test_case(perf_test_cases[i], &failures, + threshold_mb, pattern_seed, + rand_addr); + } + + munmap(rand_addr, rand_size); + + if (failures > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c new file mode 100644 index 000000000000..41998cf1dcf5 --- /dev/null +++ b/tools/testing/selftests/mm/mseal_test.c @@ -0,0 +1,1894 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <linux/mman.h> +#include <sys/mman.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <stdbool.h> +#include "../kselftest.h" +#include <syscall.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/vfs.h> +#include <sys/stat.h> + +/* + * need those definition for manually build using gcc. + * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 mseal_test.c -o mseal_test + */ +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#ifndef PKEY_BITS_PER_PKEY +#define PKEY_BITS_PER_PKEY 2 +#endif + +#ifndef PKEY_MASK +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) +#endif + +#define FAIL_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + +#define SKIP_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + + +#define TEST_END_CHECK() {\ + ksft_test_result_pass("%s\n", __func__);\ + return;\ +test_end:\ + return;\ +} + +#ifndef u64 +#define u64 unsigned long long +#endif + +static unsigned long get_vma_size(void *addr, int *prot) +{ + FILE *maps; + char line[256]; + int size = 0; + uintptr_t addr_start, addr_end; + char protstr[5]; + *prot = 0; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) + return 0; + + while (fgets(line, sizeof(line), maps)) { + if (sscanf(line, "%lx-%lx %4s", &addr_start, &addr_end, protstr) == 3) { + if (addr_start == (uintptr_t) addr) { + size = addr_end - addr_start; + if (protstr[0] == 'r') + *prot |= 0x4; + if (protstr[1] == 'w') + *prot |= 0x2; + if (protstr[2] == 'x') + *prot |= 0x1; + break; + } + } + } + fclose(maps); + return size; +} + +/* + * define sys_xyx to call syscall directly. + */ +static int sys_mseal(void *start, size_t len) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mseal, start, len, 0); + return sret; +} + +static int sys_mprotect(void *ptr, size_t size, unsigned long prot) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mprotect, ptr, size, prot); + return sret; +} + +static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + errno = 0; + sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); + return sret; +} + +static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, unsigned long offset) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mmap, addr, len, prot, + flags, fd, offset); + return sret; +} + +static int sys_munmap(void *ptr, size_t size) +{ + int sret; + + errno = 0; + sret = syscall(__NR_munmap, ptr, size); + return sret; +} + +static int sys_madvise(void *start, size_t len, int types) +{ + int sret; + + errno = 0; + sret = syscall(__NR_madvise, start, len, types); + return sret; +} + +static int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(__NR_pkey_alloc, flags, init_val); + + return ret; +} + +static unsigned int __read_pkey_reg(void) +{ + unsigned int pkey_reg = 0; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax, edx; + unsigned int ecx = 0; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; +#endif + return pkey_reg; +} + +static void __write_pkey_reg(u64 pkey_reg) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); +#endif +} + +static unsigned long pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +static u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + unsigned long shift = pkey_bit_position(pkey); + + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static void set_pkey(int pkey, unsigned long pkey_value) +{ + u64 new_pkey_reg; + + new_pkey_reg = set_pkey_bits(__read_pkey_reg(), pkey, pkey_value); + __write_pkey_reg(new_pkey_reg); +} + +static void setup_single_address(int size, void **ptrOut) +{ + void *ptr; + + ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + *ptrOut = ptr; +} + +static void setup_single_address_rw(int size, void **ptrOut) +{ + void *ptr; + unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE; + + ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0); + *ptrOut = ptr; +} + +static int clean_single_address(void *ptr, int size) +{ + int ret; + ret = munmap(ptr, size); + return ret; +} + +static int seal_single_address(void *ptr, int size) +{ + int ret; + ret = sys_mseal(ptr, size); + return ret; +} + +bool seal_support(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + + ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == (void *) -1) + return false; + + ret = sys_mseal(ptr, page_size); + if (ret < 0) + return false; + + return true; +} + +bool pkey_supported(void) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + int pkey = sys_pkey_alloc(0, 0); + + if (pkey > 0) + return true; +#endif + return false; +} + +static void test_seal_addseal(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_start(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* munmap 2 pages from ptr. */ + ret = sys_munmap(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail because 2 pages from ptr are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail because 2 pages from ptr are unmapped. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + ret = sys_mseal(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_middle(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* munmap 2 pages from ptr + page. */ + ret = sys_munmap(ptr + page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail, since middle 2 pages are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* we still can add seal to the first page and last page*/ + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mseal(ptr + 3 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_unmapped_end(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap last 2 pages. */ + ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail since last 2 pages are unmapped. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* The first 2 pages is not sealed, and can add seals */ + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_multiple_vmas(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split the vma into 3. */ + ret = sys_mprotect(ptr + page_size, 2 * page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will get applied to all 4 pages - 3 VMAs. */ + ret = sys_mprotect(ptr, size, PROT_READ); + FAIL_TEST_IF_FALSE(!ret); + + /* use mprotect to split the vma into 3. */ + ret = sys_mprotect(ptr + page_size, 2 * page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mseal get applied to all 4 pages - 3 VMAs. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_split_start(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split at middle */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first page, this will split the VMA */ + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* add seal to the remain 3 pages */ + ret = sys_mseal(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_split_end(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split at middle */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the last page */ + ret = sys_mseal(ptr + 3 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* Adding seals to the first 3 pages */ + ret = sys_mseal(ptr, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_invalid_input(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(8 * page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + ret = clean_single_address(ptr + 4 * page_size, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* invalid flag */ + ret = syscall(__NR_mseal, ptr, size, 0x20); + FAIL_TEST_IF_FALSE(ret < 0); + + /* unaligned address */ + ret = sys_mseal(ptr + 1, 2 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* length too big */ + ret = sys_mseal(ptr, 5 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* length overflow */ + ret = sys_mseal(ptr, UINT64_MAX/page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* start is not in a valid VMA */ + ret = sys_mseal(ptr - page_size, 5 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + TEST_END_CHECK(); +} + +static void test_seal_zero_length(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_mprotect(ptr, 0, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal 0 length will be OK, same as mprotect */ + ret = sys_mseal(ptr, 0); + FAIL_TEST_IF_FALSE(!ret); + + /* verify the 4 pages are not sealed by previous call. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_zero_address(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + /* use mmap to change protection. */ + ptr = sys_mmap(0, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + FAIL_TEST_IF_FALSE(ptr == 0); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == 4 * page_size); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + /* verify the 4 pages are sealed by previous call. */ + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret); + + TEST_END_CHECK(); +} + +static void test_seal_twice(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + /* apply the same seal will be OK. idempotent. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_start_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* the first page is sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* pages after the first page is not sealed. */ + ret = sys_mprotect(ptr + page_size, page_size * 3, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_end_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* first page is not sealed */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* last 3 page are sealed */ + ret = sys_mprotect(ptr + page_size, page_size * 3, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_unalign_len(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, page_size * 2 - 1); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 2 pages are sealed. */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 2, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_unalign_len_variant_2(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + if (seal) { + ret = seal_single_address(ptr, page_size * 2 + 1); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 3 pages are sealed. */ + ret = sys_mprotect(ptr, page_size * 3, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 3, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = seal_single_address(ptr, page_size * 4); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_mprotect(ptr + page_size * 2, page_size * 2, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma_with_split(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split as two vma. */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* mseal can apply across 2 vma, also split them. */ + if (seal) { + ret = seal_single_address(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + } + + /* the first page is not sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* the second page is sealed. */ + ret = sys_mprotect(ptr + page_size, page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* the third page is sealed. */ + ret = sys_mprotect(ptr + 2 * page_size, page_size, + PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* the fouth page is not sealed. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_partial_mprotect(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* seal one page. */ + if (seal) { + ret = seal_single_address(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mprotect first 2 page will fail, since the first page are sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_two_vma_with_gap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, + PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* use munmap to free two pages in the middle */ + ret = sys_munmap(ptr + page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* mprotect will fail, because there is a gap in the address. */ + /* notes, internally mprotect still updated the first page. */ + ret = sys_mprotect(ptr, 4 * page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret < 0); + + /* mseal will fail as well. */ + ret = sys_mseal(ptr, 4 * page_size); + FAIL_TEST_IF_FALSE(ret < 0); + + /* the first page is not sealed. */ + ret = sys_mprotect(ptr, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + /* the last page is not sealed. */ + ret = sys_mprotect(ptr + 3 * page_size, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_split(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal all 4 pages. */ + if (seal) { + ret = sys_mseal(ptr, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mprotect is sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + + ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_mprotect_merge(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split one page. */ + ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + /* seal first two pages. */ + if (seal) { + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 2 pages are sealed. */ + ret = sys_mprotect(ptr, 2 * page_size, PROT_READ); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* last 2 pages are not sealed. */ + ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ); + FAIL_TEST_IF_FALSE(ret == 0); + + TEST_END_CHECK(); +} + +static void test_seal_munmap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* 4 pages are sealed. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +/* + * allocate 4 pages, + * use mprotect to split it as two VMAs + * seal the whole range + * munmap will fail on both + */ +static void test_seal_munmap_two_vma(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect to split */ + ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_munmap(ptr, page_size * 2); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr + page_size, page_size * 2); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +/* + * allocate a VMA with 4 pages. + * munmap the middle 2 pages. + * seal the whole 4 pages, will fail. + * munmap the first page will be OK. + * munmap the last page will be OK. + */ +static void test_seal_munmap_vma_with_gap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + ret = sys_munmap(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + /* can't have gap in the middle. */ + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(ret < 0); + } + + ret = sys_munmap(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr + page_size * 2, page_size); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_munmap_start_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap the first page. */ + ret = sys_munmap(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the last 3 pages. */ + if (seal) { + ret = sys_mseal(ptr + page_size, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* unmap from the first page. */ + ret = sys_munmap(ptr, size); + if (seal) { + FAIL_TEST_IF_FALSE(ret < 0); + + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size * 3); + } else { + /* note: this will be OK, even the first page is */ + /* already unmapped. */ + FAIL_TEST_IF_FALSE(!ret); + + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 0); + } + + TEST_END_CHECK(); +} + +static void test_munmap_end_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap last page. */ + ret = sys_munmap(ptr + page_size * 3, page_size); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first 3 pages. */ + if (seal) { + ret = sys_mseal(ptr, 3 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* unmap all pages. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_munmap_middle_freed(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int prot; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* unmap 2 pages in the middle. */ + ret = sys_munmap(ptr + page_size, page_size * 2); + FAIL_TEST_IF_FALSE(!ret); + + /* seal the first page. */ + if (seal) { + ret = sys_mseal(ptr, page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* munmap all 4 pages. */ + ret = sys_munmap(ptr, size); + if (seal) { + FAIL_TEST_IF_FALSE(ret < 0); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + + size = get_vma_size(ptr + page_size * 3, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + } else { + FAIL_TEST_IF_FALSE(!ret); + + size = get_vma_size(ptr, &prot); + FAIL_TEST_IF_FALSE(size == 0); + + size = get_vma_size(ptr + page_size * 3, &prot); + FAIL_TEST_IF_FALSE(size == 0); + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_shrink(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* shrink from 4 pages to 2 pages. */ + ret2 = mremap(ptr, size, 2 * page_size, 0, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_expand(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + /* ummap last 2 pages. */ + ret = sys_munmap(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* expand from 2 page to 4 pages. */ + ret2 = mremap(ptr, 2 * page_size, 4 * page_size, 0, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 == ptr); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move(bool seal) +{ + void *ptr, *newPtr; + unsigned long page_size = getpagesize(); + unsigned long size = page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newPtr); + FAIL_TEST_IF_FALSE(newPtr != (void *)-1); + ret = clean_single_address(newPtr, size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* move from ptr to fixed address. */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mmap_overwrite_prot(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to change protection. */ + ret2 = sys_mmap(ptr, size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mmap_expand(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 12 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + /* ummap last 4 pages. */ + ret = sys_munmap(ptr + 8 * page_size, 4 * page_size); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, 8 * page_size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to expand. */ + ret2 = sys_mmap(ptr, size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mmap_shrink(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 12 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* use mmap to shrink. */ + ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == ptr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_shrink_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move and shrink to fixed address */ + ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_expand_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); + + if (seal) { + ret = sys_mseal(newAddr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move and expand to fixed address */ + ret2 = mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED, + newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_fixed(bool seal) +{ + void *ptr; + void *newAddr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + setup_single_address(size, &newAddr); + FAIL_TEST_IF_FALSE(newAddr != (void *)-1); + + if (seal) { + ret = sys_mseal(newAddr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move to fixed address */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else + FAIL_TEST_IF_FALSE(ret2 == newAddr); + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_fixed_zero(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * MREMAP_FIXED can move the mapping to zero address + */ + ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 == 0); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_dontunmap(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* mremap to move, and don't unmap src addr. */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + + } + + TEST_END_CHECK(); +} + +static void test_seal_mremap_move_dontunmap_anyaddr(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + void *ret2; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* + * The 0xdeaddead should not have effect on dest addr + * when MREMAP_DONTUNMAP is set. + */ + ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, + 0xdeaddead); + if (seal) { + FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED); + FAIL_TEST_IF_FALSE(errno == EPERM); + } else { + FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED); + FAIL_TEST_IF_FALSE((long)ret2 != 0xdeaddead); + + } + + TEST_END_CHECK(); +} + + +static void test_seal_merge_and_split(void) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size; + int ret; + int prot; + + /* (24 RO) */ + setup_single_address(24 * page_size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + /* use mprotect(NONE) to set out boundary */ + /* (1 NONE) (22 RO) (1 NONE) */ + ret = sys_mprotect(ptr, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(!ret); + ret = sys_mprotect(ptr + 23 * page_size, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 22 * page_size); + FAIL_TEST_IF_FALSE(prot == 4); + + /* use mseal to split from beginning */ + /* (1 NONE) (1 RO_SEAL) (21 RO) (1 NONE) */ + ret = sys_mseal(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 2 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 21 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* use mseal to split from the end. */ + /* (1 NONE) (1 RO_SEAL) (20 RO) (1 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 22 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 22 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 2 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 20 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge with prev. */ + /* (1 NONE) (2 RO_SEAL) (19 RO) (1 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 2 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge with after. */ + /* (1 NONE) (2 RO_SEAL) (18 RO) (2 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 21 * page_size, page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 21 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 2 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* split and merge from prev */ + /* (1 NONE) (3 RO_SEAL) (17 RO) (2 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + 1 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 3 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + ret = sys_munmap(ptr + page_size, page_size); + FAIL_TEST_IF_FALSE(ret < 0); + ret = sys_mprotect(ptr + 2 * page_size, page_size, PROT_NONE); + FAIL_TEST_IF_FALSE(ret < 0); + + /* split and merge from next */ + /* (1 NONE) (3 RO_SEAL) (16 RO) (3 RO_SEALS) (1 NONE) */ + ret = sys_mseal(ptr + 20 * page_size, 2 * page_size); + FAIL_TEST_IF_FALSE(!ret); + FAIL_TEST_IF_FALSE(prot == 0x4); + size = get_vma_size(ptr + 20 * page_size, &prot); + FAIL_TEST_IF_FALSE(size == 3 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + /* merge from middle of prev and middle of next. */ + /* (1 NONE) (22 RO_SEAL) (1 NONE) */ + ret = sys_mseal(ptr + 2 * page_size, 20 * page_size); + FAIL_TEST_IF_FALSE(!ret); + size = get_vma_size(ptr + page_size, &prot); + FAIL_TEST_IF_FALSE(size == 22 * page_size); + FAIL_TEST_IF_FALSE(prot == 0x4); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_rw(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address_rw(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't take effect on RW memory. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + /* base seal still apply. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_pkey(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int pkey; + + SKIP_TEST_IF_FALSE(pkey_supported()); + + setup_single_address_rw(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + pkey = sys_pkey_alloc(0, 0); + FAIL_TEST_IF_FALSE(pkey > 0); + + ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey); + FAIL_TEST_IF_FALSE(!ret); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't take effect if PKRU allow write. */ + set_pkey(pkey, 0); + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + /* sealing will take effect if PKRU deny write. */ + set_pkey(pkey, PKEY_DISABLE_WRITE); + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + /* base seal still apply. */ + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_filebacked(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + int fd; + unsigned long mapflags = MAP_PRIVATE; + + fd = memfd_create("test", 0); + FAIL_TEST_IF_FALSE(fd > 0); + + ret = fallocate(fd, 0, 0, size); + FAIL_TEST_IF_FALSE(!ret); + + ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0); + FAIL_TEST_IF_FALSE(ptr != MAP_FAILED); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't apply for file backed mapping. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + close(fd); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon_on_shared(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED; + + ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = sys_mseal(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + /* sealing doesn't apply for shared mapping. */ + ret = sys_madvise(ptr, size, MADV_DONTNEED); + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +static void test_seal_discard_ro_anon(bool seal) +{ + void *ptr; + unsigned long page_size = getpagesize(); + unsigned long size = 4 * page_size; + int ret; + + setup_single_address(size, &ptr); + FAIL_TEST_IF_FALSE(ptr != (void *)-1); + + if (seal) { + ret = seal_single_address(ptr, size); + FAIL_TEST_IF_FALSE(!ret); + } + + ret = sys_madvise(ptr, size, MADV_DONTNEED); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + ret = sys_munmap(ptr, size); + if (seal) + FAIL_TEST_IF_FALSE(ret < 0); + else + FAIL_TEST_IF_FALSE(!ret); + + TEST_END_CHECK(); +} + +int main(int argc, char **argv) +{ + bool test_seal = seal_support(); + + ksft_print_header(); + + if (!test_seal) + ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); + + if (!pkey_supported()) + ksft_print_msg("PKEY not supported\n"); + + ksft_set_plan(80); + + test_seal_addseal(); + test_seal_unmapped_start(); + test_seal_unmapped_middle(); + test_seal_unmapped_end(); + test_seal_multiple_vmas(); + test_seal_split_start(); + test_seal_split_end(); + test_seal_invalid_input(); + test_seal_zero_length(); + test_seal_twice(); + + test_seal_mprotect(false); + test_seal_mprotect(true); + + test_seal_start_mprotect(false); + test_seal_start_mprotect(true); + + test_seal_end_mprotect(false); + test_seal_end_mprotect(true); + + test_seal_mprotect_unalign_len(false); + test_seal_mprotect_unalign_len(true); + + test_seal_mprotect_unalign_len_variant_2(false); + test_seal_mprotect_unalign_len_variant_2(true); + + test_seal_mprotect_two_vma(false); + test_seal_mprotect_two_vma(true); + + test_seal_mprotect_two_vma_with_split(false); + test_seal_mprotect_two_vma_with_split(true); + + test_seal_mprotect_partial_mprotect(false); + test_seal_mprotect_partial_mprotect(true); + + test_seal_mprotect_two_vma_with_gap(false); + test_seal_mprotect_two_vma_with_gap(true); + + test_seal_mprotect_merge(false); + test_seal_mprotect_merge(true); + + test_seal_mprotect_split(false); + test_seal_mprotect_split(true); + + test_seal_munmap(false); + test_seal_munmap(true); + test_seal_munmap_two_vma(false); + test_seal_munmap_two_vma(true); + test_seal_munmap_vma_with_gap(false); + test_seal_munmap_vma_with_gap(true); + + test_munmap_start_freed(false); + test_munmap_start_freed(true); + test_munmap_middle_freed(false); + test_munmap_middle_freed(true); + test_munmap_end_freed(false); + test_munmap_end_freed(true); + + test_seal_mremap_shrink(false); + test_seal_mremap_shrink(true); + test_seal_mremap_expand(false); + test_seal_mremap_expand(true); + test_seal_mremap_move(false); + test_seal_mremap_move(true); + + test_seal_mremap_shrink_fixed(false); + test_seal_mremap_shrink_fixed(true); + test_seal_mremap_expand_fixed(false); + test_seal_mremap_expand_fixed(true); + test_seal_mremap_move_fixed(false); + test_seal_mremap_move_fixed(true); + test_seal_mremap_move_dontunmap(false); + test_seal_mremap_move_dontunmap(true); + test_seal_mremap_move_fixed_zero(false); + test_seal_mremap_move_fixed_zero(true); + test_seal_mremap_move_dontunmap_anyaddr(false); + test_seal_mremap_move_dontunmap_anyaddr(true); + test_seal_discard_ro_anon(false); + test_seal_discard_ro_anon(true); + test_seal_discard_ro_anon_on_rw(false); + test_seal_discard_ro_anon_on_rw(true); + test_seal_discard_ro_anon_on_shared(false); + test_seal_discard_ro_anon_on_shared(true); + test_seal_discard_ro_anon_on_filebacked(false); + test_seal_discard_ro_anon_on_filebacked(true); + test_seal_mmap_overwrite_prot(false); + test_seal_mmap_overwrite_prot(true); + test_seal_mmap_expand(false); + test_seal_mmap_expand(true); + test_seal_mmap_shrink(false); + test_seal_mmap_shrink(true); + + test_seal_merge_and_split(); + test_seal_zero_address(); + + test_seal_discard_ro_anon_on_pkey(false); + test_seal_discard_ro_anon_on_pkey(true); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c new file mode 100644 index 000000000000..431c1277d83a --- /dev/null +++ b/tools/testing/selftests/mm/on-fault-limit.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <sys/mman.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <sys/time.h> +#include <sys/resource.h> +#include "../kselftest.h" + +static void test_limit(void) +{ + struct rlimit lims; + void *map; + + if (getrlimit(RLIMIT_MEMLOCK, &lims)) + ksft_exit_fail_msg("getrlimit: %s\n", strerror(errno)); + + if (mlockall(MCL_ONFAULT | MCL_FUTURE)) + ksft_exit_fail_msg("mlockall: %s\n", strerror(errno)); + + map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + + ksft_test_result(map == MAP_FAILED, "The map failed respecting mlock limits\n"); + + if (map != MAP_FAILED) + munmap(map, 2 * lims.rlim_max); + munlockall(); +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(1); + + if (!getuid()) + ksft_test_result_skip("The test must be run from a normal user\n"); + else + test_limit(); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c new file mode 100644 index 000000000000..2d785aca72a5 --- /dev/null +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -0,0 +1,1664 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <stdio.h> +#include <fcntl.h> +#include <string.h> +#include <sys/mman.h> +#include <errno.h> +#include <malloc.h> +#include "vm_util.h" +#include "../kselftest.h" +#include <linux/types.h> +#include <linux/memfd.h> +#include <linux/userfaultfd.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <math.h> +#include <asm/unistd.h> +#include <pthread.h> +#include <sys/resource.h> +#include <assert.h> +#include <sys/ipc.h> +#include <sys/shm.h> + +#define PAGEMAP_BITS_ALL (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE) +#define PAGEMAP_NON_WRITTEN_BITS (PAGE_IS_WPALLOWED | PAGE_IS_FILE | \ + PAGE_IS_PRESENT | PAGE_IS_SWAPPED | \ + PAGE_IS_PFNZERO | PAGE_IS_HUGE) + +#define TEST_ITERATIONS 100 +#define PAGEMAP "/proc/self/pagemap" +int pagemap_fd; +int uffd; +int page_size; +int hpage_size; +const char *progname; + +#define LEN(region) ((region.end - region.start)/page_size) + +static long pagemap_ioctl(void *start, int len, void *vec, int vec_len, int flag, + int max_pages, long required_mask, long anyof_mask, long excluded_mask, + long return_mask) +{ + struct pm_scan_arg arg; + + arg.start = (uintptr_t)start; + arg.end = (uintptr_t)(start + len); + arg.vec = (uintptr_t)vec; + arg.vec_len = vec_len; + arg.flags = flag; + arg.size = sizeof(struct pm_scan_arg); + arg.max_pages = max_pages; + arg.category_mask = required_mask; + arg.category_anyof_mask = anyof_mask; + arg.category_inverted = excluded_mask; + arg.return_mask = return_mask; + + return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); +} + +static long pagemap_ioc(void *start, int len, void *vec, int vec_len, int flag, + int max_pages, long required_mask, long anyof_mask, long excluded_mask, + long return_mask, long *walk_end) +{ + struct pm_scan_arg arg; + int ret; + + arg.start = (uintptr_t)start; + arg.end = (uintptr_t)(start + len); + arg.vec = (uintptr_t)vec; + arg.vec_len = vec_len; + arg.flags = flag; + arg.size = sizeof(struct pm_scan_arg); + arg.max_pages = max_pages; + arg.category_mask = required_mask; + arg.category_anyof_mask = anyof_mask; + arg.category_inverted = excluded_mask; + arg.return_mask = return_mask; + + ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); + + if (walk_end) + *walk_end = arg.walk_end; + + return ret; +} + + +int init_uffd(void) +{ + struct uffdio_api uffdio_api; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); + if (uffd == -1) + return uffd; + + uffdio_api.api = UFFD_API; + uffdio_api.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + return -1; + + if (!(uffdio_api.api & UFFDIO_REGISTER_MODE_WP) || + !(uffdio_api.features & UFFD_FEATURE_WP_UNPOPULATED) || + !(uffdio_api.features & UFFD_FEATURE_WP_ASYNC) || + !(uffdio_api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) + return -1; + + return 0; +} + +int wp_init(void *lpBaseAddress, int dwRegionSize) +{ + struct uffdio_register uffdio_register; + struct uffdio_writeprotect wp; + + uffdio_register.range.start = (unsigned long)lpBaseAddress; + uffdio_register.range.len = dwRegionSize; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + ksft_exit_fail_msg("ioctl(UFFDIO_REGISTER) %d %s\n", errno, strerror(errno)); + + if (!(uffdio_register.ioctls & UFFDIO_WRITEPROTECT)) + ksft_exit_fail_msg("ioctl set is incorrect\n"); + + wp.range.start = (unsigned long)lpBaseAddress; + wp.range.len = dwRegionSize; + wp.mode = UFFDIO_WRITEPROTECT_MODE_WP; + + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) + ksft_exit_fail_msg("ioctl(UFFDIO_WRITEPROTECT)\n"); + + return 0; +} + +int wp_free(void *lpBaseAddress, int dwRegionSize) +{ + struct uffdio_register uffdio_register; + + uffdio_register.range.start = (unsigned long)lpBaseAddress; + uffdio_register.range.len = dwRegionSize; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + ksft_exit_fail_msg("ioctl unregister failure\n"); + return 0; +} + +int wp_addr_range(void *lpBaseAddress, int dwRegionSize) +{ + if (pagemap_ioctl(lpBaseAddress, dwRegionSize, NULL, 0, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0) + ksft_exit_fail_msg("error %d %d %s\n", 1, errno, strerror(errno)); + + return 0; +} + +void *gethugetlb_mem(int size, int *shmid) +{ + char *mem; + + if (shmid) { + *shmid = shmget(2, size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (*shmid < 0) + return NULL; + + mem = shmat(*shmid, 0, 0); + if (mem == (char *)-1) { + shmctl(*shmid, IPC_RMID, NULL); + ksft_exit_fail_msg("Shared memory attach failure\n"); + } + } else { + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_HUGETLB | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + return NULL; + } + + return mem; +} + +int userfaultfd_tests(void) +{ + int mem_size, vec_size, written, num_pages = 16; + char *mem, *vec; + + mem_size = num_pages * page_size; + mem = mmap(NULL, mem_size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + + /* Change protection of pages differently */ + mprotect(mem, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 1 * mem_size/8, mem_size/8, PROT_READ); + mprotect(mem + 2 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 3 * mem_size/8, mem_size/8, PROT_READ); + mprotect(mem + 4 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 5 * mem_size/8, mem_size/8, PROT_NONE); + mprotect(mem + 6 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 7 * mem_size/8, mem_size/8, PROT_READ); + + wp_addr_range(mem + (mem_size/16), mem_size - 2 * (mem_size/8)); + wp_addr_range(mem, mem_size); + + vec_size = mem_size/page_size; + vec = malloc(sizeof(struct page_region) * vec_size); + + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", __func__); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + free(vec); + return 0; +} + +int get_reads(struct page_region *vec, int vec_size) +{ + int i, sum = 0; + + for (i = 0; i < vec_size; i++) + sum += LEN(vec[i]); + + return sum; +} + +int sanity_tests_sd(void) +{ + int mem_size, vec_size, ret, ret2, ret3, i, num_pages = 1000, total_pages = 0; + int total_writes, total_reads, reads, count; + struct page_region *vec, *vec2; + char *mem, *m[2]; + long walk_end; + + vec_size = num_pages/2; + mem_size = num_pages * page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + vec2 = malloc(sizeof(struct page_region) * vec_size); + if (!vec2) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + /* 1. wrong operation */ + ksft_test_result(pagemap_ioctl(mem, 0, vec, vec_size, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0, + "%s Zero range size is valid\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, NULL, vec_size, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) < 0, + "%s output buffer must be specified with size\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0, + "%s output buffer can be 0\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, 0, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0, + "%s output buffer can be 0\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, -1, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0, + "%s wrong flag specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC | 0xFF, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0, + "%s flag has extra bits specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, + 0, 0, 0, 0, PAGE_IS_WRITTEN) >= 0, + "%s no selection mask is specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, PAGE_IS_WRITTEN, 0, 0) == 0, + "%s no return mask is specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, 0x1000) < 0, + "%s wrong return mask specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, 0xFFF, PAGE_IS_WRITTEN, 0, PAGE_IS_WRITTEN) < 0, + "%s mixture of correct and wrong flag\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, 0, 0, PAGEMAP_BITS_ALL, PAGE_IS_WRITTEN) >= 0, + "%s PAGEMAP_BITS_ALL can be specified with PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", + __func__); + + /* 2. Clear area with larger vec size */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + ksft_test_result(ret >= 0, "%s Clear area with larger vec size\n", __func__); + + /* 3. Repeated pattern of written and non-written pages */ + for (i = 0; i < mem_size; i += 2 * page_size) + mem[i]++; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, + 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == mem_size/(page_size * 2), + "%s Repeated pattern of written and non-written pages\n", __func__); + + /* 4. Repeated pattern of written and non-written pages in parts */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + num_pages/2 - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret2 = pagemap_ioctl(mem, mem_size, vec, 2, 0, 0, PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ret3 = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret3 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret3, errno, strerror(errno)); + + ksft_test_result((ret + ret3) == num_pages/2 && ret2 == 2, + "%s Repeated pattern of written and non-written pages in parts %d %d %d\n", + __func__, ret, ret3, ret2); + + /* 5. Repeated pattern of written and non-written pages max_pages */ + for (i = 0; i < mem_size; i += 2 * page_size) + mem[i]++; + mem[(mem_size/page_size - 1) * page_size]++; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + num_pages/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret2 = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ksft_test_result(ret == num_pages/2 && ret2 == 1, + "%s Repeated pattern of written and non-written pages max_pages\n", + __func__); + + /* 6. only get 2 dirty pages and clear them as well */ + vec_size = mem_size/page_size; + memset(mem, -1, mem_size); + + /* get and clear second and third pages */ + ret = pagemap_ioctl(mem + page_size, 2 * page_size, vec, 1, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret2 = pagemap_ioctl(mem, mem_size, vec2, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == 2 && + vec[0].start == (uintptr_t)(mem + page_size) && + ret2 == 2 && LEN(vec2[0]) == 1 && vec2[0].start == (uintptr_t)mem && + LEN(vec2[1]) == vec_size - 3 && + vec2[1].start == (uintptr_t)(mem + 3 * page_size), + "%s only get 2 written pages and clear them as well\n", __func__); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 7. Two regions */ + m[0] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (m[0] == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + m[1] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (m[1] == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(m[0], mem_size); + wp_init(m[1], mem_size); + wp_addr_range(m[0], mem_size); + wp_addr_range(m[1], mem_size); + + memset(m[0], 'a', mem_size); + memset(m[1], 'b', mem_size); + + wp_addr_range(m[0], mem_size); + + ret = pagemap_ioctl(m[1], mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == mem_size/page_size, + "%s Two regions\n", __func__); + + wp_free(m[0], mem_size); + wp_free(m[1], mem_size); + munmap(m[0], mem_size); + munmap(m[1], mem_size); + + free(vec); + free(vec2); + + /* 8. Smaller vec */ + mem_size = 1050 * page_size; + vec_size = mem_size/(page_size*2); + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + for (i = 0; i < mem_size/page_size; i += 2) + mem[i * page_size]++; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + total_pages += ret; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + total_pages += ret; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + total_pages += ret; + + ksft_test_result(total_pages == mem_size/(page_size*2), "%s Smaller max_pages\n", __func__); + + free(vec); + wp_free(mem, mem_size); + munmap(mem, mem_size); + total_pages = 0; + + /* 9. Smaller vec */ + mem_size = 10000 * page_size; + vec_size = 50; + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + for (count = 0; count < TEST_ITERATIONS; count++) { + total_writes = total_reads = 0; + walk_end = (long)mem; + + for (i = 0; i < mem_size; i += page_size) { + if (rand() % 2) { + mem[i]++; + total_writes++; + } + } + + while (total_reads < total_writes) { + ret = pagemap_ioc((void *)walk_end, mem_size-(walk_end - (long)mem), vec, + vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + if (ret > vec_size) + break; + + reads = get_reads(vec, ret); + total_reads += reads; + } + + if (total_reads != total_writes) + break; + } + + ksft_test_result(count == TEST_ITERATIONS, "Smaller vec\n"); + + free(vec); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 10. Walk_end tester */ + vec_size = 1000; + mem_size = vec_size * page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + memset(mem, 0, mem_size); + + ret = pagemap_ioc(mem, 0, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 0 && walk_end == (long)mem, + "Walk_end: Same start and end address\n"); + + ret = pagemap_ioc(mem, 0, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 0 && walk_end == (long)mem, + "Walk_end: Same start and end with WP\n"); + + ret = pagemap_ioc(mem, 0, vec, 0, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 0 && walk_end == (long)mem, + "Walk_end: Same start and end with 0 output buffer\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: Big vec\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: vec of minimum length\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size/2), + "Walk_end: Half max pages\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size), + "Walk_end: 1 max page\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + -1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: max pages\n"); + + wp_addr_range(mem, mem_size); + for (i = 0; i < mem_size; i += 2 * page_size) + mem[i]++; + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_end sparse: Big vec\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), + "Walk_end sparse: vec of minimum length\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), + "Walk_end sparse: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size/2, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_end sparse: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_end sparse: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_endsparse : Half max pages\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), + "Walk_end: 1 max page\n"); + + free(vec); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + return 0; +} + +int base_tests(char *prefix, char *mem, int mem_size, int skip) +{ + int vec_size, written; + struct page_region *vec, *vec2; + + if (skip) { + ksft_test_result_skip("%s all new pages must not be written (dirty)\n", prefix); + ksft_test_result_skip("%s all pages must be written (dirty)\n", prefix); + ksft_test_result_skip("%s all pages dirty other than first and the last one\n", + prefix); + ksft_test_result_skip("%s PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", prefix); + ksft_test_result_skip("%s only middle page dirty\n", prefix); + ksft_test_result_skip("%s only two middle pages dirty\n", prefix); + return 0; + } + + vec_size = mem_size/page_size; + vec = malloc(sizeof(struct page_region) * vec_size); + vec2 = malloc(sizeof(struct page_region) * vec_size); + + /* 1. all new pages must be not be written (dirty) */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", prefix); + + /* 2. all pages must be written */ + memset(mem, -1, mem_size); + + written = pagemap_ioctl(mem, mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && LEN(vec[0]) == mem_size/page_size, + "%s all pages must be written (dirty)\n", prefix); + + /* 3. all pages dirty other than first and the last one */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + memset(mem + page_size, 0, mem_size - (2 * page_size)); + + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && LEN(vec[0]) >= vec_size - 2 && LEN(vec[0]) <= vec_size, + "%s all pages dirty other than first and the last one\n", prefix); + + written = pagemap_ioctl(mem, mem_size, vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 0, + "%s PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", prefix); + + /* 4. only middle page dirty */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + mem[vec_size/2 * page_size]++; + + written = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && LEN(vec[0]) >= 1, + "%s only middle page dirty\n", prefix); + + /* 5. only two middle pages dirty and walk over only middle pages */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN | PAGE_IS_HUGE); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + mem[vec_size/2 * page_size]++; + mem[(vec_size/2 + 1) * page_size]++; + + written = pagemap_ioctl(&mem[vec_size/2 * page_size], 2 * page_size, vec, 1, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN | PAGE_IS_HUGE); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && vec[0].start == (uintptr_t)(&mem[vec_size/2 * page_size]) + && LEN(vec[0]) == 2, + "%s only two middle pages dirty\n", prefix); + + free(vec); + free(vec2); + return 0; +} + +void *gethugepage(int map_size) +{ + int ret; + char *map; + + map = memalign(hpage_size, map_size); + if (!map) + ksft_exit_fail_msg("memalign failed %d %s\n", errno, strerror(errno)); + + ret = madvise(map, map_size, MADV_HUGEPAGE); + if (ret) + return NULL; + + memset(map, 0, map_size); + + return map; +} + +int hpage_unit_tests(void) +{ + char *map; + int ret, ret2; + size_t num_pages = 10; + int map_size = hpage_size * num_pages; + int vec_size = map_size/page_size; + struct page_region *vec, *vec2; + + vec = malloc(sizeof(struct page_region) * vec_size); + vec2 = malloc(sizeof(struct page_region) * vec_size); + if (!vec || !vec2) + ksft_exit_fail_msg("malloc failed\n"); + + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + /* 1. all new huge page must not be written (dirty) */ + ret = pagemap_ioctl(map, map_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 0, "%s all new huge page must not be written (dirty)\n", + __func__); + + /* 2. all the huge page must not be written */ + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 0, "%s all the huge page must not be written\n", __func__); + + /* 3. all the huge page must be written and clear dirty as well */ + memset(map, -1, map_size); + ret = pagemap_ioctl(map, map_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].start == (uintptr_t)map && + LEN(vec[0]) == vec_size && vec[0].categories == PAGE_IS_WRITTEN, + "%s all the huge page must be written and clear\n", __func__); + + /* 4. only middle page written */ + wp_free(map, map_size); + free(map); + map = gethugepage(map_size); + wp_init(map, map_size); + wp_addr_range(map, map_size); + map[vec_size/2 * page_size]++; + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) > 0, + "%s only middle page written\n", __func__); + + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s all new huge page must be written\n", __func__); + ksft_test_result_skip("%s all the huge page must not be written\n", __func__); + ksft_test_result_skip("%s all the huge page must be written and clear\n", __func__); + ksft_test_result_skip("%s only middle page written\n", __func__); + } + + /* 5. clear first half of huge page */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, 0, map_size); + + wp_addr_range(map, map_size/2); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2 && + vec[0].start == (uintptr_t)(map + map_size/2), + "%s clear first half of huge page\n", __func__); + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s clear first half of huge page\n", __func__); + } + + /* 6. clear first half of huge page with limited buffer */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, 0, map_size); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2 && + vec[0].start == (uintptr_t)(map + map_size/2), + "%s clear first half of huge page with limited buffer\n", + __func__); + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s clear first half of huge page with limited buffer\n", + __func__); + } + + /* 7. clear second half of huge page */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, -1, map_size); + + ret = pagemap_ioctl(map + map_size/2, map_size/2, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, vec_size/2, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2, + "%s clear second half huge page\n", __func__); + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s clear second half huge page\n", __func__); + } + + /* 8. get half huge page */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, -1, map_size); + usleep(100); + + ret = pagemap_ioctl(map, map_size, vec, 1, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + hpage_size/(2*page_size), PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == hpage_size/(2*page_size), + "%s get half huge page\n", __func__); + + ret2 = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ksft_test_result(ret2 == 1 && LEN(vec[0]) == (map_size - hpage_size/2)/page_size, + "%s get half huge page\n", __func__); + + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s get half huge page\n", __func__); + ksft_test_result_skip("%s get half huge page\n", __func__); + } + + free(vec); + free(vec2); + return 0; +} + +int unmapped_region_tests(void) +{ + void *start = (void *)0x10000000; + int written, len = 0x00040000; + int vec_size = len / page_size; + struct page_region *vec = malloc(sizeof(struct page_region) * vec_size); + + /* 1. Get written pages */ + written = pagemap_ioctl(start, len, vec, vec_size, 0, 0, + PAGEMAP_NON_WRITTEN_BITS, 0, 0, PAGEMAP_NON_WRITTEN_BITS); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written >= 0, "%s Get status of pages\n", __func__); + + free(vec); + return 0; +} + +static void test_simple(void) +{ + int i; + char *map; + struct page_region vec; + + map = aligned_alloc(page_size, page_size); + if (!map) + ksft_exit_fail_msg("aligned_alloc failed\n"); + + wp_init(map, page_size); + wp_addr_range(map, page_size); + + for (i = 0 ; i < TEST_ITERATIONS; i++) { + if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 1) { + ksft_print_msg("written bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + wp_addr_range(map, page_size); + /* Write something to the page to get the written bit enabled on the page */ + map[0]++; + + if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 0) { + ksft_print_msg("written bit was 0, but should be 1 (i=%d)\n", i); + break; + } + + wp_addr_range(map, page_size); + } + wp_free(map, page_size); + free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +int sanity_tests(void) +{ + int mem_size, vec_size, ret, fd, i, buf_size; + struct page_region *vec; + char *mem, *fmem; + struct stat sbuf; + char *tmp_buf; + + /* 1. wrong operation */ + mem_size = 10 * page_size; + vec_size = mem_size / page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED || vec == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) >= 0, + "%s WP op can be specified with !PAGE_IS_WRITTEN\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) >= 0, + "%s required_mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, PAGEMAP_BITS_ALL) >= 0, + "%s anyof_mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, 0, PAGEMAP_BITS_ALL, PAGEMAP_BITS_ALL) >= 0, + "%s excluded_mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_BITS_ALL, PAGEMAP_BITS_ALL, 0, + PAGEMAP_BITS_ALL) >= 0, + "%s required_mask and anyof_mask specified\n", __func__); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 2. Get sd and present pages with anyof_mask */ + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + memset(mem, 0, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, PAGEMAP_BITS_ALL); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) == + (PAGE_IS_WRITTEN | PAGE_IS_PRESENT), + "%s Get sd and present pages with anyof_mask\n", __func__); + + /* 3. Get sd and present pages with required_mask */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) == + (PAGE_IS_WRITTEN | PAGE_IS_PRESENT), + "%s Get all the pages with required_mask\n", __func__); + + /* 4. Get sd and present pages with required_mask and anyof_mask */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, PAGE_IS_PRESENT, 0, PAGEMAP_BITS_ALL); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) == + (PAGE_IS_WRITTEN | PAGE_IS_PRESENT), + "%s Get sd and present pages with required_mask and anyof_mask\n", + __func__); + + /* 5. Don't get sd pages */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, PAGE_IS_WRITTEN, PAGEMAP_BITS_ALL); + ksft_test_result(ret == 0, "%s Don't get sd pages\n", __func__); + + /* 6. Don't get present pages */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGE_IS_PRESENT, 0, PAGE_IS_PRESENT, PAGEMAP_BITS_ALL); + ksft_test_result(ret == 0, "%s Don't get present pages\n", __func__); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 8. Find written present pages with return mask */ + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + memset(mem, 0, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + 0, PAGEMAP_BITS_ALL, 0, PAGE_IS_WRITTEN); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + vec[0].categories == PAGE_IS_WRITTEN, + "%s Find written present pages with return mask\n", __func__); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 9. Memory mapped file */ + fd = open(progname, O_RDONLY); + if (fd < 0) + ksft_exit_fail_msg("%s Memory mapped file\n", __func__); + + ret = stat(progname, &sbuf); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); + + tmp_buf = malloc(sbuf.st_size); + memcpy(tmp_buf, fmem, sbuf.st_size); + + ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0, + 0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS); + + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && + LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) && + (vec[0].categories & PAGE_IS_FILE), + "%s Memory mapped file\n", __func__); + + munmap(fmem, sbuf.st_size); + close(fd); + + /* 10. Create and read/write to a memory mapped file */ + buf_size = page_size * 10; + + fd = open(__FILE__".tmp2", O_RDWR | O_CREAT, 0666); + if (fd < 0) + ksft_exit_fail_msg("Read/write to memory: %s\n", + strerror(errno)); + + for (i = 0; i < buf_size; i++) + if (write(fd, "c", 1) < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file\n"); + + fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); + + wp_init(fmem, buf_size); + wp_addr_range(fmem, buf_size); + + for (i = 0; i < buf_size; i++) + fmem[i] = 'z'; + + msync(fmem, buf_size, MS_SYNC); + + ret = pagemap_ioctl(fmem, buf_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_FILE, 0, + PAGEMAP_BITS_ALL); + + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && + LEN(vec[0]) == (buf_size/page_size) && + (vec[0].categories & PAGE_IS_WRITTEN), + "%s Read/write to memory\n", __func__); + + wp_free(fmem, buf_size); + munmap(fmem, buf_size); + close(fd); + + free(vec); + return 0; +} + +int mprotect_tests(void) +{ + int ret; + char *mem, *mem2; + struct page_region vec; + int pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + + if (pagemap_fd < 0) { + fprintf(stderr, "open() failed\n"); + exit(1); + } + + /* 1. Map two pages */ + mem = mmap(0, 2 * page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, 2 * page_size); + wp_addr_range(mem, 2 * page_size); + + /* Populate both pages. */ + memset(mem, 1, 2 * page_size); + + ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec) == 2, "%s Both pages written\n", __func__); + + /* 2. Start tracking */ + wp_addr_range(mem, 2 * page_size); + + ksft_test_result(pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 0, + "%s Both pages are not written (dirty)\n", __func__); + + /* 3. Remap the second page */ + mem2 = mmap(mem + page_size, page_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON|MAP_FIXED, -1, 0); + if (mem2 == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem2, page_size); + wp_addr_range(mem2, page_size); + + /* Protect + unprotect. */ + mprotect(mem, page_size, PROT_NONE); + mprotect(mem, 2 * page_size, PROT_READ); + mprotect(mem, 2 * page_size, PROT_READ|PROT_WRITE); + + /* Modify both pages. */ + memset(mem, 2, 2 * page_size); + + /* Protect + unprotect. */ + mprotect(mem, page_size, PROT_NONE); + mprotect(mem, page_size, PROT_READ); + mprotect(mem, page_size, PROT_READ|PROT_WRITE); + + ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec) == 2, + "%s Both pages written after remap and mprotect\n", __func__); + + /* 4. Clear and make the pages written */ + wp_addr_range(mem, 2 * page_size); + + memset(mem, 'A', 2 * page_size); + + ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec) == 2, + "%s Clear and make the pages written\n", __func__); + + wp_free(mem, 2 * page_size); + munmap(mem, 2 * page_size); + return 0; +} + +/* transact test */ +static const unsigned int nthreads = 6, pages_per_thread = 32, access_per_thread = 8; +static pthread_barrier_t start_barrier, end_barrier; +static unsigned int extra_thread_faults; +static unsigned int iter_count = 1000; +static volatile int finish; + +static ssize_t get_dirty_pages_reset(char *mem, unsigned int count, + int reset, int page_size) +{ + struct pm_scan_arg arg = {0}; + struct page_region rgns[256]; + int i, j, cnt, ret; + + arg.size = sizeof(struct pm_scan_arg); + arg.start = (uintptr_t)mem; + arg.max_pages = count; + arg.end = (uintptr_t)(mem + count * page_size); + arg.vec = (uintptr_t)rgns; + arg.vec_len = sizeof(rgns) / sizeof(*rgns); + if (reset) + arg.flags |= PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC; + arg.category_mask = PAGE_IS_WRITTEN; + arg.return_mask = PAGE_IS_WRITTEN; + + ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); + if (ret < 0) + ksft_exit_fail_msg("ioctl failed\n"); + + cnt = 0; + for (i = 0; i < ret; ++i) { + if (rgns[i].categories != PAGE_IS_WRITTEN) + ksft_exit_fail_msg("wrong flags\n"); + + for (j = 0; j < LEN(rgns[i]); ++j) + cnt++; + } + + return cnt; +} + +void *thread_proc(void *mem) +{ + int *m = mem; + long curr_faults, faults; + struct rusage r; + unsigned int i; + int ret; + + if (getrusage(RUSAGE_THREAD, &r)) + ksft_exit_fail_msg("getrusage\n"); + + curr_faults = r.ru_minflt; + + while (!finish) { + ret = pthread_barrier_wait(&start_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + for (i = 0; i < access_per_thread; ++i) + __atomic_add_fetch(m + i * (0x1000 / sizeof(*m)), 1, __ATOMIC_SEQ_CST); + + ret = pthread_barrier_wait(&end_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + if (getrusage(RUSAGE_THREAD, &r)) + ksft_exit_fail_msg("getrusage\n"); + + faults = r.ru_minflt - curr_faults; + if (faults < access_per_thread) + ksft_exit_fail_msg("faults < access_per_thread"); + + __atomic_add_fetch(&extra_thread_faults, faults - access_per_thread, + __ATOMIC_SEQ_CST); + curr_faults = r.ru_minflt; + } + + return NULL; +} + +static void transact_test(int page_size) +{ + unsigned int i, count, extra_pages; + pthread_t th; + char *mem; + int ret, c; + + if (pthread_barrier_init(&start_barrier, NULL, nthreads + 1)) + ksft_exit_fail_msg("pthread_barrier_init\n"); + + if (pthread_barrier_init(&end_barrier, NULL, nthreads + 1)) + ksft_exit_fail_msg("pthread_barrier_init\n"); + + mem = mmap(NULL, 0x1000 * nthreads * pages_per_thread, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("Error mmap %s.\n", strerror(errno)); + + wp_init(mem, 0x1000 * nthreads * pages_per_thread); + wp_addr_range(mem, 0x1000 * nthreads * pages_per_thread); + + memset(mem, 0, 0x1000 * nthreads * pages_per_thread); + + count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); + ksft_test_result(count > 0, "%s count %d\n", __func__, count); + count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); + ksft_test_result(count == 0, "%s count %d\n", __func__, count); + + finish = 0; + for (i = 0; i < nthreads; ++i) + pthread_create(&th, NULL, thread_proc, mem + 0x1000 * i * pages_per_thread); + + extra_pages = 0; + for (i = 0; i < iter_count; ++i) { + count = 0; + + ret = pthread_barrier_wait(&start_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, + page_size); + + ret = pthread_barrier_wait(&end_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + if (count > nthreads * access_per_thread) + ksft_exit_fail_msg("Too big count %d expected %d, iter %d\n", + count, nthreads * access_per_thread, i); + + c = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); + count += c; + + if (c > nthreads * access_per_thread) { + ksft_test_result_fail(" %s count > nthreads\n", __func__); + return; + } + + if (count != nthreads * access_per_thread) { + /* + * The purpose of the test is to make sure that no page updates are lost + * when the page updates and read-resetting soft dirty flags are performed + * in parallel. However, it is possible that the application will get the + * soft dirty flags twice on the two consecutive read-resets. This seems + * unavoidable as soft dirty flag is handled in software through page faults + * in kernel. While the updating the flags is supposed to be synchronized + * between page fault handling and read-reset, it is possible that + * read-reset happens after page fault PTE update but before the application + * re-executes write instruction. So read-reset gets the flag, clears write + * access and application gets page fault again for the same write. + */ + if (count < nthreads * access_per_thread) { + ksft_test_result_fail("Lost update, iter %d, %d vs %d.\n", i, count, + nthreads * access_per_thread); + return; + } + + extra_pages += count - nthreads * access_per_thread; + } + } + + pthread_barrier_wait(&start_barrier); + finish = 1; + pthread_barrier_wait(&end_barrier); + + ksft_test_result_pass("%s Extra pages %u (%.1lf%%), extra thread faults %d.\n", __func__, + extra_pages, + 100.0 * extra_pages / (iter_count * nthreads * access_per_thread), + extra_thread_faults); +} + +int main(int argc, char *argv[]) +{ + int mem_size, shmid, buf_size, fd, i, ret; + char *mem, *map, *fmem; + struct stat sbuf; + + progname = argv[0]; + + ksft_print_header(); + + if (init_uffd()) + ksft_exit_pass(); + + ksft_set_plan(115); + + page_size = getpagesize(); + hpage_size = read_pmd_pagesize(); + + pagemap_fd = open(PAGEMAP, O_RDONLY); + if (pagemap_fd < 0) + return -EINVAL; + + /* 1. Sanity testing */ + sanity_tests_sd(); + + /* 2. Normal page testing */ + mem_size = 10 * page_size; + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Page testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 3. Large page testing */ + mem_size = 512 * 10 * page_size; + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Large Page testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 4. Huge page testing */ + map = gethugepage(hpage_size); + if (map) { + wp_init(map, hpage_size); + wp_addr_range(map, hpage_size); + base_tests("Huge page testing:", map, hpage_size, 0); + wp_free(map, hpage_size); + free(map); + } else { + base_tests("Huge page testing:", NULL, 0, 1); + } + + /* 5. SHM Hugetlb page testing */ + mem_size = 2*1024*1024; + mem = gethugetlb_mem(mem_size, &shmid); + if (mem) { + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Hugetlb shmem testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + shmctl(shmid, IPC_RMID, NULL); + } else { + base_tests("Hugetlb shmem testing:", NULL, 0, 1); + } + + /* 6. Hugetlb page testing */ + mem = gethugetlb_mem(mem_size, NULL); + if (mem) { + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Hugetlb mem testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + } else { + base_tests("Hugetlb mem testing:", NULL, 0, 1); + } + + /* 7. File Hugetlb testing */ + mem_size = 2*1024*1024; + fd = memfd_create("uffd-test", MFD_HUGETLB | MFD_NOEXEC_SEAL); + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem) { + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Hugetlb shmem testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + shmctl(shmid, IPC_RMID, NULL); + } else { + base_tests("Hugetlb shmem testing:", NULL, 0, 1); + } + close(fd); + + /* 8. File memory testing */ + buf_size = page_size * 10; + + fd = open(__FILE__".tmp0", O_RDWR | O_CREAT, 0777); + if (fd < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file: %s\n", + strerror(errno)); + + for (i = 0; i < buf_size; i++) + if (write(fd, "c", 1) < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file\n"); + + ret = stat(__FILE__".tmp0", &sbuf); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + fmem = mmap(NULL, sbuf.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); + + wp_init(fmem, sbuf.st_size); + wp_addr_range(fmem, sbuf.st_size); + + base_tests("File memory testing:", fmem, sbuf.st_size, 0); + + wp_free(fmem, sbuf.st_size); + munmap(fmem, sbuf.st_size); + close(fd); + + /* 9. File memory testing */ + buf_size = page_size * 10; + + fd = memfd_create(__FILE__".tmp00", MFD_NOEXEC_SEAL); + if (fd < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file: %s\n", + strerror(errno)); + + if (ftruncate(fd, buf_size)) + ksft_exit_fail_msg("Error ftruncate\n"); + + for (i = 0; i < buf_size; i++) + if (write(fd, "c", 1) < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file\n"); + + fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); + + wp_init(fmem, buf_size); + wp_addr_range(fmem, buf_size); + + base_tests("File anonymous memory testing:", fmem, buf_size, 0); + + wp_free(fmem, buf_size); + munmap(fmem, buf_size); + close(fd); + + /* 10. Huge page tests */ + hpage_unit_tests(); + + /* 11. Iterative test */ + test_simple(); + + /* 12. Mprotect test */ + mprotect_tests(); + + /* 13. Transact test */ + transact_test(page_size); + + /* 14. Sanity testing */ + sanity_tests(); + + /*15. Unmapped address test */ + unmapped_region_tests(); + + /* 16. Userfaultfd tests */ + userfaultfd_tests(); + + close(pagemap_fd); + ksft_exit_pass(); +} diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 622a85848f61..1af3156a9db8 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -13,6 +13,8 @@ #include <ucontext.h> #include <sys/mman.h> +#include "../kselftest.h" + /* Define some kernel-like types */ #define u8 __u8 #define u16 __u16 @@ -32,7 +34,7 @@ extern int test_nr; extern int iteration_nr; #ifdef __GNUC__ -__attribute__((format(printf, 1, 2))) +__printf(1, 2) #endif static inline void sigsafe_printf(const char *format, ...) { @@ -175,7 +177,6 @@ static inline void __pkey_write_allow(int pkey, int do_allow_write) dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); } -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) #define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) #define ALIGN_PTR_UP(p, ptr_align_to) \ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h index 1ebb586b2fbc..ae5df26104e5 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/mm/pkey-powerpc.h @@ -3,9 +3,6 @@ #ifndef _PKEYS_POWERPC_H #define _PKEYS_POWERPC_H -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 386 -#endif #ifndef SYS_pkey_alloc # define SYS_pkey_alloc 384 # define SYS_pkey_free 385 diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index 3be20f5d5275..814758e109c0 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -5,29 +5,11 @@ #ifdef __i386__ -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif - #define REG_IP_IDX REG_EIP #define si_pkey_offset 0x14 #else -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif - #define REG_IP_IDX REG_RIP #define si_pkey_offset 0x20 @@ -80,19 +62,6 @@ static inline void __write_pkey_reg(u64 pkey_reg) assert(pkey_reg == __read_pkey_reg()); } -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ #define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ @@ -104,9 +73,7 @@ static inline int cpu_has_pkeys(void) unsigned int ecx; unsigned int edx; - eax = 0x7; - ecx = 0x0; - __cpuid(&eax, &ebx, &ecx, &edx); + __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx); if (!(ecx & X86_FEATURE_PKU)) { dprintf2("cpu does not have PKU\n"); @@ -119,6 +86,18 @@ static inline int cpu_has_pkeys(void) return 1; } +static inline int cpu_max_xsave_size(void) +{ + unsigned long XSTATE_CPUID = 0xd; + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx); + return ecx; +} + static inline u32 pkey_bit_position(int pkey) { return pkey * PKEY_BITS_PER_PKEY; @@ -126,6 +105,7 @@ static inline u32 pkey_bit_position(int pkey) #define XSTATE_PKEY_BIT (9) #define XSTATE_PKEY 0x200 +#define XSTATE_BV_OFFSET 512 int pkey_reg_xstate_offset(void) { @@ -134,16 +114,14 @@ int pkey_reg_xstate_offset(void) unsigned int ecx; unsigned int edx; int xstate_offset; - int xstate_size; + int xstate_size = 0; unsigned long XSTATE_CPUID = 0xd; int leaf; /* assume that XSTATE_PKEY is set in XCR0 */ leaf = XSTATE_PKEY_BIT; { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(&eax, &ebx, &ecx, &edx); + __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx); if (leaf == XSTATE_PKEY_BIT) { xstate_offset = ebx; diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index fdbb602ecf32..48dc151f8fca 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -18,12 +18,13 @@ * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks * * Compile like this: - * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm */ #define _GNU_SOURCE #define __SANE_USERSPACE_TYPES__ #include <errno.h> +#include <linux/elf.h> #include <linux/futex.h> #include <time.h> #include <sys/time.h> @@ -97,7 +98,7 @@ int tracing_root_ok(void) void tracing_on(void) { #if CONTROL_TRACING > 0 -#define TRACEDIR "/sys/kernel/debug/tracing" +#define TRACEDIR "/sys/kernel/tracing" char pidstr[32]; if (!tracing_root_ok()) @@ -123,7 +124,7 @@ void tracing_off(void) #if CONTROL_TRACING > 0 if (!tracing_root_ok()) return; - cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); + cat_into_file("0", "/sys/kernel/tracing/tracing_on"); #endif } @@ -293,15 +294,6 @@ void pkey_access_deny(int pkey) pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } -/* Failed address bound checks: */ -#ifndef SEGV_BNDERR -# define SEGV_BNDERR 3 -#endif - -#ifndef SEGV_PKUERR -# define SEGV_PKUERR 4 -#endif - static char *si_code_str(int si_code) { if (si_code == SEGV_MAPERR) @@ -475,7 +467,7 @@ int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, ptr, size, orig_prot, pkey); errno = 0; - sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); + sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey); if (errno) { dprintf2("SYS_mprotect_key sret: %d\n", sret); dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); @@ -510,7 +502,7 @@ int alloc_pkey(void) " shadow: 0x%016llx\n", __func__, __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); - if (ret) { + if (ret > 0) { /* clear both the bits: */ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, ~PKEY_MASK); @@ -561,7 +553,6 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); - srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; @@ -1278,6 +1269,78 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) } } +void arch_force_pkey_reg_init(void) +{ +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u64 *buf; + + /* + * All keys should be allocated and set to allow reads and + * writes, so the register should be all 0. If not, just + * skip the test. + */ + if (read_pkey_reg()) + return; + + /* + * Just allocate an absurd about of memory rather than + * doing the XSAVE size enumeration dance. + */ + buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + + /* These __builtins require compiling with -mxsave */ + + /* XSAVE to build a valid buffer: */ + __builtin_ia32_xsave(buf, XSTATE_PKEY); + /* Clear XSTATE_BV[PKRU]: */ + buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; + /* XRSTOR will likely get PKRU back to the init state: */ + __builtin_ia32_xrstor(buf, XSTATE_PKEY); + + munmap(buf, 1*MB); +#endif +} + + +/* + * This is mostly useless on ppc for now. But it will not + * hurt anything and should give some better coverage as + * a long-running test that continually checks the pkey + * register. + */ +void test_pkey_init_state(int *ptr, u16 pkey) +{ + int err; + int allocated_pkeys[NR_PKEYS] = {0}; + int nr_allocated_pkeys = 0; + int i; + + for (i = 0; i < NR_PKEYS; i++) { + int new_pkey = alloc_pkey(); + + if (new_pkey < 0) + continue; + allocated_pkeys[nr_allocated_pkeys++] = new_pkey; + } + + dprintf3("%s()::%d\n", __func__, __LINE__); + + arch_force_pkey_reg_init(); + + /* + * Loop for a bit, hoping to get exercise the kernel + * context switch code. + */ + for (i = 0; i < 1000000; i++) + read_pkey_reg(); + + for (i = 0; i < nr_allocated_pkeys; i++) { + err = sys_pkey_free(allocated_pkeys[i]); + pkey_assert(!err); + read_pkey_reg(); /* for shadow checking */ + } +} + /* * pkey 0 is special. It is allocated by default, so you do not * have to call pkey_alloc() to use it first. Make sure that it @@ -1449,6 +1512,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); + /* + * Reset the shadow, assuming that the above mprotect() + * correctly changed PKRU, but to an unknown value since + * the actual allocated pkey is unknown. + */ + shadow_pkey_reg = __read_pkey_reg(); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ @@ -1472,6 +1542,129 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); } +#if defined(__i386__) || defined(__x86_64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + u32 new_pkru; + pid_t child; + int status, ret; + int pkey_offset = pkey_reg_xstate_offset(); + size_t xsave_size = cpu_max_xsave_size(); + void *xsave; + u32 *pkey_register; + u64 *xstate_bv; + struct iovec iov; + + new_pkru = ~read_pkey_reg(); + /* Don't make PROT_EXEC mappings inaccessible */ + new_pkru &= ~3; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkru) + exit(1); + + /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ + raise(SIGSTOP); + + if (__read_pkey_reg() != 0) + exit(1); + + /* Stop and allow the tracer to examine PKRU */ + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + xsave = (void *)malloc(xsave_size); + pkey_assert(xsave > 0); + + /* Modify the PKRU register directly */ + iov.iov_base = xsave; + iov.iov_len = xsave_size; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + pkey_register = (u32 *)(xsave + pkey_offset); + pkey_assert(*pkey_register == read_pkey_reg()); + + *pkey_register = new_pkru; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Clear the PKRU bit from XSTATE_BV */ + xstate_bv = (u64 *)(xsave + 512); + *xstate_bv &= ~(1 << 9); + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value go to 0 */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); + free(xsave); +} +#endif + void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) { int size = PAGE_SIZE; @@ -1482,7 +1675,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) return; } - sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); + sret = syscall(__NR_pkey_mprotect, ptr, size, PROT_READ, pkey); pkey_assert(sret < 0); } @@ -1502,10 +1695,14 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_implicit_mprotect_exec_only_memory, test_mprotect_with_pkey_0, test_ptrace_of_child, + test_pkey_init_state, test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, test_pkey_alloc_free_attach_pkey0, +#if defined(__i386__) || defined(__x86_64__) + test_ptrace_modifies_pkru, +#endif }; void run_tests_once(void) @@ -1552,6 +1749,8 @@ int main(void) int nr_iterations = 22; int pkeys_supported = is_pkeys_supported(); + srand((unsigned int)time(NULL)); + setup_handlers(); printf("has pkeys: %d\n", pkeys_supported); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh new file mode 100755 index 000000000000..3157204b9047 --- /dev/null +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -0,0 +1,452 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Please run as root + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +count_total=0 +count_pass=0 +count_fail=0 +count_skip=0 +exitcode=0 + +usage() { + cat <<EOF +usage: ${BASH_SOURCE[0]:-$0} [ options ] + + -a: run all tests, including extra ones (other than destructive ones) + -t: specify specific categories to tests to run + -h: display this message + -n: disable TAP output + -d: run destructive tests + +The default behavior is to run required tests only. If -a is specified, +will run all tests. + +Alternatively, specific groups tests can be run by passing a string +to the -t argument containing one or more of the following categories +separated by spaces: +- mmap + tests for mmap(2) +- gup_test + tests for gup +- userfaultfd + tests for userfaultfd(2) +- compaction + a test for the patch "Allow compaction of unevictable pages" +- mlock + tests for mlock(2) +- mremap + tests for mremap(2) +- hugevm + tests for very large virtual address space +- vmalloc + vmalloc smoke tests +- hmm + hmm smoke tests +- madv_populate + test memadvise(2) MADV_POPULATE_{READ,WRITE} options +- memfd_secret + test memfd_secret(2) +- process_mrelease + test process_mrelease(2) +- ksm + ksm tests that do not require >=2 NUMA nodes +- ksm_numa + ksm tests that require >=2 NUMA nodes +- pkey + memory protection key tests +- soft_dirty + test soft dirty page bit semantics +- pagemap + test pagemap_scan IOCTL +- cow + test copy-on-write semantics +- thp + test transparent huge pages +- hugetlb + test hugetlbfs huge pages +- migration + invoke move_pages(2) to exercise the migration entry code + paths in the kernel +- mkdirty + test handling of code that might set PTE/PMD dirty in + read-only VMAs +- mdwe + test prctl(PR_SET_MDWE, ...) + +example: ./run_vmtests.sh -t "hmm mmap ksm" +EOF + exit 0 +} + +RUN_ALL=false +RUN_DESTRUCTIVE=false +TAP_PREFIX="# " + +while getopts "aht:n" OPT; do + case ${OPT} in + "a") RUN_ALL=true ;; + "h") usage ;; + "t") VM_SELFTEST_ITEMS=${OPTARG} ;; + "n") TAP_PREFIX= ;; + "d") RUN_DESTRUCTIVE=true ;; + esac +done +shift $((OPTIND -1)) + +# default behavior: run all tests +VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default} + +test_selected() { + if [ "$VM_SELFTEST_ITEMS" == "default" ]; then + # If no VM_SELFTEST_ITEMS are specified, run all tests + return 0 + fi + # If test selected argument is one of the test items + if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then + return 0 + else + return 1 + fi +} + +run_gup_matrix() { + # -t: thp=on, -T: thp=off, -H: hugetlb=on + local hugetlb_mb=$(( needmem_KB / 1024 )) + + for huge in -t -T "-H -m $hugetlb_mb"; do + # -u: gup-fast, -U: gup-basic, -a: pin-fast, -b: pin-basic, -L: pin-longterm + for test_cmd in -u -U -a -b -L; do + # -w: write=1, -W: write=0 + for write in -w -W; do + # -S: shared + for share in -S " "; do + # -n: How many pages to fetch together? 512 is special + # because it's default thp size (or 2M on x86), 123 to + # just test partial gup when hit a huge in whatever form + for num in "-n 1" "-n 512" "-n 123"; do + CATEGORY="gup_test" run_test ./gup_test \ + $huge $test_cmd $write $share $num + done + done + done + done + done +} + +# get huge pagesize and freepages from /proc/meminfo +while read -r name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs="$size" + fi + if [ "$name" = "Hugepagesize:" ]; then + hpgsize_KB="$size" + fi +done < /proc/meminfo + +# Simple hugetlbfs tests have a hardcoded minimum requirement of +# huge pages totaling 256MB (262144KB) in size. The userfaultfd +# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take +# both of these requirements into account and attempt to increase +# number of huge pages available. +nr_cpus=$(nproc) +uffd_min_KB=$((hpgsize_KB * nr_cpus * 2)) +hugetlb_min_KB=$((256 * 1024)) +if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then + needmem_KB=$uffd_min_KB +else + needmem_KB=$hugetlb_min_KB +fi + +# set proper nr_hugepages +if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then + nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) + needpgs=$((needmem_KB / hpgsize_KB)) + tries=2 + while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do + lackpgs=$((needpgs - freepgs)) + echo 3 > /proc/sys/vm/drop_caches + if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then + echo "Please run this test as root" + exit $ksft_skip + fi + while read -r name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs=$size + fi + done < /proc/meminfo + tries=$((tries - 1)) + done + if [ "$freepgs" -lt "$needpgs" ]; then + printf "Not enough huge pages available (%d < %d)\n" \ + "$freepgs" "$needpgs" + fi +else + echo "no hugetlbfs support in kernel?" + exit 1 +fi + +# filter 64bit architectures +ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" +if [ -z "$ARCH" ]; then + ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') +fi +VADDR64=0 +echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 + +tap_prefix() { + sed -e "s/^/${TAP_PREFIX}/" +} + +tap_output() { + if [[ ! -z "$TAP_PREFIX" ]]; then + read str + echo $str + fi +} + +pretty_name() { + echo "$*" | sed -e 's/^\(bash \)\?\.\///' +} + +# Usage: run_test [test binary] [arbitrary test arguments...] +run_test() { + if test_selected ${CATEGORY}; then + # On memory constrainted systems some tests can fail to allocate hugepages. + # perform some cleanup before the test for a higher success rate. + if [ ${CATEGORY} == "thp" ] | [ ${CATEGORY} == "hugetlb" ]; then + echo 3 > /proc/sys/vm/drop_caches + sleep 2 + echo 1 > /proc/sys/vm/compact_memory + sleep 2 + fi + + local test=$(pretty_name "$*") + local title="running $*" + local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) + printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" | tap_prefix + + ("$@" 2>&1) | tap_prefix + local ret=${PIPESTATUS[0]} + count_total=$(( count_total + 1 )) + if [ $ret -eq 0 ]; then + count_pass=$(( count_pass + 1 )) + echo "[PASS]" | tap_prefix + echo "ok ${count_total} ${test}" | tap_output + elif [ $ret -eq $ksft_skip ]; then + count_skip=$(( count_skip + 1 )) + echo "[SKIP]" | tap_prefix + echo "ok ${count_total} ${test} # SKIP" | tap_output + exitcode=$ksft_skip + else + count_fail=$(( count_fail + 1 )) + echo "[FAIL]" | tap_prefix + echo "not ok ${count_total} ${test} # exit=$ret" | tap_output + exitcode=1 + fi + fi # test_selected +} + +echo "TAP version 13" | tap_output + +CATEGORY="hugetlb" run_test ./hugepage-mmap + +shmmax=$(cat /proc/sys/kernel/shmmax) +shmall=$(cat /proc/sys/kernel/shmall) +echo 268435456 > /proc/sys/kernel/shmmax +echo 4194304 > /proc/sys/kernel/shmall +CATEGORY="hugetlb" run_test ./hugepage-shm +echo "$shmmax" > /proc/sys/kernel/shmmax +echo "$shmall" > /proc/sys/kernel/shmall + +CATEGORY="hugetlb" run_test ./map_hugetlb +CATEGORY="hugetlb" run_test ./hugepage-mremap +CATEGORY="hugetlb" run_test ./hugepage-vmemmap +CATEGORY="hugetlb" run_test ./hugetlb-madvise + +nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) +# For this test, we need one and just one huge page +echo 1 > /proc/sys/vm/nr_hugepages +CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv +CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map +# Restore the previous number of huge pages, since further tests rely on it +echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages + +if test_selected "hugetlb"; then + echo "NOTE: These hugetlb tests provide minimal coverage. Use" | tap_prefix + echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" | tap_prefix + echo " hugetlb regression testing." | tap_prefix +fi + +CATEGORY="mmap" run_test ./map_fixed_noreplace + +if $RUN_ALL; then + run_gup_matrix +else + # get_user_pages_fast() benchmark + CATEGORY="gup_test" run_test ./gup_test -u + # pin_user_pages_fast() benchmark + CATEGORY="gup_test" run_test ./gup_test -a +fi +# Dump pages 0, 19, and 4096, using pin_user_pages: +CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 +CATEGORY="gup_test" run_test ./gup_longterm + +CATEGORY="userfaultfd" run_test ./uffd-unit-tests +uffd_stress_bin=./uffd-stress +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 +# Hugetlb tests require source and destination huge pages. Pass in half +# the size of the free pages we have, which is used for *each*. +half_ufd_size_MB=$((freepgs / 2)) +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16 + +#cleanup +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages + +CATEGORY="compaction" run_test ./compaction_test + +if command -v sudo &> /dev/null; +then + CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit +else + echo "# SKIP ./on-fault-limit" +fi + +CATEGORY="mmap" run_test ./map_populate + +CATEGORY="mlock" run_test ./mlock-random-test + +CATEGORY="mlock" run_test ./mlock2-tests + +CATEGORY="process_mrelease" run_test ./mrelease_test + +CATEGORY="mremap" run_test ./mremap_test + +CATEGORY="hugetlb" run_test ./thuge-gen +CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2 +CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2 +if $RUN_DESTRUCTIVE; then +CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison +fi + +if [ $VADDR64 -ne 0 ]; then + + # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel + # allows high virtual address allocation requests independent + # of platform's physical memory. + + prev_policy=$(cat /proc/sys/vm/overcommit_memory) + echo 1 > /proc/sys/vm/overcommit_memory + CATEGORY="hugevm" run_test ./virtual_address_range + echo $prev_policy > /proc/sys/vm/overcommit_memory + + # va high address boundary switch test + ARCH_ARM64="arm64" + prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages) + if [ "$ARCH" == "$ARCH_ARM64" ]; then + echo 6 > /proc/sys/vm/nr_hugepages + fi + CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh + if [ "$ARCH" == "$ARCH_ARM64" ]; then + echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages + fi +fi # VADDR64 + +# vmalloc stability smoke test +CATEGORY="vmalloc" run_test bash ./test_vmalloc.sh smoke + +CATEGORY="mremap" run_test ./mremap_dontunmap + +CATEGORY="hmm" run_test bash ./test_hmm.sh smoke + +# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests +CATEGORY="madv_populate" run_test ./madv_populate + +(echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix +CATEGORY="memfd_secret" run_test ./memfd_secret + +# KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 +CATEGORY="ksm" run_test ./ksm_tests -H -s 100 +# KSM KSM_MERGE_TIME test with size of 100 +CATEGORY="ksm" run_test ./ksm_tests -P -s 100 +# KSM MADV_MERGEABLE test with 10 identical pages +CATEGORY="ksm" run_test ./ksm_tests -M -p 10 +# KSM unmerge test +CATEGORY="ksm" run_test ./ksm_tests -U +# KSM test with 10 zero pages and use_zero_pages = 0 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0 +# KSM test with 10 zero pages and use_zero_pages = 1 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 1 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 0 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 + +CATEGORY="ksm" run_test ./ksm_functional_tests + +# protection_keys tests +nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) +if [ -x ./protection_keys_32 ] +then + CATEGORY="pkey" run_test ./protection_keys_32 +fi + +if [ -x ./protection_keys_64 ] +then + CATEGORY="pkey" run_test ./protection_keys_64 +fi +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages + +if [ -x ./soft-dirty ] +then + CATEGORY="soft_dirty" run_test ./soft-dirty +fi + +CATEGORY="pagemap" run_test ./pagemap_ioctl + +# COW tests +CATEGORY="cow" run_test ./cow + +CATEGORY="thp" run_test ./khugepaged + +CATEGORY="thp" run_test ./khugepaged -s 2 + +CATEGORY="thp" run_test ./transhuge-stress -d 20 + +# Try to create XFS if not provided +if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then + if test_selected "thp"; then + if grep xfs /proc/filesystems &>/dev/null; then + XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX) + SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX) + truncate -s 314572800 ${XFS_IMG} + mkfs.xfs -q ${XFS_IMG} + mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + MOUNTED_XFS=1 + fi + fi +fi + +CATEGORY="thp" run_test ./split_huge_page_test ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + +if [ -n "${MOUNTED_XFS}" ]; then + umount ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + rmdir ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + rm -f ${XFS_IMG} +fi + +CATEGORY="migration" run_test ./migration + +CATEGORY="mkdirty" run_test ./mkdirty + +CATEGORY="mdwe" run_test ./mdwe_test + +echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" | tap_prefix +echo "1..${count_total}" | tap_output + +exit $exitcode diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c new file mode 100644 index 000000000000..f2babec79bb6 --- /dev/null +++ b/tools/testing/selftests/mm/seal_elf.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <sys/mman.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <stdbool.h> +#include "../kselftest.h" +#include <syscall.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/vfs.h> +#include <sys/stat.h> + +/* + * need those definition for manually build using gcc. + * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 seal_elf.c -o seal_elf + */ +#define FAIL_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + +#define SKIP_TEST_IF_FALSE(c) do {\ + if (!(c)) {\ + ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\ + goto test_end;\ + } \ + } \ + while (0) + + +#define TEST_END_CHECK() {\ + ksft_test_result_pass("%s\n", __func__);\ + return;\ +test_end:\ + return;\ +} + +#ifndef u64 +#define u64 unsigned long long +#endif + +/* + * define sys_xyx to call syscall directly. + */ +static int sys_mseal(void *start, size_t len) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mseal, start, len, 0); + return sret; +} + +static void *sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, unsigned long offset) +{ + void *sret; + + errno = 0; + sret = (void *) syscall(__NR_mmap, addr, len, prot, + flags, fd, offset); + return sret; +} + +static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot) +{ + int sret; + + errno = 0; + sret = syscall(__NR_mprotect, ptr, size, prot); + return sret; +} + +static bool seal_support(void) +{ + int ret; + void *ptr; + unsigned long page_size = getpagesize(); + + ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == (void *) -1) + return false; + + ret = sys_mseal(ptr, page_size); + if (ret < 0) + return false; + + return true; +} + +const char somestr[4096] = {"READONLY"}; + +static void test_seal_elf(void) +{ + int ret; + FILE *maps; + char line[512]; + uintptr_t addr_start, addr_end; + char prot[5]; + char filename[256]; + unsigned long page_size = getpagesize(); + unsigned long long ptr = (unsigned long long) somestr; + char *somestr2 = (char *)somestr; + + /* + * Modify the protection of readonly somestr + */ + if (((unsigned long long)ptr % page_size) != 0) + ptr = (unsigned long long)ptr & ~(page_size - 1); + + ksft_print_msg("somestr = %s\n", somestr); + ksft_print_msg("change protection to rw\n"); + ret = sys_mprotect((void *)ptr, page_size, PROT_READ|PROT_WRITE); + FAIL_TEST_IF_FALSE(!ret); + *somestr2 = 'A'; + ksft_print_msg("somestr is modified to: %s\n", somestr); + ret = sys_mprotect((void *)ptr, page_size, PROT_READ); + FAIL_TEST_IF_FALSE(!ret); + + maps = fopen("/proc/self/maps", "r"); + FAIL_TEST_IF_FALSE(maps); + + /* + * apply sealing to elf binary + */ + while (fgets(line, sizeof(line), maps)) { + if (sscanf(line, "%lx-%lx %4s %*x %*x:%*x %*u %255[^\n]", + &addr_start, &addr_end, prot, filename) == 4) { + if (strlen(filename)) { + /* + * seal the mapping if read only. + */ + if (strstr(prot, "r-")) { + ret = sys_mseal((void *)addr_start, addr_end - addr_start); + FAIL_TEST_IF_FALSE(!ret); + ksft_print_msg("sealed: %lx-%lx %s %s\n", + addr_start, addr_end, prot, filename); + if ((uintptr_t) somestr >= addr_start && + (uintptr_t) somestr <= addr_end) + ksft_print_msg("mapping for somestr found\n"); + } + } + } + } + fclose(maps); + + ret = sys_mprotect((void *)ptr, page_size, PROT_READ | PROT_WRITE); + FAIL_TEST_IF_FALSE(ret < 0); + ksft_print_msg("somestr is sealed, mprotect is rejected\n"); + + TEST_END_CHECK(); +} + +int main(int argc, char **argv) +{ + bool test_seal = seal_support(); + + ksft_print_header(); + ksft_print_msg("pid=%d\n", getpid()); + + if (!test_seal) + ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n"); + + ksft_set_plan(1); + + test_seal_elf(); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings new file mode 100644 index 000000000000..a953c96aa16e --- /dev/null +++ b/tools/testing/selftests/mm/settings @@ -0,0 +1 @@ +timeout=180 diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c new file mode 100644 index 000000000000..bdfa5d085f00 --- /dev/null +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <string.h> +#include <stdbool.h> +#include <fcntl.h> +#include <stdint.h> +#include <malloc.h> +#include <sys/mman.h> +#include "../kselftest.h" +#include "vm_util.h" + +#define PAGEMAP_FILE_PATH "/proc/self/pagemap" +#define TEST_ITERATIONS 10000 + +static void test_simple(int pagemap_fd, int pagesize) +{ + int i; + char *map; + + map = aligned_alloc(pagesize, pagesize); + if (!map) + ksft_exit_fail_msg("mmap failed\n"); + + clear_softdirty(); + + for (i = 0 ; i < TEST_ITERATIONS; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + + clear_softdirty(); + } + free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +static void test_vma_reuse(int pagemap_fd, int pagesize) +{ + char *map, *map2; + + map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // The kernel always marks new regions as soft dirty + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s dirty bit of allocated page\n", __func__); + + clear_softdirty(); + munmap(map, pagesize); + + map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // Dirty bit is set for new regions even if they are reused + if (map == map2) + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s dirty bit of reused address page\n", __func__); + else + ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); + + munmap(map2, pagesize); +} + +static void test_hugepage(int pagemap_fd, int pagesize) +{ + char *map; + int i, ret; + size_t hpage_len = read_pmd_pagesize(); + + if (!hpage_len) + ksft_exit_fail_msg("Reading PMD pagesize failed"); + + map = memalign(hpage_len, hpage_len); + if (!map) + ksft_exit_fail_msg("memalign failed\n"); + + ret = madvise(map, hpage_len, MADV_HUGEPAGE); + if (ret) + ksft_exit_fail_msg("madvise failed %d\n", ret); + + for (i = 0; i < hpage_len; i++) + map[i] = (char)i; + + if (check_huge_anon(map, 1, hpage_len)) { + ksft_test_result_pass("Test %s huge page allocation\n", __func__); + + clear_softdirty(); + for (i = 0 ; i < TEST_ITERATIONS ; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + clear_softdirty(); + } + + ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); + } else { + // hugepage allocation failed. skip these tests + ksft_test_result_skip("Test %s huge page allocation\n", __func__); + ksft_test_result_skip("Test %s huge page dirty bit\n", __func__); + } + free(map); +} + +static void test_mprotect(int pagemap_fd, int pagesize, bool anon) +{ + const char *type[] = {"file", "anon"}; + const char *fname = "./soft-dirty-test-file"; + int test_fd; + char *map; + + if (anon) { + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (!map) + ksft_exit_fail_msg("anon mmap failed\n"); + } else { + test_fd = open(fname, O_RDWR | O_CREAT, 0664); + if (test_fd < 0) { + ksft_test_result_skip("Test %s open() file failed\n", __func__); + return; + } + unlink(fname); + ftruncate(test_fd, pagesize); + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_SHARED, test_fd, 0); + if (!map) + ksft_exit_fail_msg("file mmap failed\n"); + } + + *map = 1; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s dirty bit of new written page\n", + __func__, type[anon]); + clear_softdirty(); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after clear_refs\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RO\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ|PROT_WRITE); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RW\n", + __func__, type[anon]); + *map = 2; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s soft-dirty after rewritten\n", + __func__, type[anon]); + + munmap(map, pagesize); + + if (!anon) + close(test_fd); +} + +static void test_mprotect_anon(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, true); +} + +static void test_mprotect_file(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, false); +} + +int main(int argc, char **argv) +{ + int pagemap_fd; + int pagesize; + + ksft_print_header(); + ksft_set_plan(15); + + pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); + + pagesize = getpagesize(); + + test_simple(pagemap_fd, pagesize); + test_vma_reuse(pagemap_fd, pagesize); + test_hugepage(pagemap_fd, pagesize); + test_mprotect_anon(pagemap_fd, pagesize); + test_mprotect_file(pagemap_fd, pagesize); + + close(pagemap_fd); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c new file mode 100644 index 000000000000..d3c7f5fb3e7b --- /dev/null +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual + * address range in a process via <debugfs>/split_huge_pages interface. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <unistd.h> +#include <inttypes.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <malloc.h> +#include <stdbool.h> +#include <time.h> +#include "vm_util.h" +#include "../kselftest.h" + +uint64_t pagesize; +unsigned int pageshift; +uint64_t pmd_pagesize; + +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define SMAP_PATH "/proc/self/smaps" +#define INPUT_MAX 80 + +#define PID_FMT "%d,0x%lx,0x%lx,%d" +#define PATH_FMT "%s,0x%lx,0x%lx,%d" + +#define PFN_MASK ((1UL<<55)-1) +#define KPF_THP (1UL<<22) + +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +{ + uint64_t paddr; + uint64_t page_flags; + + if (pagemap_file) { + pread(pagemap_file, &paddr, sizeof(paddr), + ((long)vaddr >> pageshift) * sizeof(paddr)); + + if (kpageflags_file) { + pread(kpageflags_file, &page_flags, sizeof(page_flags), + (paddr & PFN_MASK) * sizeof(page_flags)); + + return !!(page_flags & KPF_THP); + } + } + return 0; +} + +static void write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno)); + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + ksft_exit_fail_msg("Write failed\n"); +} + +static void write_debugfs(const char *fmt, ...) +{ + char input[INPUT_MAX]; + int ret; + va_list argp; + + va_start(argp, fmt); + ret = vsnprintf(input, INPUT_MAX, fmt, argp); + va_end(argp); + + if (ret >= INPUT_MAX) + ksft_exit_fail_msg("%s: Debugfs input is too long\n", __func__); + + write_file(SPLIT_DEBUGFS, input, ret + 1); +} + +void split_pmd_thp(void) +{ + char *one_page; + size_t len = 4 * pmd_pagesize; + size_t i; + + one_page = memalign(pmd_pagesize, len); + if (!one_page) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len, 0); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); + + + if (!check_huge_anon(one_page, 0, pmd_pagesize)) + ksft_exit_fail_msg("Still AnonHugePages not split\n"); + + ksft_test_result_pass("Split huge pages successful\n"); + free(one_page); +} + +void split_pte_mapped_thp(void) +{ + char *one_page, *pte_mapped, *pte_mapped2; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + const char *pagemap_template = "/proc/%d/pagemap"; + const char *kpageflags_proc = "/proc/kpageflags"; + char pagemap_proc[255]; + int pagemap_fd; + int kpageflags_fd; + + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) + ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno)); + + pagemap_fd = open(pagemap_proc, O_RDONLY); + if (pagemap_fd == -1) + ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno)); + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + if (kpageflags_fd == -1) + ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno)); + + one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (one_page == MAP_FAILED) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); + + /* remap the first pagesize of first THP */ + pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); + + /* remap the Nth pagesize of Nth THP */ + for (i = 1; i < 4; i++) { + pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, + pagesize, pagesize, + MREMAP_MAYMOVE|MREMAP_FIXED, + pte_mapped + pagesize * i); + if (pte_mapped2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno)); + } + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + + if (thp_size != 4) + ksft_exit_fail_msg("Some THPs are missing during mremap\n"); + + /* split all remapped THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, + (uint64_t)pte_mapped + pagesize * 4, 0); + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) { + if (pte_mapped[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); + + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + } + + if (thp_size) + ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size); + + ksft_test_result_pass("Split PTE-mapped huge pages successful\n"); + munmap(one_page, len); + close(pagemap_fd); + close(kpageflags_fd); +} + +void split_file_backed_thp(void) +{ + int status; + int fd; + ssize_t num_written; + char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; + const char *tmpfs_loc = mkdtemp(tmpfs_template); + char testfile[INPUT_MAX]; + uint64_t pgoff_start = 0, pgoff_end = 1024; + + ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n"); + + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + + if (status) + ksft_exit_fail_msg("Unable to create a tmpfs for testing\n"); + + status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); + if (status >= INPUT_MAX) { + ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); + } + + fd = open(testfile, O_CREAT|O_WRONLY, 0664); + if (fd == -1) { + ksft_perror("Cannot open testing file"); + goto cleanup; + } + + /* write something to the file, so a file-backed THP can be allocated */ + num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); + close(fd); + + if (num_written < 1) { + ksft_perror("Fail to write data to testing file"); + goto cleanup; + } + + /* split the file-backed THP */ + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0); + + status = unlink(testfile); + if (status) { + ksft_perror("Cannot remove testing file"); + goto cleanup; + } + + status = umount(tmpfs_loc); + if (status) { + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc); + } + + status = rmdir(tmpfs_loc); + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno)); + + ksft_print_msg("Please check dmesg for more information\n"); + ksft_test_result_pass("File-backed THP split test done\n"); + return; + +cleanup: + umount(tmpfs_loc); + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Error occurred\n"); +} + +bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template, + const char **thp_fs_loc) +{ + if (xfs_path) { + *thp_fs_loc = xfs_path; + return false; + } + + *thp_fs_loc = mkdtemp(thp_fs_template); + + if (!*thp_fs_loc) + ksft_exit_fail_msg("cannot create temp folder\n"); + + return true; +} + +void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp) +{ + int status; + + if (!created_tmp) + return; + + status = rmdir(thp_fs_loc); + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", + strerror(errno)); +} + +int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, + char **addr) +{ + size_t i; + int __attribute__((unused)) dummy = 0; + + srand(time(NULL)); + + *fd = open(testfile, O_CREAT | O_RDWR, 0664); + if (*fd == -1) + ksft_exit_fail_msg("Failed to create a file at %s\n", testfile); + + for (i = 0; i < fd_size; i++) { + unsigned char byte = (unsigned char)i; + + write(*fd, &byte, sizeof(byte)); + } + close(*fd); + sync(); + *fd = open("/proc/sys/vm/drop_caches", O_WRONLY); + if (*fd == -1) { + ksft_perror("open drop_caches"); + goto err_out_unlink; + } + if (write(*fd, "3", 1) != 1) { + ksft_perror("write to drop_caches"); + goto err_out_unlink; + } + close(*fd); + + *fd = open(testfile, O_RDWR); + if (*fd == -1) { + ksft_perror("Failed to open testfile\n"); + goto err_out_unlink; + } + + *addr = mmap(NULL, fd_size, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); + if (*addr == (char *)-1) { + ksft_perror("cannot mmap"); + goto err_out_close; + } + madvise(*addr, fd_size, MADV_HUGEPAGE); + + for (size_t i = 0; i < fd_size; i++) + dummy += *(*addr + i); + + if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { + ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); + munmap(*addr, fd_size); + close(*fd); + unlink(testfile); + ksft_test_result_skip("Pagecache folio split skipped\n"); + return -2; + } + return 0; +err_out_close: + close(*fd); +err_out_unlink: + unlink(testfile); + ksft_exit_fail_msg("Failed to create large pagecache folios\n"); + return -1; +} + +void split_thp_in_pagecache_to_order(size_t fd_size, int order, const char *fs_loc) +{ + int fd; + char *addr; + size_t i; + char testfile[INPUT_MAX]; + int err = 0; + + err = snprintf(testfile, INPUT_MAX, "%s/test", fs_loc); + + if (err < 0) + ksft_exit_fail_msg("cannot generate right test file name\n"); + + err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr); + if (err) + return; + err = 0; + + write_debugfs(PID_FMT, getpid(), (uint64_t)addr, (uint64_t)addr + fd_size, order); + + for (i = 0; i < fd_size; i++) + if (*(addr + i) != (char)i) { + ksft_print_msg("%lu byte corrupted in the file\n", i); + err = EXIT_FAILURE; + goto out; + } + + if (!check_huge_file(addr, 0, pmd_pagesize)) { + ksft_print_msg("Still FilePmdMapped not split\n"); + err = EXIT_FAILURE; + goto out; + } + +out: + munmap(addr, fd_size); + close(fd); + unlink(testfile); + if (err) + ksft_exit_fail_msg("Split PMD-mapped pagecache folio to order %d failed\n", order); + ksft_test_result_pass("Split PMD-mapped pagecache folio to order %d passed\n", order); +} + +int main(int argc, char **argv) +{ + int i; + size_t fd_size; + char *optional_xfs_path = NULL; + char fs_loc_template[] = "/tmp/thp_fs_XXXXXX"; + const char *fs_loc; + bool created_tmp; + + ksft_print_header(); + + if (geteuid() != 0) { + ksft_print_msg("Please run the benchmark as root\n"); + ksft_finished(); + } + + if (argc > 1) + optional_xfs_path = argv[1]; + + ksft_set_plan(3+9); + + pagesize = getpagesize(); + pageshift = ffs(pagesize) - 1; + pmd_pagesize = read_pmd_pagesize(); + if (!pmd_pagesize) + ksft_exit_fail_msg("Reading PMD pagesize failed\n"); + + fd_size = 2 * pmd_pagesize; + + split_pmd_thp(); + split_pte_mapped_thp(); + split_file_backed_thp(); + + created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, + &fs_loc); + for (i = 8; i >= 0; i--) + split_thp_in_pagecache_to_order(fd_size, i, fs_loc); + cleanup_thp_fs(fs_loc, created_tmp); + + ksft_finished(); + + return 0; +} diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh index 0647b525a625..46e19b5d648d 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/mm/test_hmm.sh @@ -40,25 +40,30 @@ check_test_requirements() load_driver() { - modprobe $DRIVER > /dev/null 2>&1 - if [ $? == 0 ]; then - major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) - mknod /dev/hmm_dmirror0 c $major 0 - mknod /dev/hmm_dmirror1 c $major 1 + if [ $# -eq 0 ]; then + modprobe $DRIVER > /dev/null 2>&1 + else + if [ $# -eq 2 ]; then + modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 + > /dev/null 2>&1 + else + echo "Missing module parameters. Make sure pass"\ + "spm_addr_dev0 and spm_addr_dev1" + usage + fi fi } unload_driver() { modprobe -r $DRIVER > /dev/null 2>&1 - rm -f /dev/hmm_dmirror? } run_smoke() { echo "Running smoke test. Note, this test provides basic coverage." - load_driver + load_driver $1 $2 $(dirname "${BASH_SOURCE[0]}")/hmm-tests unload_driver } @@ -75,6 +80,9 @@ usage() echo "# Smoke testing" echo "./${TEST_NAME}.sh smoke" echo + echo "# Smoke testing with SPM enabled" + echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>" + echo exit 0 } @@ -84,7 +92,7 @@ function run_test() usage else if [ "$1" = "smoke" ]; then - run_smoke + run_smoke $2 $3 else usage fi diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh index 06d2bb109f06..d73b846736f1 100755 --- a/tools/testing/selftests/vm/test_vmalloc.sh +++ b/tools/testing/selftests/mm/test_vmalloc.sh @@ -11,6 +11,7 @@ TEST_NAME="vmalloc" DRIVER="test_${TEST_NAME}" +NUM_CPUS=`grep -c ^processor /proc/cpuinfo` # 1 if fails exitcode=1 @@ -22,9 +23,9 @@ ksft_skip=4 # Static templates for performance, stressing and smoke tests. # Also it is possible to pass any supported parameters manualy. # -PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3" -SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10" -STRESS_PARAM="test_repeat_count=20" +PERF_PARAM="sequential_test_order=1 test_repeat_count=3" +SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10" +STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20" check_test_requirements() { @@ -58,8 +59,8 @@ run_perfformance_check() run_stability_check() { - echo "Run stability tests. In order to stress vmalloc subsystem we run" - echo "all available test cases on all available CPUs simultaneously." + echo "Run stability tests. In order to stress vmalloc subsystem all" + echo "available test cases are run by NUM_CPUS workers simultaneously." echo "It will take time, so be patient." modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 @@ -92,17 +93,17 @@ usage() echo "# Shows help message" echo "./${DRIVER}.sh" echo - echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs" - echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5" + echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5" echo echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " echo "sequential order" - echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 " + echo -n "./${DRIVER}.sh sequential_test_order=1 " echo "run_test_mask=23" echo - echo -n "# Runs all tests on all online CPUs, shuffled order, repeats " + echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats " echo "20 times" - echo "./${DRIVER}.sh test_repeat_count=20" + echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20" echo echo "# Performance analysis" echo "./${DRIVER}.sh performance" diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c new file mode 100644 index 000000000000..a4163438108e --- /dev/null +++ b/tools/testing/selftests/mm/thp_settings.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <fcntl.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "thp_settings.h" + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define MAX_SETTINGS_DEPTH 4 +static struct thp_settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; +static struct thp_settings saved_settings; +static char dev_queue_read_ahead_path[PATH_MAX]; + +static const char * const thp_enabled_strings[] = { + "never", + "always", + "inherit", + "madvise", + NULL +}; + +static const char * const thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +static const char * const shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); + return 0; + } + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); + return 0; + } + + return (unsigned int) numwritten; +} + +const unsigned long read_num(const char *path) +{ + char buf[21]; + + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file()"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +void write_num(const char *path, unsigned long num) +{ + char buf[21]; + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +int thp_read_string(const char *name, const char * const strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +void thp_write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +const unsigned long thp_read_num(const char *name) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return read_num(path); +} + +void thp_write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + write_num(path, num); +} + +void thp_read_settings(struct thp_settings *settings) +{ + unsigned long orders = thp_supported_orders(); + char path[PATH_MAX]; + int i; + + *settings = (struct thp_settings) { + .thp_enabled = thp_read_string("enabled", thp_enabled_strings), + .thp_defrag = thp_read_string("defrag", thp_defrag_strings), + .shmem_enabled = + thp_read_string("shmem_enabled", shmem_enabled_strings), + .use_zero_page = thp_read_num("use_zero_page"), + }; + settings->khugepaged = (struct khugepaged_settings) { + .defrag = thp_read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + thp_read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + thp_read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = thp_read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = thp_read_num("khugepaged/pages_to_scan"), + }; + if (dev_queue_read_ahead_path[0]) + settings->read_ahead_kb = read_num(dev_queue_read_ahead_path); + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & orders)) { + settings->hugepages[i].enabled = THP_NEVER; + continue; + } + snprintf(path, PATH_MAX, "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + settings->hugepages[i].enabled = + thp_read_string(path, thp_enabled_strings); + } +} + +void thp_write_settings(struct thp_settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + unsigned long orders = thp_supported_orders(); + char path[PATH_MAX]; + int enabled; + int i; + + thp_write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + thp_write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + thp_write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + thp_write_num("use_zero_page", settings->use_zero_page); + + thp_write_num("khugepaged/defrag", khugepaged->defrag); + thp_write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + thp_write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + thp_write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + thp_write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + thp_write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + thp_write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (dev_queue_read_ahead_path[0]) + write_num(dev_queue_read_ahead_path, settings->read_ahead_kb); + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & orders)) + continue; + snprintf(path, PATH_MAX, "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + enabled = settings->hugepages[i].enabled; + thp_write_string(path, thp_enabled_strings[enabled]); + } +} + +struct thp_settings *thp_current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +void thp_push_settings(struct thp_settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + thp_write_settings(thp_current_settings()); +} + +void thp_pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + thp_write_settings(thp_current_settings()); +} + +void thp_restore_settings(void) +{ + thp_write_settings(&saved_settings); +} + +void thp_save_settings(void) +{ + thp_read_settings(&saved_settings); +} + +void thp_set_read_ahead_path(char *path) +{ + if (!path) { + dev_queue_read_ahead_path[0] = '\0'; + return; + } + + strncpy(dev_queue_read_ahead_path, path, + sizeof(dev_queue_read_ahead_path)); + dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; +} + +unsigned long thp_supported_orders(void) +{ + unsigned long orders = 0; + char path[PATH_MAX]; + char buf[256]; + int ret; + int i; + + for (i = 0; i < NR_ORDERS; i++) { + ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + ret = read_file(path, buf, sizeof(buf)); + if (ret) + orders |= 1UL << i; + } + + return orders; +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h new file mode 100644 index 000000000000..71cbff05f4c7 --- /dev/null +++ b/tools/testing/selftests/mm/thp_settings.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __THP_SETTINGS_H__ +#define __THP_SETTINGS_H__ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +enum thp_enabled { + THP_NEVER, + THP_ALWAYS, + THP_INHERIT, + THP_MADVISE, +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +#define NR_ORDERS 20 + +struct hugepages_settings { + enum thp_enabled enabled; +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct thp_settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool use_zero_page; + struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; + struct hugepages_settings hugepages[NR_ORDERS]; +}; + +int read_file(const char *path, char *buf, size_t buflen); +int write_file(const char *path, const char *buf, size_t buflen); +const unsigned long read_num(const char *path); +void write_num(const char *path, unsigned long num); + +int thp_read_string(const char *name, const char * const strings[]); +void thp_write_string(const char *name, const char *val); +const unsigned long thp_read_num(const char *name); +void thp_write_num(const char *name, unsigned long num); + +void thp_write_settings(struct thp_settings *settings); +void thp_read_settings(struct thp_settings *settings); +struct thp_settings *thp_current_settings(void); +void thp_push_settings(struct thp_settings *settings); +void thp_pop_settings(void); +void thp_restore_settings(void); +void thp_save_settings(void); + +void thp_set_read_ahead_path(char *path); +unsigned long thp_supported_orders(void); + +#endif /* __THP_SETTINGS_H__ */ diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c new file mode 100644 index 000000000000..ea7fd8fe2876 --- /dev/null +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Test selecting other page sizes for mmap/shmget. + + Before running this huge pages for each huge page size must have been + reserved. + For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must + be used. 1GB wouldn't be tested if it isn't available. + Also shmmax must be increased. + And you need to run as root to work around some weird permissions in shm. + And nothing using huge pages should run in parallel. + When the program aborts you may need to clean up the shm segments with + ipcrm -m by hand, like this + sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m + (warning this will remove all if someone else uses them) */ + +#define _GNU_SOURCE 1 +#include <sys/mman.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/stat.h> +#include <glob.h> +#include <assert.h> +#include <unistd.h> +#include <stdarg.h> +#include <string.h> +#include "vm_util.h" +#include "../kselftest.h" + +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#define MAP_HUGE_SHIFT 26 +#define MAP_HUGE_MASK 0x3f +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif + +#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ +#define SHM_HUGE_SHIFT 26 +#define SHM_HUGE_MASK 0x3f +#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) +#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) + +#define NUM_PAGESIZES 5 +#define NUM_PAGES 4 + +unsigned long page_sizes[NUM_PAGESIZES]; +int num_page_sizes; + +int ilog2(unsigned long v) +{ + int l = 0; + while ((1UL << l) < v) + l++; + return l; +} + +void show(unsigned long ps) +{ + char buf[100]; + + if (ps == getpagesize()) + return; + + ksft_print_msg("%luMB: ", ps >> 20); + + fflush(stdout); + snprintf(buf, sizeof buf, + "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); + system(buf); +} + +unsigned long read_sysfs(int warn, char *fmt, ...) +{ + char *line = NULL; + size_t linelen = 0; + char buf[100]; + FILE *f; + va_list ap; + unsigned long val = 0; + + va_start(ap, fmt); + vsnprintf(buf, sizeof buf, fmt, ap); + va_end(ap); + + f = fopen(buf, "r"); + if (!f) { + if (warn) + ksft_print_msg("missing %s\n", buf); + return 0; + } + if (getline(&line, &linelen, f) > 0) { + sscanf(line, "%lu", &val); + } + fclose(f); + free(line); + return val; +} + +unsigned long read_free(unsigned long ps) +{ + return read_sysfs(ps != getpagesize(), + "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); +} + +void test_mmap(unsigned long size, unsigned flags) +{ + char *map; + unsigned long before, after; + + before = read_free(size); + map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); + + memset(map, 0xff, size*NUM_PAGES); + after = read_free(size); + + show(size); + ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + "%s mmap\n", __func__); + + if (munmap(map, size * NUM_PAGES)) + ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno)); +} + +void test_shmget(unsigned long size, unsigned flags) +{ + int id; + unsigned long before, after; + struct shm_info i; + char *map; + + before = read_free(size); + id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); + if (id < 0) { + if (errno == EPERM) { + ksft_test_result_skip("shmget requires root privileges: %s\n", + strerror(errno)); + return; + } + ksft_exit_fail_msg("shmget: %s\n", strerror(errno)); + } + + if (shmctl(id, SHM_INFO, (void *)&i) < 0) + ksft_exit_fail_msg("shmctl: %s\n", strerror(errno)); + + map = shmat(id, NULL, 0600); + if (map == MAP_FAILED) + ksft_exit_fail_msg("shmat: %s\n", strerror(errno)); + + shmctl(id, IPC_RMID, NULL); + + memset(map, 0xff, size*NUM_PAGES); + after = read_free(size); + + show(size); + ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + "%s: mmap\n", __func__); + if (shmdt(map)) + ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno)); +} + +void find_pagesizes(void) +{ + unsigned long largest = getpagesize(); + int i; + glob_t g; + + glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); + assert(g.gl_pathc <= NUM_PAGESIZES); + for (i = 0; (i < g.gl_pathc) && (num_page_sizes < NUM_PAGESIZES); i++) { + sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", + &page_sizes[num_page_sizes]); + page_sizes[num_page_sizes] <<= 10; + ksft_print_msg("Found %luMB\n", page_sizes[i] >> 20); + + if (page_sizes[num_page_sizes] > largest) + largest = page_sizes[i]; + + if (read_free(page_sizes[num_page_sizes]) >= NUM_PAGES) + num_page_sizes++; + else + ksft_print_msg("SKIP for size %lu MB as not enough huge pages, need %u\n", + page_sizes[num_page_sizes] >> 20, NUM_PAGES); + } + globfree(&g); + + if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) + ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax", + largest * NUM_PAGES); + +#if defined(__x86_64__) + if (largest != 1U<<30) { + ksft_exit_fail_msg("No GB pages available on x86-64\n" + "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); + } +#endif +} + +int main(void) +{ + unsigned default_hps = default_huge_page_size(); + int i; + + ksft_print_header(); + + find_pagesizes(); + + if (!num_page_sizes) + ksft_finished(); + + ksft_set_plan(2 * num_page_sizes + 3); + + for (i = 0; i < num_page_sizes; i++) { + unsigned long ps = page_sizes[i]; + int arg = ilog2(ps) << MAP_HUGE_SHIFT; + + ksft_print_msg("Testing %luMB mmap with shift %x\n", ps >> 20, arg); + test_mmap(ps, MAP_HUGETLB | arg); + } + + ksft_print_msg("Testing default huge mmap\n"); + test_mmap(default_hps, MAP_HUGETLB); + + ksft_print_msg("Testing non-huge shmget\n"); + test_shmget(getpagesize(), 0); + + for (i = 0; i < num_page_sizes; i++) { + unsigned long ps = page_sizes[i]; + int arg = ilog2(ps) << SHM_HUGE_SHIFT; + ksft_print_msg("Testing %luMB shmget with shift %x\n", ps >> 20, arg); + test_shmget(ps, SHM_HUGETLB | arg); + } + + ksft_print_msg("default huge shmget\n"); + test_shmget(default_hps, SHM_HUGETLB); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c new file mode 100644 index 000000000000..68201192e37c --- /dev/null +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -0,0 +1,138 @@ +/* + * Stress test for transparent huge pages, memory compaction and migration. + * + * Authors: Konstantin Khlebnikov <koct9i@gmail.com> + * + * This is free and unencumbered software released into the public domain. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <err.h> +#include <time.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <sys/mman.h> +#include "vm_util.h" +#include "../kselftest.h" + +int backing_fd = -1; +int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; +#define PROT_RW (PROT_READ | PROT_WRITE) + +int main(int argc, char **argv) +{ + size_t ram, len; + void *ptr, *p; + struct timespec start, a, b; + int i = 0; + char *name = NULL; + double s; + uint8_t *map; + size_t map_len; + int pagemap_fd; + int duration = 0; + + ksft_print_header(); + + ram = sysconf(_SC_PHYS_PAGES); + if (ram > SIZE_MAX / psize() / 4) + ram = SIZE_MAX / 4; + else + ram *= psize(); + len = ram; + + while (++i < argc) { + if (!strcmp(argv[i], "-h")) + ksft_exit_fail_msg("usage: %s [-f <filename>] [-d <duration>] [size in MiB]\n", + argv[0]); + else if (!strcmp(argv[i], "-f")) + name = argv[++i]; + else if (!strcmp(argv[i], "-d")) + duration = atoi(argv[++i]); + else + len = atoll(argv[i]) << 20; + } + + ksft_set_plan(1); + + if (name) { + backing_fd = open(name, O_RDWR); + if (backing_fd == -1) + ksft_exit_fail_msg("open %s\n", name); + mmap_flags = MAP_SHARED; + } + + warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" + " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, + ram >> (20 + HPAGE_SHIFT - pshift() - 1)); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("open pagemap\n"); + + len -= len % HPAGE_SIZE; + ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); + if (ptr == MAP_FAILED) + ksft_exit_fail_msg("initial mmap"); + ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; + + if (madvise(ptr, len, MADV_HUGEPAGE)) + ksft_exit_fail_msg("MADV_HUGEPAGE"); + + map_len = ram >> (HPAGE_SHIFT - 1); + map = malloc(map_len); + if (!map) + ksft_exit_fail_msg("map malloc\n"); + + clock_gettime(CLOCK_MONOTONIC, &start); + + while (1) { + int nr_succeed = 0, nr_failed = 0, nr_pages = 0; + + memset(map, 0, map_len); + + clock_gettime(CLOCK_MONOTONIC, &a); + for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { + int64_t pfn; + + pfn = allocate_transhuge(p, pagemap_fd); + + if (pfn < 0) { + nr_failed++; + } else { + size_t idx = pfn >> (HPAGE_SHIFT - pshift()); + + nr_succeed++; + if (idx >= map_len) { + map = realloc(map, idx + 1); + if (!map) + ksft_exit_fail_msg("map realloc\n"); + memset(map + map_len, 0, idx + 1 - map_len); + map_len = idx + 1; + } + if (!map[idx]) + nr_pages++; + map[idx] = 1; + } + + /* split transhuge page, keep last page */ + if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED)) + ksft_exit_fail_msg("MADV_DONTNEED"); + } + clock_gettime(CLOCK_MONOTONIC, &b); + s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; + + ksft_print_msg("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" + "%4d succeed, %4d failed, %4d different pages\n", + s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), + nr_succeed, nr_failed, nr_pages); + + if (duration > 0 && b.tv_sec - start.tv_sec >= duration) { + ksft_test_result_pass("Completed\n"); + ksft_finished(); + } + } +} diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c new file mode 100644 index 000000000000..7ad6ba660c7d --- /dev/null +++ b/tools/testing/selftests/mm/uffd-common.c @@ -0,0 +1,718 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd tests util functions + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ + +#include "uffd-common.h" + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +volatile bool test_uffdio_copy_eexist = true; +unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; +char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +int uffd = -1, uffd_flags, finished, *pipefd, test_type; +bool map_shared; +bool test_uffdio_wp = true; +unsigned long long *count_verify; +uffd_test_ops_t *uffd_test_ops; +uffd_test_case_ops_t *uffd_test_case_ops; +atomic_bool ready_for_fork; + +static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) +{ + unsigned int memfd_flags = 0; + int mem_fd; + + if (hugetlb) + memfd_flags = MFD_HUGETLB; + mem_fd = memfd_create("uffd-test", memfd_flags); + if (mem_fd < 0) + err("memfd_create"); + if (ftruncate(mem_fd, mem_size)) + err("ftruncate"); + if (fallocate(mem_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + mem_size)) + err("fallocate"); + + return mem_fd; +} + +static void anon_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + +static int anon_allocate_area(void **alloc_area, bool is_src) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (*alloc_area == MAP_FAILED) { + *alloc_area = NULL; + return -errno; + } + return 0; +} + +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ +} + +static void hugetlb_release_pages(char *rel_area) +{ + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } +} + +static int hugetlb_allocate_area(void **alloc_area, bool is_src) +{ + off_t size = nr_pages * page_size; + off_t offset = is_src ? 0 : size; + void *area_alias = NULL; + char **alloc_area_alias; + int mem_fd = uffd_mem_fd_create(size * 2, true); + + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (is_src ? 0 : MAP_NORESERVE), + mem_fd, offset); + if (*alloc_area == MAP_FAILED) { + *alloc_area = NULL; + return -errno; + } + + if (map_shared) { + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, mem_fd, offset); + if (area_alias == MAP_FAILED) + return -errno; + } + + if (is_src) { + alloc_area_alias = &area_src_alias; + } else { + alloc_area_alias = &area_dst_alias; + } + if (area_alias) + *alloc_area_alias = area_alias; + + close(mem_fd); + return 0; +} + +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + if (!map_shared) + return; + + *start = (unsigned long) area_dst_alias + offset; +} + +static void shmem_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); +} + +static int shmem_allocate_area(void **alloc_area, bool is_src) +{ + void *area_alias = NULL; + size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize(); + unsigned long offset = is_src ? 0 : bytes; + char *p = NULL, *p_alias = NULL; + int mem_fd = uffd_mem_fd_create(bytes * 2, false); + + /* TODO: clean this up. Use a static addr is ugly */ + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (*alloc_area == MAP_FAILED) { + *alloc_area = NULL; + return -errno; + } + if (*alloc_area != p) + err("mmap of memfd failed at %p", p); + + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (area_alias == MAP_FAILED) { + munmap(*alloc_area, bytes); + *alloc_area = NULL; + return -errno; + } + if (area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; + + close(mem_fd); + return 0; +} + +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; +} + +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, + read_pmd_pagesize())) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + +struct uffd_test_ops anon_uffd_test_ops = { + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, + .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, +}; + +struct uffd_test_ops shmem_uffd_test_ops = { + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, + .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, +}; + +struct uffd_test_ops hugetlb_uffd_test_ops = { + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, + .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, +}; + +void uffd_stats_report(struct uffd_args *args, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += args[i].missing_faults; + wp_total += args[i].wp_faults; + minor_total += args[i].minor_faults; + } + + printf("userfaults: "); + if (miss_total) { + printf("%llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", args[i].missing_faults); + printf("\b) "); + } + if (wp_total) { + printf("%llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", args[i].wp_faults); + printf("\b) "); + } + if (minor_total) { + printf("%llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", args[i].minor_faults); + printf("\b)"); + } + printf("\n"); +} + +int userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + uffd = uffd_open(UFFD_FLAGS); + if (uffd < 0) + return -1; + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + /* Probably lack of CAP_PTRACE? */ + return -1; + if (uffdio_api.api != UFFD_API) + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); + + *features = uffdio_api.features; + return 0; +} + +static inline void munmap_area(void **area) +{ + if (*area) + if (munmap(*area, nr_pages * page_size)) + err("munmap"); + + *area = NULL; +} + +void uffd_test_ctx_clear(void) +{ + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) + err("close pipefd"); + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) + err("close uffd"); + uffd = -1; + } + + munmap_area((void **)&area_src); + munmap_area((void **)&area_src_alias); + munmap_area((void **)&area_dst); + munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); +} + +int uffd_test_ctx_init(uint64_t features, const char **errmsg) +{ + unsigned long nr, cpu; + int ret; + + if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) { + ret = uffd_test_case_ops->pre_alloc(errmsg); + if (ret) + return ret; + } + + ret = uffd_test_ops->allocate_area((void **)&area_src, true); + ret |= uffd_test_ops->allocate_area((void **)&area_dst, false); + if (ret) { + if (errmsg) + *errmsg = "memory allocation failed"; + return ret; + } + + if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) { + ret = uffd_test_case_ops->post_alloc(errmsg); + if (ret) + return ret; + } + + ret = userfaultfd_open(&features); + if (ret) { + if (errmsg) + *errmsg = "possible lack of priviledge"; + return ret; + } + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) + err("count_verify"); + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; + } + + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be errornously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled, above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst won't be trapped by missing userfaults. + * + * This release_pages() will guarantee even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. + */ + uffd_test_ops->release_pages(area_dst); + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) + err("pipefd"); + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + err("pipe"); + + return 0; +} + +void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); +} + +static void continue_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_continue req; + int ret; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + if (wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, + (uint64_t)start); + + /* + * Error handling within the kernel for continue is subtly different + * from copy or zeropage, so it may be a source of bugs. Trigger an + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. + */ + req.mapped = 0; + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); + if (ret >= 0 || req.mapped != -EEXIST) + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, + ret, (int64_t) req.mapped); +} + +int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + return 1; + err("blocking read error"); + } else { + err("short read"); + } + } + + return 0; +} + +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + args->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size, + args->apply_wp); + args->minor_faults++; + } else { + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. + */ + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + err("unexpected write fault"); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + if (copy_page(uffd, offset, args->apply_wp)) + args->missing_faults++; + } +} + +void *uffd_poll_thread(void *arg) +{ + struct uffd_args *args = (struct uffd_args *)arg; + unsigned long cpu = args->cpu; + struct pollfd pollfd[2]; + struct uffd_msg msg; + struct uffdio_register uffd_reg; + int ret; + char tmp_chr; + + if (!args->handle_fault) + args->handle_fault = uffd_handle_page_fault; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + ready_for_fork = true; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + err("poll error: %d", ret); + } + if (pollfd[1].revents) { + if (!(pollfd[1].revents & POLLIN)) + err("pollfd[1].revents %d", pollfd[1].revents); + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + err("read pipefd error"); + break; + } + if (!(pollfd[0].revents & POLLIN)) + err("pollfd[0].revents %d", pollfd[0].revents); + if (uffd_read_msg(uffd, &msg)) + continue; + switch (msg.event) { + default: + err("unexpected msg event %u\n", msg.event); + break; + case UFFD_EVENT_PAGEFAULT: + args->handle_fault(&msg, args); + break; + case UFFD_EVENT_FORK: + close(uffd); + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + err("remove failure"); + break; + case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } + } + + return NULL; +} + +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_copy->dst, + uffdio_copy->len, + offset); + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy->copy != -EEXIST) + err("UFFDIO_COPY retry error: %"PRId64, + (int64_t)uffdio_copy->copy); + } else { + err("UFFDIO_COPY retry unexpected: %"PRId64, + (int64_t)uffdio_copy->copy); + } +} + +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + +int __copy_page(int ufd, unsigned long offset, bool retry, bool wp) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu\n", offset); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + if (wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + err("UFFDIO_COPY error: %"PRId64, + (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); + } else if (uffdio_copy.copy != page_size) { + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + } else { + if (test_uffdio_copy_eexist && retry) { + test_uffdio_copy_eexist = false; + retry_copy_page(ufd, &uffdio_copy, offset); + } + return 1; + } + return 0; +} + +int copy_page(int ufd, unsigned long offset, bool wp) +{ + return __copy_page(ufd, offset, false, wp); +} + +int move_page(int ufd, unsigned long offset, unsigned long len) +{ + struct uffdio_move uffdio_move; + + if (offset + len > nr_pages * page_size) + err("unexpected offset %lu and length %lu\n", offset, len); + uffdio_move.dst = (unsigned long) area_dst + offset; + uffdio_move.src = (unsigned long) area_src + offset; + uffdio_move.len = len; + uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES; + uffdio_move.move = 0; + if (ioctl(ufd, UFFDIO_MOVE, &uffdio_move)) { + /* real retval in uffdio_move.move */ + if (uffdio_move.move != -EEXIST) + err("UFFDIO_MOVE error: %"PRId64, + (int64_t)uffdio_move.move); + wake_range(ufd, uffdio_move.dst, len); + } else if (uffdio_move.move != len) { + err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move); + } else + return 1; + return 0; +} + +int uffd_open_dev(unsigned int flags) +{ + int fd, uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + return fd; + uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags); + close(fd); + + return uffd; +} + +int uffd_open_sys(unsigned int flags) +{ +#ifdef __NR_userfaultfd + return syscall(__NR_userfaultfd, flags); +#else + return -1; +#endif +} + +int uffd_open(unsigned int flags) +{ + int uffd = uffd_open_sys(flags); + + if (uffd < 0) + uffd = uffd_open_dev(flags); + + return uffd; +} + +int uffd_get_features(uint64_t *features) +{ + struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 }; + /* + * This should by default work in most kernels; the feature list + * will be the same no matter what we pass in here. + */ + int fd = uffd_open(UFFD_USER_MODE_ONLY); + + if (fd < 0) + /* Maybe the kernel is older than user-only mode? */ + fd = uffd_open(0); + + if (fd < 0) + return fd; + + if (ioctl(fd, UFFDIO_API, &uffdio_api)) { + close(fd); + return -errno; + } + + *features = uffdio_api.features; + close(fd); + + return 0; +} diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h new file mode 100644 index 000000000000..a70ae10b5f62 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-common.h @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd tests common header + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ +#ifndef __UFFD_COMMON_H__ +#define __UFFD_COMMON_H__ + +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ // Use ll64 +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> +#include <signal.h> +#include <poll.h> +#include <string.h> +#include <linux/mman.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/wait.h> +#include <pthread.h> +#include <linux/userfaultfd.h> +#include <setjmp.h> +#include <stdbool.h> +#include <assert.h> +#include <inttypes.h> +#include <stdint.h> +#include <sys/random.h> +#include <stdatomic.h> + +#include "../kselftest.h" +#include "vm_util.h" + +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define _err(fmt, ...) \ + do { \ + int ret = errno; \ + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ + fprintf(stderr, " (errno=%d, @%s:%d)\n", \ + ret, __FILE__, __LINE__); \ + } while (0) + +#define errexit(exitcode, fmt, ...) \ + do { \ + _err(fmt, ##__VA_ARGS__); \ + exit(exitcode); \ + } while (0) + +#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. + */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +/* Userfaultfd test statistics */ +struct uffd_args { + int cpu; + /* Whether apply wr-protects when installing pages */ + bool apply_wp; + unsigned long missing_faults; + unsigned long wp_faults; + unsigned long minor_faults; + + /* A custom fault handler; defaults to uffd_handle_page_fault. */ + void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args); +}; + +struct uffd_test_ops { + int (*allocate_area)(void **alloc_area, bool is_src); + void (*release_pages)(char *rel_area); + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); +}; +typedef struct uffd_test_ops uffd_test_ops_t; + +struct uffd_test_case_ops { + int (*pre_alloc)(const char **errmsg); + int (*post_alloc)(const char **errmsg); +}; +typedef struct uffd_test_case_ops uffd_test_case_ops_t; + +extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; +extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +extern int uffd, uffd_flags, finished, *pipefd, test_type; +extern bool map_shared; +extern bool test_uffdio_wp; +extern unsigned long long *count_verify; +extern volatile bool test_uffdio_copy_eexist; +extern atomic_bool ready_for_fork; + +extern uffd_test_ops_t anon_uffd_test_ops; +extern uffd_test_ops_t shmem_uffd_test_ops; +extern uffd_test_ops_t hugetlb_uffd_test_ops; +extern uffd_test_ops_t *uffd_test_ops; +extern uffd_test_case_ops_t *uffd_test_case_ops; + +void uffd_stats_report(struct uffd_args *args, int n_cpus); +int uffd_test_ctx_init(uint64_t features, const char **errmsg); +void uffd_test_ctx_clear(void); +int userfaultfd_open(uint64_t *features); +int uffd_read_msg(int ufd, struct uffd_msg *msg); +void wp_range(int ufd, __u64 start, __u64 len, bool wp); +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args); +int __copy_page(int ufd, unsigned long offset, bool retry, bool wp); +int copy_page(int ufd, unsigned long offset, bool wp); +int move_page(int ufd, unsigned long offset, unsigned long len); +void *uffd_poll_thread(void *arg); + +int uffd_open_dev(unsigned int flags); +int uffd_open_sys(unsigned int flags); +int uffd_open(unsigned int flags); +int uffd_get_features(uint64_t *features); + +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 + +#endif diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c new file mode 100644 index 000000000000..f78bab0f3d45 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -0,0 +1,480 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. + * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each cpu thread takes cares of transferring a portion of the + * area. + * + * When all threads of type 3 completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * per-CPU threads 1 by triggering userfaults inside + * pthread_mutex_lock will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). + */ + +#include "uffd-common.h" + +#ifdef __NR_userfaultfd + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ +#define ALARM_INTERVAL_SECS 10 +static char *zeropage; +pthread_attr_t attr; + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +const char *examples = + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./uffd-stress anon 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./uffd-stress shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./uffd-stress hugetlb 256 50\n\n" + "# Run the same hugetlb test but using private file:\n" + "./uffd-stress hugetlb-private 256 50\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./uffd-stress anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./uffd-stress <test type> <MiB> <bounces>\n\n"); + fprintf(stderr, "Supported <test type>: anon, hugetlb, " + "hugetlb-private, shmem, shmem-private\n\n"); + fprintf(stderr, "Examples:\n\n"); + fprintf(stderr, "%s", examples); + exit(1); +} + +static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus) +{ + int i; + + for (i = 0; i < n_cpus; i++) { + args[i].cpu = i; + args[i].apply_wp = test_uffdio_wp; + args[i].missing_faults = 0; + args[i].wp_faults = 0; + args[i].minor_faults = 0; + } +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + unsigned long long count; + + if (!(bounces & BOUNCE_RANDOM)) { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) + err("getrandom failed"); + } else + page_nr += 1; + page_nr %= nr_pages; + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) + err("page_nr %lu memory corruption %llu %llu", + page_nr, count, count_verify[page_nr]); + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + } + + return NULL; +} + +static int copy_page_retry(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, true, test_uffdio_wp); +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + struct uffd_args *args = (struct uffd_args *)arg; + struct uffd_msg msg; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + if (uffd_read_msg(uffd, &msg)) + continue; + uffd_handle_page_fault(&msg, args); + } + + return NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr, start_nr, mid_nr, end_nr; + + start_nr = cpu * nr_pages_per_cpu; + end_nr = (cpu+1) * nr_pages_per_cpu; + mid_nr = (start_nr + end_nr) / 2; + + /* Copy the first half of the pages */ + for (page_nr = start_nr; page_nr < mid_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + /* + * If we need to test uffd-wp, set it up now. Then we'll have + * at least the first half of the pages mapped already which + * can be write-protected for testing + */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, + nr_pages_per_cpu * page_size, true); + + /* + * Continue the 2nd half of the page copying, handling write + * protection faults if any + */ + for (page_nr = mid_nr; page_nr < end_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + return NULL; +} + +static int stress(struct uffd_args *args) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, uffd_poll_thread, &args[cpu])) + err("uffd_poll_thread create"); + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + (void *)&args[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background treads. The + * area_src could then be faulted in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed). + */ + uffd_test_ops->release_pages(area_src); + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) + err("pipefd write error"); + if (pthread_join(uffd_threads[cpu], + (void *)&args[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + return 0; +} + +static int userfaultfd_stress(void) +{ + void *area; + unsigned long nr; + struct uffd_args args[nr_cpus]; + uint64_t mem_size = nr_pages * page_size; + + memset(args, 0, sizeof(struct uffd_args) * nr_cpus); + + if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL)) + err("context init failed"); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + else + printf(" read"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + + /* register */ + if (uffd_register(uffd, area_dst, mem_size, + true, test_uffdio_wp, false)) + err("register failure"); + + if (area_dst_alias) { + if (uffd_register(uffd, area_dst_alias, mem_size, + true, test_uffdio_wp, false)) + err("register failure alias"); + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, so invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropages leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try to comment this out madvise to see the memory + * corruption being caught pretty quick. + * + * khugepaged is also inhibited to collapse THP after + * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's + * required to MADV_DONTNEED here. + */ + uffd_test_ops->release_pages(area_dst); + + uffd_stats_reset(args, nr_cpus); + + /* bounce pass */ + if (stress(args)) { + uffd_test_ctx_clear(); + return 1; + } + + /* Clear all the write protections if there is any */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst, + nr_pages * page_size, false); + + /* unregister */ + if (uffd_unregister(uffd, area_dst, mem_size)) + err("unregister failure"); + if (area_dst_alias) { + if (uffd_unregister(uffd, area_dst_alias, mem_size)) + err("unregister failure alias"); + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) + for (nr = 0; nr < nr_pages; nr++) + if (*area_count(area_dst, nr) != count_verify[nr]) + err("error area_count %llu %llu %lu\n", + *area_count(area_src, nr), + count_verify[nr], nr); + + /* prepare next bounce */ + swap(area_src, area_dst); + + swap(area_src_alias, area_dst_alias); + + uffd_stats_report(args, nr_cpus); + } + uffd_test_ctx_clear(); + + return 0; +} + +static void set_test_type(const char *type) +{ + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + map_shared = true; + } else if (!strcmp(type, "hugetlb-private")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "shmem")) { + map_shared = true; + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + } else if (!strcmp(type, "shmem-private")) { + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + } +} + +static void parse_test_type_arg(const char *raw_type) +{ + uint64_t features = UFFD_API_FEATURES; + + set_test_type(raw_type); + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); + + if (test_type == TEST_HUGETLB) + page_size = default_huge_page_size(); + else + page_size = sysconf(_SC_PAGE_SIZE); + + if (!page_size) + err("Unable to determine page size"); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) + err("Impossible to run this test"); + + /* + * Whether we can test certain features depends not just on test type, + * but also on whether or not this particular kernel supports the + * feature. + */ + + if (userfaultfd_open(&features)) + err("Userfaultfd open failed"); + + test_uffdio_wp = test_uffdio_wp && + (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + + close(uffd); + uffd = -1; +} + +static void sigalrm(int sig) +{ + if (sig != SIGALRM) + abort(); + test_uffdio_copy_eexist = true; + alarm(ALARM_INTERVAL_SECS); +} + +int main(int argc, char **argv) +{ + size_t bytes; + + if (argc < 4) + usage(); + + if (signal(SIGALRM, sigalrm) == SIG_ERR) + err("failed to arm SIGALRM"); + alarm(ALARM_INTERVAL_SECS); + + parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_type == TEST_HUGETLB && + get_free_hugepages() < bytes / page_size) { + printf("skip: Skipping userfaultfd... not enough hugepages\n"); + return KSFT_SKIP; + } + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + + nr_pages_per_cpu = bytes / page_size / nr_cpus; + if (!nr_pages_per_cpu) { + _err("invalid MiB"); + usage(); + } + + bounces = atoi(argv[3]); + if (bounces <= 0) { + _err("invalid bounces"); + usage(); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c new file mode 100644 index 000000000000..21ec23206ab4 --- /dev/null +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -0,0 +1,1567 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Userfaultfd unit tests. + * + * Copyright (C) 2015-2023 Red Hat, Inc. + */ + +#include "uffd-common.h" + +#include "../../../../mm/gup_test.h" + +#ifdef __NR_userfaultfd + +/* The unit test doesn't need a large or random size, make it 32MB for now */ +#define UFFD_TEST_MEM_SIZE (32UL << 20) + +#define MEM_ANON BIT_ULL(0) +#define MEM_SHMEM BIT_ULL(1) +#define MEM_SHMEM_PRIVATE BIT_ULL(2) +#define MEM_HUGETLB BIT_ULL(3) +#define MEM_HUGETLB_PRIVATE BIT_ULL(4) + +#define MEM_ALL (MEM_ANON | MEM_SHMEM | MEM_SHMEM_PRIVATE | \ + MEM_HUGETLB | MEM_HUGETLB_PRIVATE) + +#define ALIGN_UP(x, align_to) \ + ((__typeof__(x))((((unsigned long)(x)) + ((align_to)-1)) & ~((align_to)-1))) + +struct mem_type { + const char *name; + unsigned int mem_flag; + uffd_test_ops_t *mem_ops; + bool shared; +}; +typedef struct mem_type mem_type_t; + +mem_type_t mem_types[] = { + { + .name = "anon", + .mem_flag = MEM_ANON, + .mem_ops = &anon_uffd_test_ops, + .shared = false, + }, + { + .name = "shmem", + .mem_flag = MEM_SHMEM, + .mem_ops = &shmem_uffd_test_ops, + .shared = true, + }, + { + .name = "shmem-private", + .mem_flag = MEM_SHMEM_PRIVATE, + .mem_ops = &shmem_uffd_test_ops, + .shared = false, + }, + { + .name = "hugetlb", + .mem_flag = MEM_HUGETLB, + .mem_ops = &hugetlb_uffd_test_ops, + .shared = true, + }, + { + .name = "hugetlb-private", + .mem_flag = MEM_HUGETLB_PRIVATE, + .mem_ops = &hugetlb_uffd_test_ops, + .shared = false, + }, +}; + +/* Arguments to be passed over to each uffd unit test */ +struct uffd_test_args { + mem_type_t *mem_type; +}; +typedef struct uffd_test_args uffd_test_args_t; + +/* Returns: UFFD_TEST_* */ +typedef void (*uffd_test_fn)(uffd_test_args_t *); + +typedef struct { + const char *name; + uffd_test_fn uffd_fn; + unsigned int mem_targets; + uint64_t uffd_feature_required; + uffd_test_case_ops_t *test_case_ops; +} uffd_test_case_t; + +static void uffd_test_report(void) +{ + printf("Userfaults unit tests: pass=%u, skip=%u, fail=%u (total=%u)\n", + ksft_get_pass_cnt(), + ksft_get_xskip_cnt(), + ksft_get_fail_cnt(), + ksft_test_num()); +} + +static void uffd_test_pass(void) +{ + printf("done\n"); + ksft_inc_pass_cnt(); +} + +#define uffd_test_start(...) do { \ + printf("Testing "); \ + printf(__VA_ARGS__); \ + printf("... "); \ + fflush(stdout); \ + } while (0) + +#define uffd_test_fail(...) do { \ + printf("failed [reason: "); \ + printf(__VA_ARGS__); \ + printf("]\n"); \ + ksft_inc_fail_cnt(); \ + } while (0) + +static void uffd_test_skip(const char *message) +{ + printf("skipped [reason: %s]\n", message); + ksft_inc_xskip_cnt(); +} + +/* + * Returns 1 if specific userfaultfd supported, 0 otherwise. Note, we'll + * return 1 even if some test failed as long as uffd supported, because in + * that case we still want to proceed with the rest uffd unit tests. + */ +static int test_uffd_api(bool use_dev) +{ + struct uffdio_api uffdio_api; + int uffd; + + uffd_test_start("UFFDIO_API (with %s)", + use_dev ? "/dev/userfaultfd" : "syscall"); + + if (use_dev) + uffd = uffd_open_dev(UFFD_FLAGS); + else + uffd = uffd_open_sys(UFFD_FLAGS); + if (uffd < 0) { + uffd_test_skip("cannot open userfaultfd handle"); + return 0; + } + + /* Test wrong UFFD_API */ + uffdio_api.api = 0xab; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) { + uffd_test_fail("UFFDIO_API should fail with wrong api but didn't"); + goto out; + } + + /* Test wrong feature bit */ + uffdio_api.api = UFFD_API; + uffdio_api.features = BIT_ULL(63); + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) { + uffd_test_fail("UFFDIO_API should fail with wrong feature but didn't"); + goto out; + } + + /* Test normal UFFDIO_API */ + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + uffd_test_fail("UFFDIO_API should succeed but failed"); + goto out; + } + + /* Test double requests of UFFDIO_API with a random feature set */ + uffdio_api.features = BIT_ULL(0); + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) { + uffd_test_fail("UFFDIO_API should reject initialized uffd"); + goto out; + } + + uffd_test_pass(); +out: + close(uffd); + /* We have a valid uffd handle */ + return 1; +} + +/* + * This function initializes the global variables. TODO: remove global + * vars and then remove this. + */ +static int +uffd_setup_environment(uffd_test_args_t *args, uffd_test_case_t *test, + mem_type_t *mem_type, const char **errmsg) +{ + map_shared = mem_type->shared; + uffd_test_ops = mem_type->mem_ops; + uffd_test_case_ops = test->test_case_ops; + + if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) + page_size = default_huge_page_size(); + else + page_size = psize(); + + nr_pages = UFFD_TEST_MEM_SIZE / page_size; + /* TODO: remove this global var.. it's so ugly */ + nr_cpus = 1; + + /* Initialize test arguments */ + args->mem_type = mem_type; + + return uffd_test_ctx_init(test->uffd_feature_required, errmsg); +} + +static bool uffd_feature_supported(uffd_test_case_t *test) +{ + uint64_t features; + + if (uffd_get_features(&features)) + return false; + + return (features & test->uffd_feature_required) == + test->uffd_feature_required; +} + +static int pagemap_open(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + err("open pagemap"); + + return fd; +} + +/* This macro let __LINE__ works in err() */ +#define pagemap_check_wp(value, wp) do { \ + if (!!(value & PM_UFFD_WP) != wp) \ + err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ + } while (0) + +typedef struct { + int parent_uffd, child_uffd; +} fork_event_args; + +static void *fork_event_consumer(void *data) +{ + fork_event_args *args = data; + struct uffd_msg msg = { 0 }; + + /* Read until a full msg received */ + while (uffd_read_msg(args->parent_uffd, &msg)); + + if (msg.event != UFFD_EVENT_FORK) + err("wrong message: %u\n", msg.event); + + /* Just to be properly freed later */ + args->child_uffd = msg.arg.fork.ufd; + return NULL; +} + +typedef struct { + int gup_fd; + bool pinned; +} pin_args; + +/* + * Returns 0 if succeed, <0 for errors. pin_pages() needs to be paired + * with unpin_pages(). Currently it needs to be RO longterm pin to satisfy + * all needs of the test cases (e.g., trigger unshare, trigger fork() early + * CoW, etc.). + */ +static int pin_pages(pin_args *args, void *buffer, size_t size) +{ + struct pin_longterm_test test = { + .addr = (uintptr_t)buffer, + .size = size, + /* Read-only pins */ + .flags = 0, + }; + + if (args->pinned) + err("already pinned"); + + args->gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (args->gup_fd < 0) + return -errno; + + if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_START, &test)) { + /* Even if gup_test existed, can be an old gup_test / kernel */ + close(args->gup_fd); + return -errno; + } + args->pinned = true; + return 0; +} + +static void unpin_pages(pin_args *args) +{ + if (!args->pinned) + err("unpin without pin first"); + if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_STOP)) + err("PIN_LONGTERM_TEST_STOP"); + close(args->gup_fd); + args->pinned = false; +} + +static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) +{ + fork_event_args args = { .parent_uffd = uffd, .child_uffd = -1 }; + pthread_t thread; + pid_t child; + uint64_t value; + int fd, result; + + /* Prepare a thread to resolve EVENT_FORK */ + if (with_event) { + if (pthread_create(&thread, NULL, fork_event_consumer, &args)) + err("pthread_create()"); + } + + child = fork(); + if (!child) { + /* Open the pagemap fd of the child itself */ + pin_args args = {}; + + fd = pagemap_open(); + + if (test_pin && pin_pages(&args, area_dst, page_size)) + /* + * Normally when reach here we have pinned in + * previous tests, so shouldn't fail anymore + */ + err("pin page failed in child"); + + value = pagemap_get_entry(fd, area_dst); + /* + * After fork(), we should handle uffd-wp bit differently: + * + * (1) when with EVENT_FORK, it should persist + * (2) when without EVENT_FORK, it should be dropped + */ + pagemap_check_wp(value, with_event); + if (test_pin) + unpin_pages(&args); + /* Succeed */ + exit(0); + } + waitpid(child, &result, 0); + + if (with_event) { + if (pthread_join(thread, NULL)) + err("pthread_join()"); + if (args.child_uffd < 0) + err("Didn't receive child uffd"); + close(args.child_uffd); + } + + return result; +} + +static void uffd_wp_unpopulated_test(uffd_test_args_t *args) +{ + uint64_t value; + int pagemap_fd; + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + false, true, false)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Test applying pte marker to anon unpopulated */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + + /* Test unprotect on anon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test zap on anon marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Test fault in after marker removed */ + *area_dst = 1; + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + + /* Test read-zero-page upon pte marker */ + wp_range(uffd, (uint64_t)area_dst, page_size, true); + *(volatile char *)area_dst; + /* Drop it to make pte none again */ + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + + uffd_test_pass(); +} + +static void uffd_wp_fork_test_common(uffd_test_args_t *args, + bool with_event) +{ + int pagemap_fd; + uint64_t value; + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + false, true, false)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, page_size, true); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + if (pagemap_test_fork(uffd, with_event, false)) { + uffd_test_fail("Detected %s uffd-wp bit in child in present pte", + with_event ? "missing" : "stall"); + goto out; + } + + /* + * This is an attempt for zapping the pgtable so as to test the + * markers. + * + * For private mappings, PAGEOUT will only work on exclusive ptes + * (PM_MMAP_EXCLUSIVE) which we should satisfy. + * + * For shared, PAGEOUT may not work. Use DONTNEED instead which + * plays a similar role of zapping (rather than freeing the page) + * to expose pte markers. + */ + if (args->mem_type->shared) { + if (madvise(area_dst, page_size, MADV_DONTNEED)) + err("MADV_DONTNEED"); + } else { + /* + * NOTE: ignore retval because private-hugetlb doesn't yet + * support swapping, so it could fail. + */ + madvise(area_dst, page_size, MADV_PAGEOUT); + } + + /* Uffd-wp should persist even swapped out */ + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + if (pagemap_test_fork(uffd, with_event, false)) { + uffd_test_fail("Detected %s uffd-wp bit in child in zapped pte", + with_event ? "missing" : "stall"); + goto out; + } + + /* Unprotect; this tests swap pte modifications */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Fault in the page from disk */ + *area_dst = 2; + value = pagemap_get_entry(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + uffd_test_pass(); +out: + if (uffd_unregister(uffd, area_dst, nr_pages * page_size)) + err("unregister failed"); + close(pagemap_fd); +} + +static void uffd_wp_fork_test(uffd_test_args_t *args) +{ + uffd_wp_fork_test_common(args, false); +} + +static void uffd_wp_fork_with_event_test(uffd_test_args_t *args) +{ + uffd_wp_fork_test_common(args, true); +} + +static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args, + bool with_event) +{ + int pagemap_fd; + pin_args pin_args = {}; + + if (uffd_register(uffd, area_dst, page_size, false, true, false)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, page_size, true); + + /* + * 1. First pin, then fork(). This tests fork() special path when + * doing early CoW if the page is private. + */ + if (pin_pages(&pin_args, area_dst, page_size)) { + uffd_test_skip("Possibly CONFIG_GUP_TEST missing " + "or unprivileged"); + close(pagemap_fd); + uffd_unregister(uffd, area_dst, page_size); + return; + } + + if (pagemap_test_fork(uffd, with_event, false)) { + uffd_test_fail("Detected %s uffd-wp bit in early CoW of fork()", + with_event ? "missing" : "stall"); + unpin_pages(&pin_args); + goto out; + } + + unpin_pages(&pin_args); + + /* + * 2. First fork(), then pin (in the child, where test_pin==true). + * This tests COR, aka, page unsharing on private memories. + */ + if (pagemap_test_fork(uffd, with_event, true)) { + uffd_test_fail("Detected %s uffd-wp bit when RO pin", + with_event ? "missing" : "stall"); + goto out; + } + uffd_test_pass(); +out: + if (uffd_unregister(uffd, area_dst, page_size)) + err("register failed"); + close(pagemap_fd); +} + +static void uffd_wp_fork_pin_test(uffd_test_args_t *args) +{ + uffd_wp_fork_pin_test_common(args, false); +} + +static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args) +{ + uffd_wp_fork_pin_test_common(args, true); +} + +static void check_memory_contents(char *p) +{ + unsigned long i, j; + uint8_t expected_byte; + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + for (j = 0; j < page_size; j++) { + uint8_t v = *(uint8_t *)(p + (i * page_size) + j); + if (v != expected_byte) + err("unexpected page contents"); + } + } +} + +static void uffd_minor_test_common(bool test_collapse, bool test_wp) +{ + unsigned long p; + pthread_t uffd_mon; + char c; + struct uffd_args args = { 0 }; + + /* + * NOTE: MADV_COLLAPSE is not yet compatible with WP, so testing + * both do not make much sense. + */ + assert(!(test_collapse && test_wp)); + + if (uffd_register(uffd, area_dst_alias, nr_pages * page_size, + /* NOTE! MADV_COLLAPSE may not work with uffd-wp */ + false, test_wp, true)) + err("register failure"); + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + + args.apply_wp = test_wp; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + check_memory_contents(area_dst_alias); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + err("join() failed"); + + if (test_collapse) { + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) { + /* It's fine to fail for this one... */ + uffd_test_skip("MADV_COLLAPSE failed"); + return; + } + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + read_pmd_pagesize()); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + } + + if (args.missing_faults != 0 || args.minor_faults != nr_pages) + uffd_test_fail("stats check error"); + else + uffd_test_pass(); +} + +void uffd_minor_test(uffd_test_args_t *args) +{ + uffd_minor_test_common(false, false); +} + +void uffd_minor_wp_test(uffd_test_args_t *args) +{ + uffd_minor_test_common(false, true); +} + +void uffd_minor_collapse_test(uffd_test_args_t *args) +{ + uffd_minor_test_common(true, false); +} + +static sigjmp_buf jbuf, *sigbuf; + +static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +{ + if (sig == SIGBUS) { + if (sigbuf) + siglongjmp(*sigbuf, 1); + abort(); + } +} + +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event for shmem and + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked + * for hugetlb. + * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register + * monitored area, generate pagefaults and test that signal is delivered. + * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 + * test robustness use case - we release monitored area, fork a process + * that will generate pagefaults and verify signal is generated. + * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal + * feature. Using monitor thread, verify no userfault events are generated. + */ +static int faulting_process(int signal_test, bool wp) +{ + unsigned long nr, i; + unsigned long long count; + unsigned long split_nr_pages; + unsigned long lastnr; + struct sigaction act; + volatile unsigned long signalled = 0; + + split_nr_pages = (nr_pages + 1) / 2; + + if (signal_test) { + sigbuf = &jbuf; + memset(&act, 0, sizeof(act)); + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + lastnr = (unsigned long)-1; + } + + for (nr = 0; nr < split_nr_pages; nr++) { + volatile int steps = 1; + unsigned long offset = nr * page_size; + + if (signal_test) { + if (sigsetjmp(*sigbuf, 1) != 0) { + if (steps == 1 && nr == lastnr) + err("Signal repeated"); + + lastnr = nr; + if (signal_test == 1) { + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset, wp)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } + } else { + signalled++; + continue; + } + } + } + + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + if (signal_test) + return signalled != split_nr_pages; + + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + err("mremap"); + /* Reset area_src since we just clobbered it */ + area_src = NULL; + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + } + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + uffd_test_ops->release_pages(area_dst); + + for (nr = 0; nr < nr_pages; nr++) + for (i = 0; i < page_size; i++) + if (*(area_dst + nr * page_size + i) != 0) + err("page %lu offset %lu is not zero", nr, i); + + return 0; +} + +static void uffd_sigbus_test_common(bool wp) +{ + unsigned long userfaults; + pthread_t uffd_mon; + pid_t pid; + int err; + char c; + struct uffd_args args = { 0 }; + + ready_for_fork = false; + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, wp, false)) + err("register failure"); + + if (faulting_process(1, wp)) + err("faulting process failed"); + + uffd_test_ops->release_pages(area_dst); + + args.apply_wp = wp; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + while (!ready_for_fork) + ; /* Wait for the poll_thread to start executing before forking */ + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(2, wp)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, (void **)&userfaults)) + err("pthread_join()"); + + if (userfaults) + uffd_test_fail("Signal test failed, userfaults: %ld", userfaults); + else + uffd_test_pass(); +} + +static void uffd_sigbus_test(uffd_test_args_t *args) +{ + uffd_sigbus_test_common(false); +} + +static void uffd_sigbus_wp_test(uffd_test_args_t *args) +{ + uffd_sigbus_test_common(true); +} + +static void uffd_events_test_common(bool wp) +{ + pthread_t uffd_mon; + pid_t pid; + int err; + char c; + struct uffd_args args = { 0 }; + + ready_for_fork = false; + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, wp, false)) + err("register failure"); + + args.apply_wp = wp; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + while (!ready_for_fork) + ; /* Wait for the poll_thread to start executing before forking */ + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(0, wp)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + err("pthread_join()"); + + if (args.missing_faults != nr_pages) + uffd_test_fail("Fault counts wrong"); + else + uffd_test_pass(); +} + +static void uffd_events_test(uffd_test_args_t *args) +{ + uffd_events_test_common(false); +} + +static void uffd_events_wp_test(uffd_test_args_t *args) +{ + uffd_events_test_common(true); +} + +static void retry_uffdio_zeropage(int ufd, + struct uffdio_zeropage *uffdio_zeropage) +{ + uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, + uffdio_zeropage->range.len, + 0); + if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { + if (uffdio_zeropage->zeropage != -EEXIST) + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } else { + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } +} + +static bool do_uffdio_zeropage(int ufd, bool has_zeropage) +{ + struct uffdio_zeropage uffdio_zeropage = { 0 }; + int ret; + __s64 res; + + uffdio_zeropage.range.start = (unsigned long) area_dst; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + res = uffdio_zeropage.zeropage; + if (ret) { + /* real retval in ufdio_zeropage.zeropage */ + if (has_zeropage) + err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); + else if (res != -EINVAL) + err("UFFDIO_ZEROPAGE not -EINVAL"); + } else if (has_zeropage) { + if (res != page_size) + err("UFFDIO_ZEROPAGE unexpected size"); + else + retry_uffdio_zeropage(ufd, &uffdio_zeropage); + return true; + } else + err("UFFDIO_ZEROPAGE succeeded"); + + return false; +} + +/* + * Registers a range with MISSING mode only for zeropage test. Return true + * if UFFDIO_ZEROPAGE supported, false otherwise. Can't use uffd_register() + * because we want to detect .ioctls along the way. + */ +static bool +uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len) +{ + uint64_t ioctls = 0; + + if (uffd_register_with_ioctls(uffd, addr, len, true, + false, false, &ioctls)) + err("zeropage register fail"); + + return ioctls & (1 << _UFFDIO_ZEROPAGE); +} + +/* exercise UFFDIO_ZEROPAGE */ +static void uffd_zeropage_test(uffd_test_args_t *args) +{ + bool has_zeropage; + int i; + + has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size); + if (area_dst_alias) + /* Ignore the retval; we already have it */ + uffd_register_detect_zeropage(uffd, area_dst_alias, page_size); + + if (do_uffdio_zeropage(uffd, has_zeropage)) + for (i = 0; i < page_size; i++) + if (area_dst[i] != 0) + err("data non-zero at offset %d\n", i); + + if (uffd_unregister(uffd, area_dst, page_size)) + err("unregister"); + + if (area_dst_alias && uffd_unregister(uffd, area_dst_alias, page_size)) + err("unregister"); + + uffd_test_pass(); +} + +static void uffd_register_poison(int uffd, void *addr, uint64_t len) +{ + uint64_t ioctls = 0; + uint64_t expected = (1 << _UFFDIO_COPY) | (1 << _UFFDIO_POISON); + + if (uffd_register_with_ioctls(uffd, addr, len, true, + false, false, &ioctls)) + err("poison register fail"); + + if ((ioctls & expected) != expected) + err("registered area doesn't support COPY and POISON ioctls"); +} + +static void do_uffdio_poison(int uffd, unsigned long offset) +{ + struct uffdio_poison uffdio_poison = { 0 }; + int ret; + __s64 res; + + uffdio_poison.range.start = (unsigned long) area_dst + offset; + uffdio_poison.range.len = page_size; + uffdio_poison.mode = 0; + ret = ioctl(uffd, UFFDIO_POISON, &uffdio_poison); + res = uffdio_poison.updated; + + if (ret) + err("UFFDIO_POISON error: %"PRId64, (int64_t)res); + else if (res != page_size) + err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res); +} + +static void uffd_poison_handle_fault( + struct uffd_msg *msg, struct uffd_args *args) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & + (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR)) + err("unexpected fault type %llu", msg->arg.pagefault.flags); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + /* Odd pages -> copy zeroed page; even pages -> poison. */ + if (offset & page_size) + copy_page(uffd, offset, false); + else + do_uffdio_poison(uffd, offset); +} + +static void uffd_poison_test(uffd_test_args_t *targs) +{ + pthread_t uffd_mon; + char c; + struct uffd_args args = { 0 }; + struct sigaction act = { 0 }; + unsigned long nr_sigbus = 0; + unsigned long nr; + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffd_register_poison(uffd, area_dst, nr_pages * page_size); + memset(area_src, 0, nr_pages * page_size); + + args.handle_fault = uffd_poison_handle_fault; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + sigbuf = &jbuf; + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + + for (nr = 0; nr < nr_pages; ++nr) { + unsigned long offset = nr * page_size; + const char *bytes = (const char *) area_dst + offset; + const char *i; + + if (sigsetjmp(*sigbuf, 1)) { + /* + * Access below triggered a SIGBUS, which was caught by + * sighndl, which then jumped here. Count this SIGBUS, + * and move on to next page. + */ + ++nr_sigbus; + continue; + } + + for (i = bytes; i < bytes + page_size; ++i) { + if (*i) + err("nonzero byte in area_dst (%p) at %p: %u", + area_dst, i, *i); + } + } + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + err("pthread_join()"); + + if (nr_sigbus != nr_pages / 2) + err("expected to receive %lu SIGBUS, actually received %lu", + nr_pages / 2, nr_sigbus); + + uffd_test_pass(); +} + +static void +uffd_move_handle_fault_common(struct uffd_msg *msg, struct uffd_args *args, + unsigned long len) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & + (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR | UFFD_PAGEFAULT_FLAG_WRITE)) + err("unexpected fault type %llu", msg->arg.pagefault.flags); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(len-1); + + if (move_page(uffd, offset, len)) + args->missing_faults++; +} + +static void uffd_move_handle_fault(struct uffd_msg *msg, + struct uffd_args *args) +{ + uffd_move_handle_fault_common(msg, args, page_size); +} + +static void uffd_move_pmd_handle_fault(struct uffd_msg *msg, + struct uffd_args *args) +{ + uffd_move_handle_fault_common(msg, args, read_pmd_pagesize()); +} + +static void +uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, + void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args)) +{ + unsigned long nr; + pthread_t uffd_mon; + char c; + unsigned long long count; + struct uffd_args args = { 0 }; + char *orig_area_src, *orig_area_dst; + unsigned long step_size, step_count; + unsigned long src_offs = 0; + unsigned long dst_offs = 0; + + /* Prevent source pages from being mapped more than once */ + if (madvise(area_src, nr_pages * page_size, MADV_DONTFORK)) + err("madvise(MADV_DONTFORK) failure"); + + if (uffd_register(uffd, area_dst, nr_pages * page_size, + true, false, false)) + err("register failure"); + + args.handle_fault = handle_fault; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + step_size = chunk_size / page_size; + step_count = nr_pages / step_size; + + if (chunk_size > page_size) { + char *aligned_src = ALIGN_UP(area_src, chunk_size); + char *aligned_dst = ALIGN_UP(area_dst, chunk_size); + + if (aligned_src != area_src || aligned_dst != area_dst) { + src_offs = (aligned_src - area_src) / page_size; + dst_offs = (aligned_dst - area_dst) / page_size; + step_count--; + } + orig_area_src = area_src; + orig_area_dst = area_dst; + area_src = aligned_src; + area_dst = aligned_dst; + } + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a missing + * fault. uffd_poll_thread will resolve the fault by moving source + * page to destination. + */ + for (nr = 0; nr < step_count * step_size; nr += step_size) { + unsigned long i; + + /* Check area_src content */ + for (i = 0; i < step_size; i++) { + count = *area_count(area_src, nr + i); + if (count != count_verify[src_offs + nr + i]) + err("nr %lu source memory invalid %llu %llu\n", + nr + i, count, count_verify[src_offs + nr + i]); + } + + /* Faulting into area_dst should move the page or the huge page */ + for (i = 0; i < step_size; i++) { + count = *area_count(area_dst, nr + i); + if (count != count_verify[dst_offs + nr + i]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[dst_offs + nr + i]); + } + + /* Re-check area_src content which should be empty */ + for (i = 0; i < step_size; i++) { + count = *area_count(area_src, nr + i); + if (count != 0) + err("nr %lu move failed %llu %llu\n", + nr, count, count_verify[src_offs + nr + i]); + } + } + if (step_size > page_size) { + area_src = orig_area_src; + area_dst = orig_area_dst; + } + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + err("join() failed"); + + if (args.missing_faults != step_count || args.minor_faults != 0) + uffd_test_fail("stats check error"); + else + uffd_test_pass(); +} + +static void uffd_move_test(uffd_test_args_t *targs) +{ + uffd_move_test_common(targs, page_size, uffd_move_handle_fault); +} + +static void uffd_move_pmd_test(uffd_test_args_t *targs) +{ + if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) + err("madvise(MADV_HUGEPAGE) failure"); + uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_pmd_handle_fault); +} + +static void uffd_move_pmd_split_test(uffd_test_args_t *targs) +{ + if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) + err("madvise(MADV_NOHUGEPAGE) failure"); + uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_pmd_handle_fault); +} + +static int prevent_hugepages(const char **errmsg) +{ + /* This should be done before source area is populated */ + if (madvise(area_src, nr_pages * page_size, MADV_NOHUGEPAGE)) { + /* Ignore only if CONFIG_TRANSPARENT_HUGEPAGE=n */ + if (errno != EINVAL) { + if (errmsg) + *errmsg = "madvise(MADV_NOHUGEPAGE) failed"; + return -errno; + } + } + return 0; +} + +static int request_hugepages(const char **errmsg) +{ + /* This should be done before source area is populated */ + if (madvise(area_src, nr_pages * page_size, MADV_HUGEPAGE)) { + if (errmsg) { + *errmsg = (errno == EINVAL) ? + "CONFIG_TRANSPARENT_HUGEPAGE is not set" : + "madvise(MADV_HUGEPAGE) failed"; + } + return -errno; + } + return 0; +} + +struct uffd_test_case_ops uffd_move_test_case_ops = { + .post_alloc = prevent_hugepages, +}; + +struct uffd_test_case_ops uffd_move_test_pmd_case_ops = { + .post_alloc = request_hugepages, +}; + +/* + * Test the returned uffdio_register.ioctls with different register modes. + * Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test. + */ +static void +do_register_ioctls_test(uffd_test_args_t *args, bool miss, bool wp, bool minor) +{ + uint64_t ioctls = 0, expected = BIT_ULL(_UFFDIO_WAKE); + mem_type_t *mem_type = args->mem_type; + int ret; + + ret = uffd_register_with_ioctls(uffd, area_dst, page_size, + miss, wp, minor, &ioctls); + + /* + * Handle special cases of UFFDIO_REGISTER here where it should + * just fail with -EINVAL first.. + * + * Case 1: register MINOR on anon + * Case 2: register with no mode selected + */ + if ((minor && (mem_type->mem_flag == MEM_ANON)) || + (!miss && !wp && !minor)) { + if (ret != -EINVAL) + err("register (miss=%d, wp=%d, minor=%d) failed " + "with wrong errno=%d", miss, wp, minor, ret); + return; + } + + /* UFFDIO_REGISTER should succeed, then check ioctls returned */ + if (miss) + expected |= BIT_ULL(_UFFDIO_COPY); + if (wp) + expected |= BIT_ULL(_UFFDIO_WRITEPROTECT); + if (minor) + expected |= BIT_ULL(_UFFDIO_CONTINUE); + + if ((ioctls & expected) != expected) + err("unexpected uffdio_register.ioctls " + "(miss=%d, wp=%d, minor=%d): expected=0x%"PRIx64", " + "returned=0x%"PRIx64, miss, wp, minor, expected, ioctls); + + if (uffd_unregister(uffd, area_dst, page_size)) + err("unregister"); +} + +static void uffd_register_ioctls_test(uffd_test_args_t *args) +{ + int miss, wp, minor; + + for (miss = 0; miss <= 1; miss++) + for (wp = 0; wp <= 1; wp++) + for (minor = 0; minor <= 1; minor++) + do_register_ioctls_test(args, miss, wp, minor); + + uffd_test_pass(); +} + +uffd_test_case_t uffd_tests[] = { + { + /* Test returned uffdio_register.ioctls. */ + .name = "register-ioctls", + .uffd_fn = uffd_register_ioctls_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_MISSING_HUGETLBFS | + UFFD_FEATURE_MISSING_SHMEM | + UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | + UFFD_FEATURE_MINOR_HUGETLBFS | + UFFD_FEATURE_MINOR_SHMEM, + }, + { + .name = "zeropage", + .uffd_fn = uffd_zeropage_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = 0, + }, + { + .name = "move", + .uffd_fn = uffd_move_test, + .mem_targets = MEM_ANON, + .uffd_feature_required = UFFD_FEATURE_MOVE, + .test_case_ops = &uffd_move_test_case_ops, + }, + { + .name = "move-pmd", + .uffd_fn = uffd_move_pmd_test, + .mem_targets = MEM_ANON, + .uffd_feature_required = UFFD_FEATURE_MOVE, + .test_case_ops = &uffd_move_test_pmd_case_ops, + }, + { + .name = "move-pmd-split", + .uffd_fn = uffd_move_pmd_split_test, + .mem_targets = MEM_ANON, + .uffd_feature_required = UFFD_FEATURE_MOVE, + .test_case_ops = &uffd_move_test_pmd_case_ops, + }, + { + .name = "wp-fork", + .uffd_fn = uffd_wp_fork_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, + }, + { + .name = "wp-fork-with-event", + .uffd_fn = uffd_wp_fork_with_event_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | + /* when set, child process should inherit uffd-wp bits */ + UFFD_FEATURE_EVENT_FORK, + }, + { + .name = "wp-fork-pin", + .uffd_fn = uffd_wp_fork_pin_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, + }, + { + .name = "wp-fork-pin-with-event", + .uffd_fn = uffd_wp_fork_pin_with_event_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM | + /* when set, child process should inherit uffd-wp bits */ + UFFD_FEATURE_EVENT_FORK, + }, + { + .name = "wp-unpopulated", + .uffd_fn = uffd_wp_unpopulated_test, + .mem_targets = MEM_ANON, + .uffd_feature_required = + UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_WP_UNPOPULATED, + }, + { + .name = "minor", + .uffd_fn = uffd_minor_test, + .mem_targets = MEM_SHMEM | MEM_HUGETLB, + .uffd_feature_required = + UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM, + }, + { + .name = "minor-wp", + .uffd_fn = uffd_minor_wp_test, + .mem_targets = MEM_SHMEM | MEM_HUGETLB, + .uffd_feature_required = + UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM | + UFFD_FEATURE_PAGEFAULT_FLAG_WP | + /* + * HACK: here we leveraged WP_UNPOPULATED to detect whether + * minor mode supports wr-protect. There's no feature flag + * for it so this is the best we can test against. + */ + UFFD_FEATURE_WP_UNPOPULATED, + }, + { + .name = "minor-collapse", + .uffd_fn = uffd_minor_collapse_test, + /* MADV_COLLAPSE only works with shmem */ + .mem_targets = MEM_SHMEM, + /* We can't test MADV_COLLAPSE, so try our luck */ + .uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM, + }, + { + .name = "sigbus", + .uffd_fn = uffd_sigbus_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_SIGBUS | + UFFD_FEATURE_EVENT_FORK, + }, + { + .name = "sigbus-wp", + .uffd_fn = uffd_sigbus_wp_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_SIGBUS | + UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, + }, + { + .name = "events", + .uffd_fn = uffd_events_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_EVENT_FORK | + UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE, + }, + { + .name = "events-wp", + .uffd_fn = uffd_events_wp_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_EVENT_FORK | + UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, + }, + { + .name = "poison", + .uffd_fn = uffd_poison_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_POISON, + }, +}; + +static void usage(const char *prog) +{ + printf("usage: %s [-f TESTNAME]\n", prog); + puts(""); + puts(" -f: test name to filter (e.g., event)"); + puts(" -h: show the help msg"); + puts(" -l: list tests only"); + puts(""); + exit(KSFT_FAIL); +} + +int main(int argc, char *argv[]) +{ + int n_tests = sizeof(uffd_tests) / sizeof(uffd_test_case_t); + int n_mems = sizeof(mem_types) / sizeof(mem_type_t); + const char *test_filter = NULL; + bool list_only = false; + uffd_test_case_t *test; + mem_type_t *mem_type; + uffd_test_args_t args; + const char *errmsg; + int has_uffd, opt; + int i, j; + + while ((opt = getopt(argc, argv, "f:hl")) != -1) { + switch (opt) { + case 'f': + test_filter = optarg; + break; + case 'l': + list_only = true; + break; + case 'h': + default: + /* Unknown */ + usage(argv[0]); + break; + } + } + + if (!test_filter && !list_only) { + has_uffd = test_uffd_api(false); + has_uffd |= test_uffd_api(true); + + if (!has_uffd) { + printf("Userfaultfd not supported or unprivileged, skip all tests\n"); + exit(KSFT_SKIP); + } + } + + for (i = 0; i < n_tests; i++) { + test = &uffd_tests[i]; + if (test_filter && !strstr(test->name, test_filter)) + continue; + if (list_only) { + printf("%s\n", test->name); + continue; + } + for (j = 0; j < n_mems; j++) { + mem_type = &mem_types[j]; + if (!(test->mem_targets & mem_type->mem_flag)) + continue; + + uffd_test_start("%s on %s", test->name, mem_type->name); + if ((mem_type->mem_flag == MEM_HUGETLB || + mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && + (default_huge_page_size() == 0)) { + uffd_test_skip("huge page size is 0, feature missing?"); + continue; + } + if (!uffd_feature_supported(test)) { + uffd_test_skip("feature missing"); + continue; + } + if (uffd_setup_environment(&args, test, mem_type, + &errmsg)) { + uffd_test_skip(errmsg); + continue; + } + test->uffd_fn(&args); + uffd_test_ctx_clear(); + } + } + + if (!list_only) + uffd_test_report(); + + return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("Skipping %s (missing __NR_userfaultfd)\n", __file__); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c new file mode 100644 index 000000000000..cfbc501290d3 --- /dev/null +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> + * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + */ + +#include <stdio.h> +#include <sys/mman.h> +#include <string.h> + +#include "../kselftest.h" + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#elif __aarch64__ +/* + * The default hugepage size for 64k base pagesize + * is 512MB. + */ +#define PAGE_SIZE (64 << 10) +#define HUGETLB_SIZE (512 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * The hint addr value is used to allocate addresses + * beyond the high address switch boundary. + */ + +#define ADDR_MARK_128TB (1UL << 47) +#define ADDR_MARK_256TB (1UL << 48) + +#define HIGH_ADDR_128TB ((void *) (1UL << 48)) +#define HIGH_ADDR_256TB ((void *) (1UL << 49)) + +#define LOW_ADDR ((void *) (1UL << 30)) + +#ifdef __aarch64__ +#define ADDR_SWITCH_HINT ADDR_MARK_256TB +#define HIGH_ADDR HIGH_ADDR_256TB +#else +#define ADDR_SWITCH_HINT ADDR_MARK_128TB +#define HIGH_ADDR HIGH_ADDR_128TB +#endif + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * Unless MAP_FIXED is specified, allocation based on hint + * addr is never at requested address or above it, which is + * beyond high address switch boundary in this case. Instead, + * a suitable allocation is found in lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at high address switch boundary, should + * be obtained even without MAP_FIXED as area is free. + */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = KSFT_PASS; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = KSFT_FAIL; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = KSFT_FAIL; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#elif defined(__aarch64__) + return getpagesize() == PAGE_SIZE; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return KSFT_SKIP; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh new file mode 100755 index 000000000000..a0a75f302904 --- /dev/null +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -0,0 +1,64 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022 Adam Sindelar (Meta) <adam@wowsignal.io> +# +# This is a test for mmap behavior with 5-level paging. This script wraps the +# real test to check that the kernel is configured to support at least 5 +# pagetable levels. + +# 1 means the test failed +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +fail() +{ + echo "$1" + exit $exitcode +} + +check_supported_x86_64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + # gzip -dcfq automatically handles both compressed and plaintext input. + # See man 1 gzip under '-f'. + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + + local cpu_supports_pl5=$(awk '/^flags/ {if (/la57/) {print 0;} + else {print 1}; exit}' /proc/cpuinfo 2>/dev/null) + + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + elif [[ "${cpu_supports_pl5}" -ne 0 ]]; then + echo "$0: CPU does not have the necessary la57 flag to support page table level 5" + exit $ksft_skip + fi +} + +check_test_requirements() +{ + # The test supports x86_64 and powerpc64. We currently have no useful + # eligibility check for powerpc64, and the test itself will reject other + # architectures. + case `uname -m` in + "x86_64") + check_supported_x86_64 + ;; + *) + return 0 + ;; + esac +} + +check_test_requirements +./va_high_addr_switch + +# In order to run hugetlb testcases, "--run-hugetlb" must be appended +# to the binary. +./va_high_addr_switch --run-hugetlb diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c new file mode 100644 index 000000000000..4e4c1e311247 --- /dev/null +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2017, Anshuman Khandual, IBM Corp. + * + * Works on architectures which support 128TB virtual + * address range and beyond. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <fcntl.h> + +#include "../kselftest.h" + +/* + * Maximum address range mapped with a single mmap() + * call is little bit more than 1GB. Hence 1GB is + * chosen as the single chunk size for address space + * mapping. + */ + +#define SZ_1GB (1024 * 1024 * 1024UL) +#define SZ_1TB (1024 * 1024 * 1024 * 1024UL) + +#define MAP_CHUNK_SIZE SZ_1GB + +/* + * Address space till 128TB is mapped without any hint + * and is enabled by default. Address space beyond 128TB + * till 512TB is obtained by passing hint address as the + * first argument into mmap() system call. + * + * The process heap address space is divided into two + * different areas one below 128TB and one above 128TB + * till it reaches 512TB. One with size 128TB and the + * other being 384TB. + * + * On Arm64 the address space is 256TB and support for + * high mappings up to 4PB virtual address space has + * been added. + */ + +#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ +#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) +#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) +#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL) + +#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ +#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ + +#ifdef __aarch64__ +#define HIGH_ADDR_MARK ADDR_MARK_256TB +#define HIGH_ADDR_SHIFT 49 +#define NR_CHUNKS_LOW NR_CHUNKS_256TB +#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB +#else +#define HIGH_ADDR_MARK ADDR_MARK_128TB +#define HIGH_ADDR_SHIFT 48 +#define NR_CHUNKS_LOW NR_CHUNKS_128TB +#define NR_CHUNKS_HIGH NR_CHUNKS_384TB +#endif + +static char *hind_addr(void) +{ + int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); + + return (char *) (1UL << bits); +} + +static void validate_addr(char *ptr, int high_addr) +{ + unsigned long addr = (unsigned long) ptr; + + if (high_addr && addr < HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); + + if (addr > HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); +} + +static int validate_lower_address_hint(void) +{ + char *ptr; + + ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr == MAP_FAILED) + return 0; + + return 1; +} + +static int validate_complete_va_space(void) +{ + unsigned long start_addr, end_addr, prev_end_addr; + char line[400]; + char prot[6]; + FILE *file; + int fd; + + fd = open("va_dump", O_CREAT | O_WRONLY, 0600); + unlink("va_dump"); + if (fd < 0) { + ksft_test_result_skip("cannot create or open dump file\n"); + ksft_finished(); + } + + file = fopen("/proc/self/maps", "r"); + if (file == NULL) + ksft_exit_fail_msg("cannot open /proc/self/maps\n"); + + prev_end_addr = 0; + while (fgets(line, sizeof(line), file)) { + unsigned long hop; + + if (sscanf(line, "%lx-%lx %s[rwxp-]", + &start_addr, &end_addr, prot) != 3) + ksft_exit_fail_msg("cannot parse /proc/self/maps\n"); + + /* end of userspace mappings; ignore vsyscall mapping */ + if (start_addr & (1UL << 63)) + return 0; + + /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */ + if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE) + return 1; + + prev_end_addr = end_addr; + + if (prot[0] != 'r') + continue; + + /* + * Confirm whether MAP_CHUNK_SIZE chunk can be found or not. + * If write succeeds, no need to check MAP_CHUNK_SIZE - 1 + * addresses after that. If the address was not held by this + * process, write would fail with errno set to EFAULT. + * Anyways, if write returns anything apart from 1, exit the + * program since that would mean a bug in /proc/self/maps. + */ + hop = 0; + while (start_addr + hop < end_addr) { + if (write(fd, (void *)(start_addr + hop), 1) != 1) + return 1; + lseek(fd, 0, SEEK_SET); + + hop += MAP_CHUNK_SIZE; + } + } + return 0; +} + +int main(int argc, char *argv[]) +{ + char *ptr[NR_CHUNKS_LOW]; + char **hptr; + char *hint; + unsigned long i, lchunks, hchunks; + + ksft_print_header(); + ksft_set_plan(1); + + for (i = 0; i < NR_CHUNKS_LOW; i++) { + ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr[i] == MAP_FAILED) { + if (validate_lower_address_hint()) + ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n"); + break; + } + + validate_addr(ptr[i], 0); + } + lchunks = i; + hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); + if (hptr == NULL) { + ksft_test_result_skip("Memory constraint not fulfilled\n"); + ksft_finished(); + } + + for (i = 0; i < NR_CHUNKS_HIGH; i++) { + hint = hind_addr(); + hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (hptr[i] == MAP_FAILED) + break; + + validate_addr(hptr[i], 1); + } + hchunks = i; + if (validate_complete_va_space()) { + ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n"); + ksft_finished(); + } + + for (i = 0; i < lchunks; i++) + munmap(ptr[i], MAP_CHUNK_SIZE); + + for (i = 0; i < hchunks; i++) + munmap(hptr[i], MAP_CHUNK_SIZE); + + free(hptr); + + ksft_test_result_pass("Test\n"); + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c new file mode 100644 index 000000000000..5a62530da3b5 --- /dev/null +++ b/tools/testing/selftests/mm/vm_util.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> +#include <fcntl.h> +#include <dirent.h> +#include <sys/ioctl.h> +#include <linux/userfaultfd.h> +#include <linux/fs.h> +#include <sys/syscall.h> +#include <unistd.h> +#include "../kselftest.h" +#include "vm_util.h" + +#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SMAP_FILE_PATH "/proc/self/smaps" +#define MAX_LINE_LENGTH 500 + +unsigned int __page_size; +unsigned int __page_shift; + +uint64_t pagemap_get_entry(int fd, char *start) +{ + const unsigned long pfn = (unsigned long)start / getpagesize(); + uint64_t entry; + int ret; + + ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); + if (ret != sizeof(entry)) + ksft_exit_fail_msg("reading pagemap failed\n"); + return entry; +} + +static uint64_t __pagemap_scan_get_categories(int fd, char *start, struct page_region *r) +{ + struct pm_scan_arg arg; + + arg.start = (uintptr_t)start; + arg.end = (uintptr_t)(start + psize()); + arg.vec = (uintptr_t)r; + arg.vec_len = 1; + arg.flags = 0; + arg.size = sizeof(struct pm_scan_arg); + arg.max_pages = 0; + arg.category_inverted = 0; + arg.category_mask = 0; + arg.category_anyof_mask = PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | PAGE_IS_FILE | + PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY; + arg.return_mask = arg.category_anyof_mask; + + return ioctl(fd, PAGEMAP_SCAN, &arg); +} + +static uint64_t pagemap_scan_get_categories(int fd, char *start) +{ + struct page_region r; + long ret; + + ret = __pagemap_scan_get_categories(fd, start, &r); + if (ret < 0) + ksft_exit_fail_msg("PAGEMAP_SCAN failed: %s\n", strerror(errno)); + if (ret == 0) + return 0; + return r.categories; +} + +/* `start` is any valid address. */ +static bool pagemap_scan_supported(int fd, char *start) +{ + static int supported = -1; + int ret; + + if (supported != -1) + return supported; + + /* Provide an invalid address in order to trigger EFAULT. */ + ret = __pagemap_scan_get_categories(fd, start, (struct page_region *) ~0UL); + if (ret == 0) + ksft_exit_fail_msg("PAGEMAP_SCAN succeeded unexpectedly\n"); + + supported = errno == EFAULT; + + return supported; +} + +static bool page_entry_is(int fd, char *start, char *desc, + uint64_t pagemap_flags, uint64_t pagescan_flags) +{ + bool m = pagemap_get_entry(fd, start) & pagemap_flags; + + if (pagemap_scan_supported(fd, start)) { + bool s = pagemap_scan_get_categories(fd, start) & pagescan_flags; + + if (m == s) + return m; + + ksft_exit_fail_msg( + "read and ioctl return unmatched results for %s: %d %d", desc, m, s); + } + return m; +} + +bool pagemap_is_softdirty(int fd, char *start) +{ + return page_entry_is(fd, start, "soft-dirty", + PM_SOFT_DIRTY, PAGE_IS_SOFT_DIRTY); +} + +bool pagemap_is_swapped(int fd, char *start) +{ + return page_entry_is(fd, start, "swap", PM_SWAP, PAGE_IS_SWAPPED); +} + +bool pagemap_is_populated(int fd, char *start) +{ + return page_entry_is(fd, start, "populated", + PM_PRESENT | PM_SWAP, + PAGE_IS_PRESENT | PAGE_IS_SWAPPED); +} + +unsigned long pagemap_get_pfn(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* If present (63th bit), PFN is at bit 0 -- 54. */ + if (entry & PM_PRESENT) + return entry & 0x007fffffffffffffull; + return -1ul; +} + +void clear_softdirty(void) +{ + int ret; + const char *ctrl = "4"; + int fd = open("/proc/self/clear_refs", O_WRONLY); + + if (fd < 0) + ksft_exit_fail_msg("opening clear_refs failed\n"); + ret = write(fd, ctrl, strlen(ctrl)); + close(fd); + if (ret != strlen(ctrl)) + ksft_exit_fail_msg("writing clear_refs failed\n"); +} + +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) +{ + while (fgets(buf, len, fp)) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); + if (fd == -1) + return 0; + + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + return 0; + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +} + +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) +{ + uint64_t thp = -1; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); + + fp = fopen(SMAP_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); + + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + /* + * Fetch the pattern in the same block and check the number of + * hugepages. + */ + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) + goto err_out; + + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) + ksft_exit_fail_msg("Reading smap error\n"); + +err_out: + fclose(fp); + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); +} + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); +} + +int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + ksft_exit_fail_msg("mmap transhuge\n"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + ksft_exit_fail_msg("MADV_HUGEPAGE\n"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent)) + ksft_exit_fail_msg("read pagemap\n"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - pshift())) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +int detect_hugetlb_page_sizes(size_t sizes[], int max) +{ + DIR *dir = opendir("/sys/kernel/mm/hugepages/"); + int count = 0; + + if (!dir) + return 0; + + while (count < max) { + struct dirent *entry = readdir(dir); + size_t kb; + + if (!entry) + break; + if (entry->d_type != DT_DIR) + continue; + if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) + continue; + sizes[count++] = kb * 1024; + ksft_print_msg("[INFO] detected hugetlb page size: %zu KiB\n", + kb); + } + closedir(dir); + return count; +} + +/* If `ioctls' non-NULL, the allowed ioctls will be returned into the var */ +int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor, uint64_t *ioctls) +{ + struct uffdio_register uffdio_register = { 0 }; + uint64_t mode = 0; + int ret = 0; + + if (miss) + mode |= UFFDIO_REGISTER_MODE_MISSING; + if (wp) + mode |= UFFDIO_REGISTER_MODE_WP; + if (minor) + mode |= UFFDIO_REGISTER_MODE_MINOR; + + uffdio_register.range.start = (unsigned long)addr; + uffdio_register.range.len = len; + uffdio_register.mode = mode; + + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) + ret = -errno; + else if (ioctls) + *ioctls = uffdio_register.ioctls; + + return ret; +} + +int uffd_register(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor) +{ + return uffd_register_with_ioctls(uffd, addr, len, + miss, wp, minor, NULL); +} + +int uffd_unregister(int uffd, void *addr, uint64_t len) +{ + struct uffdio_range range = { .start = (uintptr_t)addr, .len = len }; + int ret = 0; + + if (ioctl(uffd, UFFDIO_UNREGISTER, &range) == -1) + ret = -errno; + + return ret; +} + +unsigned long get_free_hugepages(void) +{ + unsigned long fhp = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return fhp; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) + break; + } + + free(line); + fclose(f); + return fhp; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h new file mode 100644 index 000000000000..9007c420d52c --- /dev/null +++ b/tools/testing/selftests/mm/vm_util.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <stdint.h> +#include <stdbool.h> +#include <sys/mman.h> +#include <err.h> +#include <strings.h> /* ffsl() */ +#include <unistd.h> /* _SC_PAGESIZE */ + +#define BIT_ULL(nr) (1ULL << (nr)) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + +extern unsigned int __page_size; +extern unsigned int __page_shift; + +static inline unsigned int psize(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned int pshift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(psize()) - 1); + return __page_shift; +} + +uint64_t pagemap_get_entry(int fd, char *start); +bool pagemap_is_softdirty(int fd, char *start); +bool pagemap_is_swapped(int fd, char *start); +bool pagemap_is_populated(int fd, char *start); +unsigned long pagemap_get_pfn(int fd, char *start); +void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); +uint64_t read_pmd_pagesize(void); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); +int64_t allocate_transhuge(void *ptr, int pagemap_fd); +unsigned long default_huge_page_size(void); +int detect_hugetlb_page_sizes(size_t sizes[], int max); + +int uffd_register(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor); +int uffd_unregister(int uffd, void *addr, uint64_t len); +int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, + bool miss, bool wp, bool minor, uint64_t *ioctls); +unsigned long get_free_hugepages(void); + +/* + * On ppc64 this will only work with radix 2M hugepage size + */ +#define HPAGE_SHIFT 21 +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh index d3d0d108924d..3d2d2eb9d6ff 100644..100755 --- a/tools/testing/selftests/vm/write_hugetlb_memory.sh +++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 set -e @@ -14,7 +14,7 @@ want_sleep=$8 reserve=$9 echo "Putting task in cgroup '$cgroup'" -echo $$ > /dev/cgroup/memory/"$cgroup"/cgroup.procs +echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs echo "Method is $method" diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c index 6a2caba19ee1..6a2caba19ee1 100644 --- a/tools/testing/selftests/vm/write_to_hugetlbfs.c +++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c |