aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/tools/testing/selftests/mm
diff options
context:
space:
mode:
Diffstat (limited to 'tools/testing/selftests/mm')
-rw-r--r--tools/testing/selftests/mm/.gitignore62
-rw-r--r--tools/testing/selftests/mm/Makefile251
-rwxr-xr-xtools/testing/selftests/mm/charge_reserved_hugetlb.sh588
-rwxr-xr-xtools/testing/selftests/mm/check_config.sh31
-rw-r--r--tools/testing/selftests/mm/compaction_test.c276
-rw-r--r--tools/testing/selftests/mm/config13
-rw-r--r--tools/testing/selftests/mm/cow.c1915
-rw-r--r--tools/testing/selftests/mm/droppable.c53
-rw-r--r--tools/testing/selftests/mm/guard-regions.c2148
-rw-r--r--tools/testing/selftests/mm/gup_longterm.c516
-rw-r--r--tools/testing/selftests/mm/gup_test.c272
-rw-r--r--tools/testing/selftests/mm/hmm-tests.c2059
-rw-r--r--tools/testing/selftests/mm/hugepage-mmap.c78
-rw-r--r--tools/testing/selftests/mm/hugepage-mremap.c168
-rw-r--r--tools/testing/selftests/mm/hugepage-shm.c81
-rw-r--r--tools/testing/selftests/mm/hugepage-vmemmap.c134
-rw-r--r--tools/testing/selftests/mm/hugetlb-madvise.c368
-rw-r--r--tools/testing/selftests/mm/hugetlb-read-hwpoison.c322
-rw-r--r--tools/testing/selftests/mm/hugetlb-soft-offline.c228
-rw-r--r--tools/testing/selftests/mm/hugetlb_dio.c125
-rw-r--r--tools/testing/selftests/mm/hugetlb_fault_after_madv.c109
-rw-r--r--tools/testing/selftests/mm/hugetlb_madv_vs_map.c126
-rwxr-xr-xtools/testing/selftests/mm/hugetlb_reparenting_test.sh243
-rw-r--r--tools/testing/selftests/mm/khugepaged.c1287
-rw-r--r--tools/testing/selftests/mm/ksm_functional_tests.c719
-rw-r--r--tools/testing/selftests/mm/ksm_tests.c920
-rw-r--r--tools/testing/selftests/mm/madv_populate.c311
-rw-r--r--tools/testing/selftests/mm/map_fixed_noreplace.c201
-rw-r--r--tools/testing/selftests/mm/map_hugetlb.c88
-rw-r--r--tools/testing/selftests/mm/map_populate.c125
-rw-r--r--tools/testing/selftests/mm/mdwe_test.c303
-rw-r--r--tools/testing/selftests/mm/memfd_secret.c346
-rw-r--r--tools/testing/selftests/mm/merge.c501
-rw-r--r--tools/testing/selftests/mm/migration.c306
-rw-r--r--tools/testing/selftests/mm/mkdirty.c380
-rw-r--r--tools/testing/selftests/mm/mlock-random-test.c267
-rw-r--r--tools/testing/selftests/mm/mlock2-tests.c456
-rw-r--r--tools/testing/selftests/mm/mlock2.h54
-rw-r--r--tools/testing/selftests/mm/mrelease_test.c184
-rw-r--r--tools/testing/selftests/mm/mremap_dontunmap.c369
-rw-r--r--tools/testing/selftests/mm/mremap_test.c867
-rw-r--r--tools/testing/selftests/mm/mseal_helpers.h41
-rw-r--r--tools/testing/selftests/mm/mseal_test.c1989
-rw-r--r--tools/testing/selftests/mm/on-fault-limit.c42
-rw-r--r--tools/testing/selftests/mm/page_frag/Makefile18
-rw-r--r--tools/testing/selftests/mm/page_frag/page_frag_test.c198
-rw-r--r--tools/testing/selftests/mm/pagemap_ioctl.c1674
-rw-r--r--tools/testing/selftests/mm/pfnmap.c249
-rw-r--r--tools/testing/selftests/mm/pkey-arm64.h140
-rw-r--r--tools/testing/selftests/mm/pkey-helpers.h222
-rw-r--r--tools/testing/selftests/mm/pkey-powerpc.h145
-rw-r--r--tools/testing/selftests/mm/pkey-x86.h165
-rw-r--r--tools/testing/selftests/mm/pkey_sighandler_tests.c546
-rw-r--r--tools/testing/selftests/mm/pkey_util.c41
-rw-r--r--tools/testing/selftests/mm/protection_keys.c1794
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh527
-rw-r--r--tools/testing/selftests/mm/settings1
-rw-r--r--tools/testing/selftests/mm/soft-dirty.c213
-rw-r--r--tools/testing/selftests/mm/split_huge_page_test.c572
-rwxr-xr-xtools/testing/selftests/mm/test_hmm.sh105
-rwxr-xr-xtools/testing/selftests/mm/test_page_frag.sh175
-rwxr-xr-xtools/testing/selftests/mm/test_vmalloc.sh177
-rw-r--r--tools/testing/selftests/mm/thp_settings.c383
-rw-r--r--tools/testing/selftests/mm/thp_settings.h87
-rw-r--r--tools/testing/selftests/mm/thuge-gen.c249
-rw-r--r--tools/testing/selftests/mm/transhuge-stress.c138
-rw-r--r--tools/testing/selftests/mm/uffd-common.c718
-rw-r--r--tools/testing/selftests/mm/uffd-common.h137
-rw-r--r--tools/testing/selftests/mm/uffd-stress.c496
-rw-r--r--tools/testing/selftests/mm/uffd-unit-tests.c1777
-rw-r--r--tools/testing/selftests/mm/uffd-wp-mremap.c383
-rw-r--r--tools/testing/selftests/mm/va_high_addr_switch.c336
-rwxr-xr-xtools/testing/selftests/mm/va_high_addr_switch.sh80
-rw-r--r--tools/testing/selftests/mm/virtual_address_range.c246
-rw-r--r--tools/testing/selftests/mm/vm_util.c526
-rw-r--r--tools/testing/selftests/mm/vm_util.h127
-rwxr-xr-xtools/testing/selftests/mm/write_hugetlb_memory.sh23
-rw-r--r--tools/testing/selftests/mm/write_to_hugetlbfs.c243
78 files changed, 32833 insertions, 0 deletions
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
new file mode 100644
index 000000000000..824266982aa3
--- /dev/null
+++ b/tools/testing/selftests/mm/.gitignore
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: GPL-2.0-only
+cow
+hugepage-mmap
+hugepage-mremap
+hugepage-shm
+hugepage-vmemmap
+hugetlb-madvise
+hugetlb-read-hwpoison
+hugetlb-soft-offline
+khugepaged
+map_hugetlb
+map_populate
+thuge-gen
+compaction_test
+migration
+mlock2-tests
+mrelease_test
+mremap_dontunmap
+mremap_test
+on-fault-limit
+transhuge-stress
+pagemap_ioctl
+pfnmap
+*.tmp*
+protection_keys
+protection_keys_32
+protection_keys_64
+madv_populate
+uffd-stress
+uffd-unit-tests
+uffd-wp-mremap
+mlock-intersect-test
+mlock-random-test
+virtual_address_range
+gup_test
+va_128TBswitch
+map_fixed_noreplace
+write_to_hugetlbfs
+hmm-tests
+memfd_secret
+hugetlb_dio
+pkey_sighandler_tests_32
+pkey_sighandler_tests_64
+soft-dirty
+split_huge_page_test
+ksm_tests
+local_config.h
+local_config.mk
+ksm_functional_tests
+mdwe_test
+gup_longterm
+mkdirty
+va_high_addr_switch
+hugetlb_fault_after_madv
+hugetlb_madv_vs_map
+mseal_test
+droppable
+hugetlb_dio
+pkey_sighandler_tests_32
+pkey_sighandler_tests_64
+guard-regions
+merge
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
new file mode 100644
index 000000000000..ae6f994d3add
--- /dev/null
+++ b/tools/testing/selftests/mm/Makefile
@@ -0,0 +1,251 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mm selftests
+
+LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
+LOCAL_HDRS += $(selfdir)/mm/mseal_helpers.h
+
+include local_config.mk
+
+ifeq ($(ARCH),)
+
+ifeq ($(CROSS_COMPILE),)
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+else
+uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+')
+endif
+ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/powerpc/')
+endif
+
+# Without this, failed build products remain, with up-to-date timestamps,
+# thus tricking Make (and you!) into believing that All Is Well, in subsequent
+# make invocations:
+.DELETE_ON_ERROR:
+
+# Avoid accidental wrong builds, due to built-in rules working just a little
+# bit too well--but not quite as well as required for our situation here.
+#
+# In other words, "make $SOME_TEST" is supposed to fail to build at all,
+# because this Makefile only supports either "make" (all), or "make /full/path".
+# However, the built-in rules, if not suppressed, will pick up CFLAGS and the
+# initial LDLIBS (but not the target-specific LDLIBS, because those are only
+# set for the full path target!). This causes it to get pretty far into building
+# things despite using incorrect values such as an *occasionally* incomplete
+# LDLIBS.
+MAKEFLAGS += --no-builtin-rules
+
+CFLAGS = -Wall -O2 -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS = -lrt -lpthread -lm
+
+# Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is
+# automatically enabled at -O1 or above. This triggers various unused-result
+# warnings where functions such as read() or write() are called and their
+# return value is not checked. Disable _FORTIFY_SOURCE to silence those
+# warnings.
+CFLAGS += -U_FORTIFY_SOURCE
+
+KDIR ?= /lib/modules/$(shell uname -r)/build
+ifneq (,$(wildcard $(KDIR)/Module.symvers))
+ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h))
+TEST_GEN_MODS_DIR := page_frag
+else
+PAGE_FRAG_WARNING = "missing page_frag_cache.h, please use a newer kernel"
+endif
+else
+PAGE_FRAG_WARNING = "missing Module.symvers, please have the kernel built first"
+endif
+
+TEST_GEN_FILES = cow
+TEST_GEN_FILES += compaction_test
+TEST_GEN_FILES += gup_longterm
+TEST_GEN_FILES += gup_test
+TEST_GEN_FILES += hmm-tests
+TEST_GEN_FILES += hugetlb-madvise
+TEST_GEN_FILES += hugetlb-read-hwpoison
+TEST_GEN_FILES += hugetlb-soft-offline
+TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-mremap
+TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += hugepage-vmemmap
+TEST_GEN_FILES += khugepaged
+TEST_GEN_FILES += madv_populate
+TEST_GEN_FILES += map_fixed_noreplace
+TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
+ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64))
+TEST_GEN_FILES += memfd_secret
+endif
+TEST_GEN_FILES += migration
+TEST_GEN_FILES += mkdirty
+TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mrelease_test
+TEST_GEN_FILES += mremap_dontunmap
+TEST_GEN_FILES += mremap_test
+TEST_GEN_FILES += mseal_test
+TEST_GEN_FILES += on-fault-limit
+TEST_GEN_FILES += pagemap_ioctl
+TEST_GEN_FILES += pfnmap
+TEST_GEN_FILES += thuge-gen
+TEST_GEN_FILES += transhuge-stress
+TEST_GEN_FILES += uffd-stress
+TEST_GEN_FILES += uffd-unit-tests
+TEST_GEN_FILES += uffd-wp-mremap
+TEST_GEN_FILES += split_huge_page_test
+TEST_GEN_FILES += ksm_tests
+TEST_GEN_FILES += ksm_functional_tests
+TEST_GEN_FILES += mdwe_test
+TEST_GEN_FILES += hugetlb_fault_after_madv
+TEST_GEN_FILES += hugetlb_madv_vs_map
+TEST_GEN_FILES += hugetlb_dio
+TEST_GEN_FILES += droppable
+TEST_GEN_FILES += guard-regions
+TEST_GEN_FILES += merge
+
+ifneq ($(ARCH),arm64)
+TEST_GEN_FILES += soft-dirty
+endif
+
+ifeq ($(ARCH),x86_64)
+CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
+
+VMTARGETS := protection_keys
+VMTARGETS += pkey_sighandler_tests
+BINARIES_32 := $(VMTARGETS:%=%_32)
+BINARIES_64 := $(VMTARGETS:%=%_64)
+
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+ifeq ($(CAN_BUILD_I386),1)
+TEST_GEN_FILES += $(BINARIES_32)
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+TEST_GEN_FILES += $(BINARIES_64)
+endif
+
+else ifeq ($(ARCH),arm64)
+TEST_GEN_FILES += protection_keys
+TEST_GEN_FILES += pkey_sighandler_tests
+else ifeq ($(ARCH),powerpc)
+TEST_GEN_FILES += protection_keys
+endif
+
+ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))
+TEST_GEN_FILES += va_high_addr_switch
+ifneq ($(ARCH),riscv64)
+TEST_GEN_FILES += virtual_address_range
+endif
+TEST_GEN_FILES += write_to_hugetlbfs
+endif
+
+TEST_PROGS := run_vmtests.sh
+
+TEST_FILES := test_vmalloc.sh
+TEST_FILES += test_hmm.sh
+TEST_FILES += va_high_addr_switch.sh
+TEST_FILES += charge_reserved_hugetlb.sh
+TEST_FILES += hugetlb_reparenting_test.sh
+TEST_FILES += test_page_frag.sh
+
+# required by charge_reserved_hugetlb.sh
+TEST_FILES += write_hugetlb_memory.sh
+
+include ../lib.mk
+
+$(TEST_GEN_PROGS): vm_util.c thp_settings.c
+$(TEST_GEN_FILES): vm_util.c thp_settings.c
+
+$(OUTPUT)/uffd-stress: uffd-common.c
+$(OUTPUT)/uffd-unit-tests: uffd-common.c
+$(OUTPUT)/uffd-wp-mremap: uffd-common.c
+$(OUTPUT)/protection_keys: pkey_util.c
+$(OUTPUT)/pkey_sighandler_tests: pkey_util.c
+
+ifeq ($(ARCH),x86_64)
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+$(BINARIES_32) $(BINARIES_64): pkey_util.c
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+$(BINARIES_32): CFLAGS += -m32 -mxsave
+$(BINARIES_32): LDLIBS += -lrt -ldl -lm
+$(BINARIES_32): $(OUTPUT)/%_32: %.c
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+$(BINARIES_64): CFLAGS += -m64 -mxsave
+$(BINARIES_64): LDLIBS += -lrt -ldl
+$(BINARIES_64): $(OUTPUT)/%_64: %.c
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+warn_32bit_failure:
+ @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \
+ echo "environment. This will reduce test coverage of 64-bit" 2>&1; \
+ echo "kernels. If you are using a Debian-like distribution," 2>&1; \
+ echo "try:"; 2>&1; \
+ echo ""; \
+ echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \
+ echo ""; \
+ echo "If you are using a Fedora-like distribution, try:"; \
+ echo ""; \
+ echo " yum install glibc-devel.*i686"; \
+ exit 0;
+endif
+endif
+
+# IOURING_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/cow: LDLIBS += $(IOURING_EXTRA_LIBS)
+
+$(OUTPUT)/gup_longterm: LDLIBS += $(IOURING_EXTRA_LIBS)
+
+$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
+
+$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
+
+$(OUTPUT)/migration: LDLIBS += -lnuma
+
+local_config.mk local_config.h: check_config.sh
+ /bin/sh ./check_config.sh $(CC)
+
+EXTRA_CLEAN += local_config.mk local_config.h
+
+ifeq ($(IOURING_EXTRA_LIBS),)
+all: warn_missing_liburing
+
+warn_missing_liburing:
+ @echo ; \
+ echo "Warning: missing liburing support. Some tests will be skipped." ; \
+ echo
+endif
+
+ifneq ($(PAGE_FRAG_WARNING),)
+all: warn_missing_page_frag
+
+warn_missing_page_frag:
+ @echo ; \
+ echo "Warning: $(PAGE_FRAG_WARNING). page_frag test will be skipped." ; \
+ echo
+endif
diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
new file mode 100755
index 000000000000..e1fe16bcbbe8
--- /dev/null
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -0,0 +1,588 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+ echo "This test must be run as root. Skipping..."
+ exit $ksft_skip
+fi
+
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+
+fault_limit_file=limit_in_bytes
+reservation_limit_file=rsvd.limit_in_bytes
+fault_usage_file=usage_in_bytes
+reservation_usage_file=rsvd.usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+ cgroup2=1
+ fault_limit_file=max
+ reservation_limit_file=rsvd.max
+ fault_usage_file=current
+ reservation_usage_file=rsvd.current
+fi
+
+if [[ $cgroup2 ]]; then
+ cgroup_path=$(mount -t cgroup2 | head -1 | awk '{print $3}')
+ if [[ -z "$cgroup_path" ]]; then
+ cgroup_path=$(mktemp -d)
+ mount -t cgroup2 none $cgroup_path
+ do_umount=1
+ fi
+ echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
+else
+ cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
+ if [[ -z "$cgroup_path" ]]; then
+ cgroup_path=$(mktemp -d)
+ mount -t cgroup memory,hugetlb $cgroup_path
+ do_umount=1
+ fi
+fi
+export cgroup_path
+
+function cleanup() {
+ if [[ $cgroup2 ]]; then
+ echo $$ >$cgroup_path/cgroup.procs
+ else
+ echo $$ >$cgroup_path/tasks
+ fi
+
+ if [[ -e /mnt/huge ]]; then
+ rm -rf /mnt/huge/*
+ umount /mnt/huge || echo error
+ rmdir /mnt/huge
+ fi
+ if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then
+ rmdir $cgroup_path/hugetlb_cgroup_test
+ fi
+ if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then
+ rmdir $cgroup_path/hugetlb_cgroup_test1
+ fi
+ if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then
+ rmdir $cgroup_path/hugetlb_cgroup_test2
+ fi
+ echo 0 >/proc/sys/vm/nr_hugepages
+ echo CLEANUP DONE
+}
+
+function expect_equal() {
+ local expected="$1"
+ local actual="$2"
+ local error="$3"
+
+ if [[ "$expected" != "$actual" ]]; then
+ echo "expected ($expected) != actual ($actual): $3"
+ cleanup
+ exit 1
+ fi
+}
+
+function get_machine_hugepage_size() {
+ hpz=$(grep -i hugepagesize /proc/meminfo)
+ kb=${hpz:14:-3}
+ mb=$(($kb / 1024))
+ echo $mb
+}
+
+MB=$(get_machine_hugepage_size)
+
+function setup_cgroup() {
+ local name="$1"
+ local cgroup_limit="$2"
+ local reservation_limit="$3"
+
+ mkdir $cgroup_path/$name
+
+ echo writing cgroup limit: "$cgroup_limit"
+ echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file
+
+ echo writing reseravation limit: "$reservation_limit"
+ echo "$reservation_limit" > \
+ $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file
+
+ if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then
+ echo 0 >$cgroup_path/$name/cpuset.cpus
+ fi
+ if [ -e "$cgroup_path/$name/cpuset.mems" ]; then
+ echo 0 >$cgroup_path/$name/cpuset.mems
+ fi
+}
+
+function wait_for_hugetlb_memory_to_get_depleted() {
+ local cgroup="$1"
+ local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+ # Wait for hugetlbfs memory to get depleted.
+ while [ $(cat $path) != 0 ]; do
+ echo Waiting for hugetlb memory to get depleted.
+ cat $path
+ sleep 0.5
+ done
+}
+
+function wait_for_hugetlb_memory_to_get_reserved() {
+ local cgroup="$1"
+ local size="$2"
+
+ local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+ # Wait for hugetlbfs memory to get written.
+ while [ $(cat $path) != $size ]; do
+ echo Waiting for hugetlb memory reservation to reach size $size.
+ cat $path
+ sleep 0.5
+ done
+}
+
+function wait_for_hugetlb_memory_to_get_written() {
+ local cgroup="$1"
+ local size="$2"
+
+ local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
+ # Wait for hugetlbfs memory to get written.
+ while [ $(cat $path) != $size ]; do
+ echo Waiting for hugetlb memory to reach size $size.
+ cat $path
+ sleep 0.5
+ done
+}
+
+function write_hugetlbfs_and_get_usage() {
+ local cgroup="$1"
+ local size="$2"
+ local populate="$3"
+ local write="$4"
+ local path="$5"
+ local method="$6"
+ local private="$7"
+ local expect_failure="$8"
+ local reserve="$9"
+
+ # Function return values.
+ reservation_failed=0
+ oom_killed=0
+ hugetlb_difference=0
+ reserved_difference=0
+
+ local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
+ local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file
+
+ local hugetlb_before=$(cat $hugetlb_usage)
+ local reserved_before=$(cat $reserved_usage)
+
+ echo
+ echo Starting:
+ echo hugetlb_usage="$hugetlb_before"
+ echo reserved_usage="$reserved_before"
+ echo expect_failure is "$expect_failure"
+
+ output=$(mktemp)
+ set +e
+ if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] ||
+ [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then
+
+ bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+ "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output &
+
+ local write_result=$?
+ local write_pid=$!
+
+ until grep -q -i "DONE" $output; do
+ echo waiting for DONE signal.
+ if ! ps $write_pid > /dev/null
+ then
+ echo "FAIL: The write died"
+ cleanup
+ exit 1
+ fi
+ sleep 0.5
+ done
+
+ echo ================= write_hugetlb_memory.sh output is:
+ cat $output
+ echo ================= end output.
+
+ if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then
+ wait_for_hugetlb_memory_to_get_written "$cgroup" "$size"
+ elif [[ "$reserve" != "-n" ]]; then
+ wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+ else
+ # This case doesn't produce visible effects, but we still have
+ # to wait for the async process to start and execute...
+ sleep 0.5
+ fi
+
+ echo write_result is $write_result
+ else
+ bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+ "$cgroup" "$path" "$method" "$private" "$reserve"
+ local write_result=$?
+
+ if [[ "$reserve" != "-n" ]]; then
+ wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+ fi
+ fi
+ set -e
+
+ if [[ "$write_result" == 1 ]]; then
+ reservation_failed=1
+ fi
+
+ # On linus/master, the above process gets SIGBUS'd on oomkill, with
+ # return code 135. On earlier kernels, it gets actual oomkill, with return
+ # code 137, so just check for both conditions in case we're testing
+ # against an earlier kernel.
+ if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then
+ oom_killed=1
+ fi
+
+ local hugetlb_after=$(cat $hugetlb_usage)
+ local reserved_after=$(cat $reserved_usage)
+
+ echo After write:
+ echo hugetlb_usage="$hugetlb_after"
+ echo reserved_usage="$reserved_after"
+
+ hugetlb_difference=$(($hugetlb_after - $hugetlb_before))
+ reserved_difference=$(($reserved_after - $reserved_before))
+}
+
+function cleanup_hugetlb_memory() {
+ set +e
+ local cgroup="$1"
+ if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then
+ echo killing write_to_hugetlbfs
+ killall -2 --wait write_to_hugetlbfs
+ wait_for_hugetlb_memory_to_get_depleted $cgroup
+ fi
+ set -e
+
+ if [[ -e /mnt/huge ]]; then
+ rm -rf /mnt/huge/*
+ umount /mnt/huge
+ rmdir /mnt/huge
+ fi
+}
+
+function run_test() {
+ local size=$(($1 * ${MB} * 1024 * 1024))
+ local populate="$2"
+ local write="$3"
+ local cgroup_limit=$(($4 * ${MB} * 1024 * 1024))
+ local reservation_limit=$(($5 * ${MB} * 1024 * 1024))
+ local nr_hugepages="$6"
+ local method="$7"
+ local private="$8"
+ local expect_failure="$9"
+ local reserve="${10}"
+
+ # Function return values.
+ hugetlb_difference=0
+ reserved_difference=0
+ reservation_failed=0
+ oom_killed=0
+
+ echo nr hugepages = "$nr_hugepages"
+ echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+ setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit"
+
+ mkdir -p /mnt/huge
+ mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+ write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \
+ "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
+ "$reserve"
+
+ cleanup_hugetlb_memory "hugetlb_cgroup_test"
+
+ local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file)
+ local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file)
+
+ echo $hugetlb_difference
+ echo $reserved_difference
+ expect_equal "0" "$final_hugetlb" "final hugetlb is not zero"
+ expect_equal "0" "$final_reservation" "final reservation is not zero"
+}
+
+function run_multiple_cgroup_test() {
+ local size1="$1"
+ local populate1="$2"
+ local write1="$3"
+ local cgroup_limit1="$4"
+ local reservation_limit1="$5"
+
+ local size2="$6"
+ local populate2="$7"
+ local write2="$8"
+ local cgroup_limit2="$9"
+ local reservation_limit2="${10}"
+
+ local nr_hugepages="${11}"
+ local method="${12}"
+ local private="${13}"
+ local expect_failure="${14}"
+ local reserve="${15}"
+
+ # Function return values.
+ hugetlb_difference1=0
+ reserved_difference1=0
+ reservation_failed1=0
+ oom_killed1=0
+
+ hugetlb_difference2=0
+ reserved_difference2=0
+ reservation_failed2=0
+ oom_killed2=0
+
+ echo nr hugepages = "$nr_hugepages"
+ echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+ setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1"
+ setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"
+
+ mkdir -p /mnt/huge
+ mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+ write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
+ "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
+ "$expect_failure" "$reserve"
+
+ hugetlb_difference1=$hugetlb_difference
+ reserved_difference1=$reserved_difference
+ reservation_failed1=$reservation_failed
+ oom_killed1=$oom_killed
+
+ local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
+ local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
+ local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
+ local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file
+
+ local usage_before_second_write=$(cat $cgroup1_hugetlb_usage)
+ local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage)
+
+ write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \
+ "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \
+ "$expect_failure" "$reserve"
+
+ hugetlb_difference2=$hugetlb_difference
+ reserved_difference2=$reserved_difference
+ reservation_failed2=$reservation_failed
+ oom_killed2=$oom_killed
+
+ expect_equal "$usage_before_second_write" \
+ "$(cat $cgroup1_hugetlb_usage)" "Usage changed."
+ expect_equal "$reservation_usage_before_second_write" \
+ "$(cat $cgroup1_reservation_usage)" "Reservation usage changed."
+
+ cleanup_hugetlb_memory
+
+ local final_hugetlb=$(cat $cgroup1_hugetlb_usage)
+ local final_reservation=$(cat $cgroup1_reservation_usage)
+
+ expect_equal "0" "$final_hugetlb" \
+ "hugetlbt_cgroup_test1 final hugetlb is not zero"
+ expect_equal "0" "$final_reservation" \
+ "hugetlbt_cgroup_test1 final reservation is not zero"
+
+ local final_hugetlb=$(cat $cgroup2_hugetlb_usage)
+ local final_reservation=$(cat $cgroup2_reservation_usage)
+
+ expect_equal "0" "$final_hugetlb" \
+ "hugetlb_cgroup_test2 final hugetlb is not zero"
+ expect_equal "0" "$final_reservation" \
+ "hugetlb_cgroup_test2 final reservation is not zero"
+}
+
+cleanup
+
+for populate in "" "-o"; do
+ for method in 0 1 2; do
+ for private in "" "-r"; do
+ for reserve in "" "-n"; do
+
+ # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported.
+ if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then
+ continue
+ fi
+
+ # Skip populated shmem tests. Doesn't seem to be supported.
+ if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then
+ continue
+ fi
+
+ if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then
+ continue
+ fi
+
+ cleanup
+ echo
+ echo
+ echo
+ echo Test normal case.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb=$hugetlb_difference
+ echo Memory charged to reservation=$reserved_difference
+
+ if [[ "$populate" == "-o" ]]; then
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+ "Reserved memory charged to hugetlb cgroup."
+ else
+ expect_equal "0" "$hugetlb_difference" \
+ "Reserved memory charged to hugetlb cgroup."
+ fi
+
+ if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+ "Reserved memory not charged to reservation usage."
+ else
+ expect_equal "0" "$reserved_difference" \
+ "Reserved memory not charged to reservation usage."
+ fi
+
+ echo 'PASS'
+
+ cleanup
+ echo
+ echo
+ echo
+ echo Test normal case with write.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb=$hugetlb_difference
+ echo Memory charged to reservation=$reserved_difference
+
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+ "Reserved memory charged to hugetlb cgroup."
+
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+ "Reserved memory not charged to reservation usage."
+
+ echo 'PASS'
+
+ cleanup
+ continue
+ echo
+ echo
+ echo
+ echo Test more than reservation case.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+ if [ "$reserve" != "-n" ]; then
+ run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \
+ "$reserve"
+
+ expect_equal "1" "$reservation_failed" "Reservation succeeded."
+ fi
+
+ echo 'PASS'
+
+ cleanup
+
+ echo
+ echo
+ echo
+ echo Test more than cgroup limit case.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+ # Not sure if shm memory can be cleaned up when the process gets sigbus'd.
+ if [[ "$method" != 2 ]]; then
+ run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve"
+
+ expect_equal "1" "$oom_killed" "Not oom killed."
+ fi
+ echo 'PASS'
+
+ cleanup
+
+ echo
+ echo
+ echo
+ echo Test normal case, multiple cgroups.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \
+ "$populate" "" "10" "10" "10" \
+ "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb1=$hugetlb_difference1
+ echo Memory charged to reservation1=$reserved_difference1
+ echo Memory charged to hugtlb2=$hugetlb_difference2
+ echo Memory charged to reservation2=$reserved_difference2
+
+ if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+ expect_equal "3" "$reserved_difference1" \
+ "Incorrect reservations charged to cgroup 1."
+
+ expect_equal "5" "$reserved_difference2" \
+ "Incorrect reservation charged to cgroup 2."
+
+ else
+ expect_equal "0" "$reserved_difference1" \
+ "Incorrect reservations charged to cgroup 1."
+
+ expect_equal "0" "$reserved_difference2" \
+ "Incorrect reservation charged to cgroup 2."
+ fi
+
+ if [[ "$populate" == "-o" ]]; then
+ expect_equal "3" "$hugetlb_difference1" \
+ "Incorrect hugetlb charged to cgroup 1."
+
+ expect_equal "5" "$hugetlb_difference2" \
+ "Incorrect hugetlb charged to cgroup 2."
+
+ else
+ expect_equal "0" "$hugetlb_difference1" \
+ "Incorrect hugetlb charged to cgroup 1."
+
+ expect_equal "0" "$hugetlb_difference2" \
+ "Incorrect hugetlb charged to cgroup 2."
+ fi
+ echo 'PASS'
+
+ cleanup
+ echo
+ echo
+ echo
+ echo Test normal case with write, multiple cgroups.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
+ "$populate" "-w" "10" "10" "10" \
+ "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb1=$hugetlb_difference1
+ echo Memory charged to reservation1=$reserved_difference1
+ echo Memory charged to hugtlb2=$hugetlb_difference2
+ echo Memory charged to reservation2=$reserved_difference2
+
+ expect_equal "3" "$hugetlb_difference1" \
+ "Incorrect hugetlb charged to cgroup 1."
+
+ expect_equal "3" "$reserved_difference1" \
+ "Incorrect reservation charged to cgroup 1."
+
+ expect_equal "5" "$hugetlb_difference2" \
+ "Incorrect hugetlb charged to cgroup 2."
+
+ expect_equal "5" "$reserved_difference2" \
+ "Incorrected reservation charged to cgroup 2."
+ echo 'PASS'
+
+ cleanup
+
+ done # reserve
+ done # private
+ done # populate
+done # method
+
+if [[ $do_umount ]]; then
+ umount $cgroup_path
+ rmdir $cgroup_path
+fi
+
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh
new file mode 100755
index 000000000000..3954f4746161
--- /dev/null
+++ b/tools/testing/selftests/mm/check_config.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Probe for libraries and create header files to record the results. Both C
+# header files and Makefile include fragments are created.
+
+OUTPUT_H_FILE=local_config.h
+OUTPUT_MKFILE=local_config.mk
+
+tmpname=$(mktemp)
+tmpfile_c=${tmpname}.c
+tmpfile_o=${tmpname}.o
+
+# liburing
+echo "#include <sys/types.h>" > $tmpfile_c
+echo "#include <liburing.h>" >> $tmpfile_c
+echo "int func(void) { return 0; }" >> $tmpfile_c
+
+CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
+$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+
+if [ -f $tmpfile_o ]; then
+ echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE
+ echo "IOURING_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE
+else
+ echo "// No liburing support found" > $OUTPUT_H_FILE
+ echo "# No liburing support found, so:" > $OUTPUT_MKFILE
+ echo "IOURING_EXTRA_LIBS = " >> $OUTPUT_MKFILE
+fi
+
+rm ${tmpname}.*
diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
new file mode 100644
index 000000000000..9bc4591c7b16
--- /dev/null
+++ b/tools/testing/selftests/mm/compaction_test.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * A test for the patch "Allow compaction of unevictable pages".
+ * With this patch we should be able to allocate at least 1/4
+ * of RAM in huge pages. Without the patch much less is
+ * allocated.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#define MAP_SIZE_MB 100
+#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024)
+
+struct map_list {
+ void *map;
+ struct map_list *next;
+};
+
+int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
+{
+ char buffer[256] = {0};
+ char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
+ FILE *cmdfile = popen(cmd, "r");
+
+ if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+ ksft_print_msg("Failed to read meminfo: %s\n", strerror(errno));
+ return -1;
+ }
+
+ pclose(cmdfile);
+
+ *memfree = atoll(buffer);
+ cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
+ cmdfile = popen(cmd, "r");
+
+ if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+ ksft_print_msg("Failed to read meminfo: %s\n", strerror(errno));
+ return -1;
+ }
+
+ pclose(cmdfile);
+ *hugepagesize = atoll(buffer);
+
+ return 0;
+}
+
+int prereq(void)
+{
+ char allowed;
+ int fd;
+
+ fd = open("/proc/sys/vm/compact_unevictable_allowed",
+ O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ ksft_print_msg("Failed to open /proc/sys/vm/compact_unevictable_allowed: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
+ ksft_print_msg("Failed to read from /proc/sys/vm/compact_unevictable_allowed: %s\n",
+ strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ close(fd);
+ if (allowed == '1')
+ return 0;
+
+ ksft_print_msg("Compaction isn't allowed\n");
+ return -1;
+}
+
+int check_compaction(unsigned long mem_free, unsigned long hugepage_size,
+ unsigned long initial_nr_hugepages)
+{
+ unsigned long nr_hugepages_ul;
+ int fd, ret = -1;
+ int compaction_index = 0;
+ char nr_hugepages[20] = {0};
+ char init_nr_hugepages[24] = {0};
+ char target_nr_hugepages[24] = {0};
+ int slen;
+
+ snprintf(init_nr_hugepages, sizeof(init_nr_hugepages),
+ "%lu", initial_nr_hugepages);
+
+ /* We want to test with 80% of available memory. Else, OOM killer comes
+ in to play */
+ mem_free = mem_free * 0.8;
+
+ fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+ if (fd < 0) {
+ ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
+ ret = -1;
+ goto out;
+ }
+
+ /*
+ * Request huge pages for about half of the free memory. The Kernel
+ * will allocate as much as it can, and we expect it will get at least 1/3
+ */
+ nr_hugepages_ul = mem_free / hugepage_size / 2;
+ snprintf(target_nr_hugepages, sizeof(target_nr_hugepages),
+ "%lu", nr_hugepages_ul);
+
+ slen = strlen(target_nr_hugepages);
+ if (write(fd, target_nr_hugepages, slen) != slen) {
+ ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n",
+ nr_hugepages_ul, strerror(errno));
+ goto close_fd;
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
+ ksft_print_msg("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
+ goto close_fd;
+ }
+
+ /* We should have been able to request at least 1/3 rd of the memory in
+ huge pages */
+ nr_hugepages_ul = strtoul(nr_hugepages, NULL, 10);
+ if (!nr_hugepages_ul) {
+ ksft_print_msg("ERROR: No memory is available as huge pages\n");
+ goto close_fd;
+ }
+ compaction_index = mem_free/(nr_hugepages_ul * hugepage_size);
+
+ lseek(fd, 0, SEEK_SET);
+
+ if (write(fd, init_nr_hugepages, strlen(init_nr_hugepages))
+ != strlen(init_nr_hugepages)) {
+ ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
+ goto close_fd;
+ }
+
+ ksft_print_msg("Number of huge pages allocated = %lu\n",
+ nr_hugepages_ul);
+
+ if (compaction_index > 3) {
+ ksft_print_msg("ERROR: Less than 1/%d of memory is available\n"
+ "as huge pages\n", compaction_index);
+ goto close_fd;
+ }
+
+ ret = 0;
+
+ close_fd:
+ close(fd);
+ out:
+ ksft_test_result(ret == 0, "check_compaction\n");
+ return ret;
+}
+
+int set_zero_hugepages(unsigned long *initial_nr_hugepages)
+{
+ int fd, ret = -1;
+ char nr_hugepages[20] = {0};
+
+ fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+ if (fd < 0) {
+ ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
+ goto out;
+ }
+ if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
+ ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
+ goto close_fd;
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ /* Start with the initial condition of 0 huge pages */
+ if (write(fd, "0", sizeof(char)) != sizeof(char)) {
+ ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n",
+ strerror(errno));
+ goto close_fd;
+ }
+
+ *initial_nr_hugepages = strtoul(nr_hugepages, NULL, 10);
+ ret = 0;
+
+ close_fd:
+ close(fd);
+
+ out:
+ return ret;
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit lim;
+ struct map_list *list = NULL, *entry;
+ size_t page_size, i;
+ void *map = NULL;
+ unsigned long mem_free = 0;
+ unsigned long hugepage_size = 0;
+ long mem_fragmentable_MB = 0;
+ unsigned long initial_nr_hugepages;
+
+ ksft_print_header();
+
+ if (prereq() || geteuid())
+ ksft_exit_skip("Prerequisites unsatisfied\n");
+
+ ksft_set_plan(1);
+
+ /* Start the test without hugepages reducing mem_free */
+ if (set_zero_hugepages(&initial_nr_hugepages))
+ ksft_exit_fail();
+
+ lim.rlim_cur = RLIM_INFINITY;
+ lim.rlim_max = RLIM_INFINITY;
+ if (setrlimit(RLIMIT_MEMLOCK, &lim))
+ ksft_exit_fail_msg("Failed to set rlimit: %s\n", strerror(errno));
+
+ page_size = getpagesize();
+
+ if (read_memory_info(&mem_free, &hugepage_size) != 0)
+ ksft_exit_fail_msg("Failed to get meminfo\n");
+
+ mem_fragmentable_MB = mem_free * 0.8 / 1024;
+
+ while (mem_fragmentable_MB > 0) {
+ map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+ if (map == MAP_FAILED)
+ break;
+
+ entry = malloc(sizeof(struct map_list));
+ if (!entry) {
+ munmap(map, MAP_SIZE);
+ break;
+ }
+ entry->map = map;
+ entry->next = list;
+ list = entry;
+
+ /* Write something (in this case the address of the map) to
+ * ensure that KSM can't merge the mapped pages
+ */
+ for (i = 0; i < MAP_SIZE; i += page_size)
+ *(unsigned long *)(map + i) = (unsigned long)map + i;
+
+ mem_fragmentable_MB -= MAP_SIZE_MB;
+ }
+
+ for (entry = list; entry != NULL; entry = entry->next) {
+ munmap(entry->map, MAP_SIZE);
+ if (!entry->next)
+ break;
+ entry = entry->next;
+ }
+
+ if (check_compaction(mem_free, hugepage_size,
+ initial_nr_hugepages) == 0)
+ ksft_exit_pass();
+
+ ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
new file mode 100644
index 000000000000..deba93379c80
--- /dev/null
+++ b/tools/testing/selftests/mm/config
@@ -0,0 +1,13 @@
+CONFIG_SYSVIPC=y
+CONFIG_USERFAULTFD=y
+CONFIG_PTE_MARKER_UFFD_WP=y
+CONFIG_TEST_VMALLOC=m
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_TEST_HMM=m
+CONFIG_GUP_TEST=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_MEM_SOFT_DIRTY=y
+CONFIG_ANON_VMA_NAME=y
+CONFIG_FTRACE=y
+CONFIG_PROFILING=y
+CONFIG_UPROBES=y
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
new file mode 100644
index 000000000000..dbbcc5eb3dce
--- /dev/null
+++ b/tools/testing/selftests/mm/cow.c
@@ -0,0 +1,1915 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * COW (Copy On Write) tests.
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <linux/memfd.h>
+
+#include "local_config.h"
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+#include <liburing.h>
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+#include "../../../../mm/gup_test.h"
+#include "../kselftest.h"
+#include "vm_util.h"
+#include "thp_settings.h"
+
+static size_t pagesize;
+static int pagemap_fd;
+static size_t pmdsize;
+static int nr_thpsizes;
+static size_t thpsizes[20];
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+static int gup_fd;
+static bool has_huge_zeropage;
+
+static int sz2ord(size_t size)
+{
+ return __builtin_ctzll(size / pagesize);
+}
+
+static int detect_thp_sizes(size_t sizes[], int max)
+{
+ int count = 0;
+ unsigned long orders;
+ size_t kb;
+ int i;
+
+ /* thp not supported at all. */
+ if (!pmdsize)
+ return 0;
+
+ orders = 1UL << sz2ord(pmdsize);
+ orders |= thp_supported_orders();
+
+ for (i = 0; orders && count < max; i++) {
+ if (!(orders & (1UL << i)))
+ continue;
+ orders &= ~(1UL << i);
+ kb = (pagesize >> 10) << i;
+ sizes[count++] = kb * 1024;
+ ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
+ }
+
+ return count;
+}
+
+static void detect_huge_zeropage(void)
+{
+ int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
+ O_RDONLY);
+ size_t enabled = 0;
+ char buf[15];
+ int ret;
+
+ if (fd < 0)
+ return;
+
+ ret = pread(fd, buf, sizeof(buf), 0);
+ if (ret > 0 && ret < sizeof(buf)) {
+ buf[ret] = 0;
+
+ enabled = strtoul(buf, NULL, 10);
+ if (enabled == 1) {
+ has_huge_zeropage = true;
+ ksft_print_msg("[INFO] huge zeropage is enabled\n");
+ }
+ }
+
+ close(fd);
+}
+
+static bool range_is_swapped(void *addr, size_t size)
+{
+ for (; size; addr += pagesize, size -= pagesize)
+ if (!pagemap_is_swapped(pagemap_fd, addr))
+ return false;
+ return true;
+}
+
+struct comm_pipes {
+ int child_ready[2];
+ int parent_ready[2];
+};
+
+static int setup_comm_pipes(struct comm_pipes *comm_pipes)
+{
+ if (pipe(comm_pipes->child_ready) < 0) {
+ ksft_perror("pipe()");
+ return -errno;
+ }
+ if (pipe(comm_pipes->parent_ready) < 0) {
+ ksft_perror("pipe()");
+ close(comm_pipes->child_ready[0]);
+ close(comm_pipes->child_ready[1]);
+ return -errno;
+ }
+
+ return 0;
+}
+
+static void close_comm_pipes(struct comm_pipes *comm_pipes)
+{
+ close(comm_pipes->child_ready[0]);
+ close(comm_pipes->child_ready[1]);
+ close(comm_pipes->parent_ready[0]);
+ close(comm_pipes->parent_ready[1]);
+}
+
+static int child_memcmp_fn(char *mem, size_t size,
+ struct comm_pipes *comm_pipes)
+{
+ char *old = malloc(size);
+ char buf;
+
+ /* Backup the original content. */
+ memcpy(old, mem, size);
+
+ /* Wait until the parent modified the page. */
+ write(comm_pipes->child_ready[1], "0", 1);
+ while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+ ;
+
+ /* See if we still read the old values. */
+ return memcmp(old, mem, size);
+}
+
+static int child_vmsplice_memcmp_fn(char *mem, size_t size,
+ struct comm_pipes *comm_pipes)
+{
+ struct iovec iov = {
+ .iov_base = mem,
+ .iov_len = size,
+ };
+ ssize_t cur, total, transferred;
+ char *old, *new;
+ int fds[2];
+ char buf;
+
+ old = malloc(size);
+ new = malloc(size);
+
+ /* Backup the original content. */
+ memcpy(old, mem, size);
+
+ if (pipe(fds) < 0)
+ return -errno;
+
+ /* Trigger a read-only pin. */
+ transferred = vmsplice(fds[1], &iov, 1, 0);
+ if (transferred < 0)
+ return -errno;
+ if (transferred == 0)
+ return -EINVAL;
+
+ /* Unmap it from our page tables. */
+ if (munmap(mem, size) < 0)
+ return -errno;
+
+ /* Wait until the parent modified it. */
+ write(comm_pipes->child_ready[1], "0", 1);
+ while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+ ;
+
+ /* See if we still read the old values via the pipe. */
+ for (total = 0; total < transferred; total += cur) {
+ cur = read(fds[0], new + total, transferred - total);
+ if (cur < 0)
+ return -errno;
+ }
+
+ return memcmp(old, new, transferred);
+}
+
+typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
+
+static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
+ child_fn fn, bool xfail)
+{
+ struct comm_pipes comm_pipes;
+ char buf;
+ int ret;
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ ret = fork();
+ if (ret < 0) {
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ } else if (!ret) {
+ exit(fn(mem, size, &comm_pipes));
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+
+ if (do_mprotect) {
+ /*
+ * mprotect() optimizations might try avoiding
+ * write-faults by directly mapping pages writable.
+ */
+ ret = mprotect(mem, size, PROT_READ);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ goto close_comm_pipes;
+ }
+
+ ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ goto close_comm_pipes;
+ }
+ }
+
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+ write(comm_pipes.parent_ready[1], "0", 1);
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+
+ if (!ret) {
+ log_test_result(KSFT_PASS);
+ } else if (xfail) {
+ /*
+ * With hugetlb, some vmsplice() tests are currently expected to
+ * fail because (a) harder to fix and (b) nobody really cares.
+ * Flag them as expected failure for now.
+ */
+ log_test_result(KSFT_XFAIL);
+ } else {
+ log_test_result(KSFT_FAIL);
+ }
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+}
+
+static void test_cow_in_parent(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_cow_in_parent(mem, size, false, child_memcmp_fn, false);
+}
+
+static void test_cow_in_parent_mprotect(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_cow_in_parent(mem, size, true, child_memcmp_fn, false);
+}
+
+static void test_vmsplice_in_child(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn,
+ is_hugetlb);
+}
+
+static void test_vmsplice_in_child_mprotect(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn,
+ is_hugetlb);
+}
+
+static void do_test_vmsplice_in_parent(char *mem, size_t size,
+ bool before_fork, bool xfail)
+{
+ struct iovec iov = {
+ .iov_base = mem,
+ .iov_len = size,
+ };
+ ssize_t cur, total, transferred = 0;
+ struct comm_pipes comm_pipes;
+ char *old, *new;
+ int ret, fds[2];
+ char buf;
+
+ old = malloc(size);
+ new = malloc(size);
+
+ memcpy(old, mem, size);
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ log_test_result(KSFT_FAIL);
+ goto free;
+ }
+
+ if (pipe(fds) < 0) {
+ ksft_perror("pipe() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ }
+
+ if (before_fork) {
+ transferred = vmsplice(fds[1], &iov, 1, 0);
+ if (transferred <= 0) {
+ ksft_print_msg("vmsplice() failed\n");
+ log_test_result(KSFT_FAIL);
+ goto close_pipe;
+ }
+ }
+
+ ret = fork();
+ if (ret < 0) {
+ ksft_perror("fork() failed\n");
+ log_test_result(KSFT_FAIL);
+ goto close_pipe;
+ } else if (!ret) {
+ write(comm_pipes.child_ready[1], "0", 1);
+ while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+ ;
+ /* Modify page content in the child. */
+ memset(mem, 0xff, size);
+ exit(0);
+ }
+
+ if (!before_fork) {
+ transferred = vmsplice(fds[1], &iov, 1, 0);
+ if (transferred <= 0) {
+ ksft_perror("vmsplice() failed");
+ log_test_result(KSFT_FAIL);
+ wait(&ret);
+ goto close_pipe;
+ }
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+ if (munmap(mem, size) < 0) {
+ ksft_perror("munmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_pipe;
+ }
+ write(comm_pipes.parent_ready[1], "0", 1);
+
+ /* Wait until the child is done writing. */
+ wait(&ret);
+ if (!WIFEXITED(ret)) {
+ ksft_perror("wait() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_pipe;
+ }
+
+ /* See if we still read the old values. */
+ for (total = 0; total < transferred; total += cur) {
+ cur = read(fds[0], new + total, transferred - total);
+ if (cur < 0) {
+ ksft_perror("read() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_pipe;
+ }
+ }
+
+ if (!memcmp(old, new, transferred)) {
+ log_test_result(KSFT_PASS);
+ } else if (xfail) {
+ /*
+ * With hugetlb, some vmsplice() tests are currently expected to
+ * fail because (a) harder to fix and (b) nobody really cares.
+ * Flag them as expected failure for now.
+ */
+ log_test_result(KSFT_XFAIL);
+ } else {
+ log_test_result(KSFT_FAIL);
+ }
+close_pipe:
+ close(fds[0]);
+ close(fds[1]);
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+free:
+ free(old);
+ free(new);
+}
+
+static void test_vmsplice_before_fork(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_vmsplice_in_parent(mem, size, true, is_hugetlb);
+}
+
+static void test_vmsplice_after_fork(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_vmsplice_in_parent(mem, size, false, is_hugetlb);
+}
+
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+static void do_test_iouring(char *mem, size_t size, bool use_fork)
+{
+ struct comm_pipes comm_pipes;
+ struct io_uring_cqe *cqe;
+ struct io_uring_sqe *sqe;
+ struct io_uring ring;
+ ssize_t cur, total;
+ struct iovec iov;
+ char *buf, *tmp;
+ int ret, fd;
+ FILE *file;
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ file = tmpfile();
+ if (!file) {
+ ksft_perror("tmpfile() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ }
+ fd = fileno(file);
+ assert(fd);
+
+ tmp = malloc(size);
+ if (!tmp) {
+ ksft_print_msg("malloc() failed\n");
+ log_test_result(KSFT_FAIL);
+ goto close_file;
+ }
+
+ /* Skip on errors, as we might just lack kernel support. */
+ ret = io_uring_queue_init(1, &ring, 0);
+ if (ret < 0) {
+ ksft_print_msg("io_uring_queue_init() failed\n");
+ log_test_result(KSFT_SKIP);
+ goto free_tmp;
+ }
+
+ /*
+ * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
+ * | FOLL_LONGTERM the range.
+ *
+ * Skip on errors, as we might just lack kernel support or might not
+ * have sufficient MEMLOCK permissions.
+ */
+ iov.iov_base = mem;
+ iov.iov_len = size;
+ ret = io_uring_register_buffers(&ring, &iov, 1);
+ if (ret) {
+ ksft_print_msg("io_uring_register_buffers() failed\n");
+ log_test_result(KSFT_SKIP);
+ goto queue_exit;
+ }
+
+ if (use_fork) {
+ /*
+ * fork() and keep the child alive until we're done. Note that
+ * we expect the pinned page to not get shared with the child.
+ */
+ ret = fork();
+ if (ret < 0) {
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
+ goto unregister_buffers;
+ } else if (!ret) {
+ write(comm_pipes.child_ready[1], "0", 1);
+ while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+ ;
+ exit(0);
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+ } else {
+ /*
+ * Map the page R/O into the page table. Enable softdirty
+ * tracking to stop the page from getting mapped R/W immediately
+ * again by mprotect() optimizations. Note that we don't have an
+ * easy way to test if that worked (the pagemap does not export
+ * if the page is mapped R/O vs. R/W).
+ */
+ ret = mprotect(mem, size, PROT_READ);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto unregister_buffers;
+ }
+
+ clear_softdirty();
+ ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto unregister_buffers;
+ }
+ }
+
+ /*
+ * Modify the page and write page content as observed by the fixed
+ * buffer pin to the file so we can verify it.
+ */
+ memset(mem, 0xff, size);
+ sqe = io_uring_get_sqe(&ring);
+ if (!sqe) {
+ ksft_print_msg("io_uring_get_sqe() failed\n");
+ log_test_result(KSFT_FAIL);
+ goto quit_child;
+ }
+ io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
+
+ ret = io_uring_submit(&ring);
+ if (ret < 0) {
+ ksft_print_msg("io_uring_submit() failed\n");
+ log_test_result(KSFT_FAIL);
+ goto quit_child;
+ }
+
+ ret = io_uring_wait_cqe(&ring, &cqe);
+ if (ret < 0) {
+ ksft_print_msg("io_uring_wait_cqe() failed\n");
+ log_test_result(KSFT_FAIL);
+ goto quit_child;
+ }
+
+ if (cqe->res != size) {
+ ksft_print_msg("write_fixed failed\n");
+ log_test_result(KSFT_FAIL);
+ goto quit_child;
+ }
+ io_uring_cqe_seen(&ring, cqe);
+
+ /* Read back the file content to the temporary buffer. */
+ total = 0;
+ while (total < size) {
+ cur = pread(fd, tmp + total, size - total, total);
+ if (cur < 0) {
+ ksft_print_msg("pread() failed\n");
+ log_test_result(KSFT_FAIL);
+ goto quit_child;
+ }
+ total += cur;
+ }
+
+ /* Finally, check if we read what we expected. */
+ if (!memcmp(mem, tmp, size))
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
+
+quit_child:
+ if (use_fork) {
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ }
+unregister_buffers:
+ io_uring_unregister_buffers(&ring);
+queue_exit:
+ io_uring_queue_exit(&ring);
+free_tmp:
+ free(tmp);
+close_file:
+ fclose(file);
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+}
+
+static void test_iouring_ro(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_iouring(mem, size, false);
+}
+
+static void test_iouring_fork(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_iouring(mem, size, true);
+}
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+enum ro_pin_test {
+ RO_PIN_TEST,
+ RO_PIN_TEST_SHARED,
+ RO_PIN_TEST_PREVIOUSLY_SHARED,
+ RO_PIN_TEST_RO_EXCLUSIVE,
+};
+
+static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
+ bool fast)
+{
+ struct pin_longterm_test args;
+ struct comm_pipes comm_pipes;
+ char *tmp, buf;
+ __u64 tmp_val;
+ int ret;
+
+ if (gup_fd < 0) {
+ ksft_print_msg("gup_test not available\n");
+ log_test_result(KSFT_SKIP);
+ return;
+ }
+
+ tmp = malloc(size);
+ if (!tmp) {
+ ksft_print_msg("malloc() failed\n");
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ log_test_result(KSFT_FAIL);
+ goto free_tmp;
+ }
+
+ switch (test) {
+ case RO_PIN_TEST:
+ break;
+ case RO_PIN_TEST_SHARED:
+ case RO_PIN_TEST_PREVIOUSLY_SHARED:
+ /*
+ * Share the pages with our child. As the pages are not pinned,
+ * this should just work.
+ */
+ ret = fork();
+ if (ret < 0) {
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ } else if (!ret) {
+ write(comm_pipes.child_ready[1], "0", 1);
+ while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+ ;
+ exit(0);
+ }
+
+ /* Wait until our child is ready. */
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+
+ if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
+ /*
+ * Tell the child to quit now and wait until it quit.
+ * The pages should now be mapped R/O into our page
+ * tables, but they are no longer shared.
+ */
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ if (!WIFEXITED(ret))
+ ksft_print_msg("[INFO] wait() failed\n");
+ }
+ break;
+ case RO_PIN_TEST_RO_EXCLUSIVE:
+ /*
+ * Map the page R/O into the page table. Enable softdirty
+ * tracking to stop the page from getting mapped R/W immediately
+ * again by mprotect() optimizations. Note that we don't have an
+ * easy way to test if that worked (the pagemap does not export
+ * if the page is mapped R/O vs. R/W).
+ */
+ ret = mprotect(mem, size, PROT_READ);
+ clear_softdirty();
+ ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ /* Take a R/O pin. This should trigger unsharing. */
+ args.addr = (__u64)(uintptr_t)mem;
+ args.size = size;
+ args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
+ if (ret) {
+ if (errno == EINVAL)
+ ret = KSFT_SKIP;
+ else
+ ret = KSFT_FAIL;
+ ksft_perror("PIN_LONGTERM_TEST_START failed");
+ log_test_result(ret);
+ goto wait;
+ }
+
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+
+ /*
+ * Read back the content via the pin to the temporary buffer and
+ * test if we observed the modification.
+ */
+ tmp_val = (__u64)(uintptr_t)tmp;
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
+ if (ret) {
+ ksft_perror("PIN_LONGTERM_TEST_READ failed");
+ log_test_result(KSFT_FAIL);
+ } else {
+ if (!memcmp(mem, tmp, size))
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
+ }
+
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
+ if (ret)
+ ksft_perror("PIN_LONGTERM_TEST_STOP failed");
+wait:
+ switch (test) {
+ case RO_PIN_TEST_SHARED:
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ if (!WIFEXITED(ret))
+ ksft_perror("wait() failed");
+ break;
+ default:
+ break;
+ }
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+free_tmp:
+ free(tmp);
+}
+
+static void test_ro_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_shared(char *mem, size_t size, bool is_hugetlb)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_exclusive(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
+}
+
+static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
+}
+
+typedef void (*test_fn)(char *mem, size_t size, bool hugetlb);
+
+static void do_run_with_base_page(test_fn fn, bool swapout)
+{
+ char *mem;
+ int ret;
+
+ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
+ /* Ignore if not around on a kernel. */
+ if (ret && errno != EINVAL) {
+ ksft_perror("MADV_NOHUGEPAGE failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /* Populate a base page. */
+ memset(mem, 1, pagesize);
+
+ if (swapout) {
+ madvise(mem, pagesize, MADV_PAGEOUT);
+ if (!pagemap_is_swapped(pagemap_fd, mem)) {
+ ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
+ log_test_result(KSFT_SKIP);
+ goto munmap;
+ }
+ }
+
+ fn(mem, pagesize, false);
+munmap:
+ munmap(mem, pagesize);
+}
+
+static void run_with_base_page(test_fn fn, const char *desc)
+{
+ log_test_start("%s ... with base page", desc);
+ do_run_with_base_page(fn, false);
+}
+
+static void run_with_base_page_swap(test_fn fn, const char *desc)
+{
+ log_test_start("%s ... with swapped out base page", desc);
+ do_run_with_base_page(fn, true);
+}
+
+enum thp_run {
+ THP_RUN_PMD,
+ THP_RUN_PMD_SWAPOUT,
+ THP_RUN_PTE,
+ THP_RUN_PTE_SWAPOUT,
+ THP_RUN_SINGLE_PTE,
+ THP_RUN_SINGLE_PTE_SWAPOUT,
+ THP_RUN_PARTIAL_MREMAP,
+ THP_RUN_PARTIAL_SHARED,
+};
+
+static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
+{
+ char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
+ size_t size, mmap_size, mremap_size;
+ int ret;
+
+ /* For alignment purposes, we need twice the thp size. */
+ mmap_size = 2 * thpsize;
+ mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mmap_mem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ /* We need a THP-aligned memory area. */
+ mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+ ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+ if (ret) {
+ ksft_perror("MADV_HUGEPAGE failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /*
+ * Try to populate a THP. Touch the first sub-page and test if
+ * we get the last sub-page populated automatically.
+ */
+ mem[0] = 1;
+ if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
+ ksft_print_msg("Did not get a THP populated\n");
+ log_test_result(KSFT_SKIP);
+ goto munmap;
+ }
+ memset(mem, 1, thpsize);
+
+ size = thpsize;
+ switch (thp_run) {
+ case THP_RUN_PMD:
+ case THP_RUN_PMD_SWAPOUT:
+ assert(thpsize == pmdsize);
+ break;
+ case THP_RUN_PTE:
+ case THP_RUN_PTE_SWAPOUT:
+ /*
+ * Trigger PTE-mapping the THP by temporarily mapping a single
+ * subpage R/O. This is a noop if the THP is not pmdsize (and
+ * therefore already PTE-mapped).
+ */
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ break;
+ case THP_RUN_SINGLE_PTE:
+ case THP_RUN_SINGLE_PTE_SWAPOUT:
+ /*
+ * Discard all but a single subpage of that PTE-mapped THP. What
+ * remains is a single PTE mapping a single subpage.
+ */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
+ if (ret) {
+ ksft_perror("MADV_DONTNEED failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ size = pagesize;
+ break;
+ case THP_RUN_PARTIAL_MREMAP:
+ /*
+ * Remap half of the THP. We need some new memory location
+ * for that.
+ */
+ mremap_size = thpsize / 2;
+ mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mremap_mem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
+ if (tmp != mremap_mem) {
+ ksft_perror("mremap() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ size = mremap_size;
+ break;
+ case THP_RUN_PARTIAL_SHARED:
+ /*
+ * Share the first page of the THP with a child and quit the
+ * child. This will result in some parts of the THP never
+ * have been shared.
+ */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
+ if (ret) {
+ ksft_perror("MADV_DONTFORK failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ ret = fork();
+ if (ret < 0) {
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ } else if (!ret) {
+ exit(0);
+ }
+ wait(&ret);
+ /* Allow for sharing all pages again. */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
+ if (ret) {
+ ksft_perror("MADV_DOFORK failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ switch (thp_run) {
+ case THP_RUN_PMD_SWAPOUT:
+ case THP_RUN_PTE_SWAPOUT:
+ case THP_RUN_SINGLE_PTE_SWAPOUT:
+ madvise(mem, size, MADV_PAGEOUT);
+ if (!range_is_swapped(mem, size)) {
+ ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
+ log_test_result(KSFT_SKIP);
+ goto munmap;
+ }
+ break;
+ default:
+ break;
+ }
+
+ fn(mem, size, false);
+munmap:
+ munmap(mmap_mem, mmap_size);
+ if (mremap_mem != MAP_FAILED)
+ munmap(mremap_mem, mremap_size);
+}
+
+static void run_with_thp(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_PMD, size);
+}
+
+static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with swapped-out THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
+}
+
+static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with PTE-mapped THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_PTE, size);
+}
+
+static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
+}
+
+static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with single PTE of THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
+}
+
+static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
+}
+
+static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
+}
+
+static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
+{
+ log_test_start("%s ... with partially shared THP (%zu kB)",
+ desc, size / 1024);
+ do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
+}
+
+static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
+{
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+ char *mem, *dummy;
+
+ log_test_start("%s ... with hugetlb (%zu kB)", desc,
+ hugetlbsize / 1024);
+
+ flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
+
+ mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
+ return;
+ }
+
+ /* Populate an huge page. */
+ memset(mem, 1, hugetlbsize);
+
+ /*
+ * We need a total of two hugetlb pages to handle COW/unsharing
+ * properly, otherwise we might get zapped by a SIGBUS.
+ */
+ dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (dummy == MAP_FAILED) {
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
+ goto munmap;
+ }
+ munmap(dummy, hugetlbsize);
+
+ fn(mem, hugetlbsize, true);
+munmap:
+ munmap(mem, hugetlbsize);
+}
+
+struct test_case {
+ const char *desc;
+ test_fn fn;
+};
+
+/*
+ * Test cases that are specific to anonymous pages: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+static const struct test_case anon_test_cases[] = {
+ /*
+ * Basic COW tests for fork() without any GUP. If we miss to break COW,
+ * either the child can observe modifications by the parent or the
+ * other way around.
+ */
+ {
+ "Basic COW after fork()",
+ test_cow_in_parent,
+ },
+ /*
+ * Basic test, but do an additional mprotect(PROT_READ)+
+ * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+ */
+ {
+ "Basic COW after fork() with mprotect() optimization",
+ test_cow_in_parent_mprotect,
+ },
+ /*
+ * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
+ * we miss to break COW, the child observes modifications by the parent.
+ * This is CVE-2020-29374 reported by Jann Horn.
+ */
+ {
+ "vmsplice() + unmap in child",
+ test_vmsplice_in_child,
+ },
+ /*
+ * vmsplice() test, but do an additional mprotect(PROT_READ)+
+ * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+ */
+ {
+ "vmsplice() + unmap in child with mprotect() optimization",
+ test_vmsplice_in_child_mprotect,
+ },
+ /*
+ * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
+ * fork(); modify in the child. If we miss to break COW, the parent
+ * observes modifications by the child.
+ */
+ {
+ "vmsplice() before fork(), unmap in parent after fork()",
+ test_vmsplice_before_fork,
+ },
+ /*
+ * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
+ * child. If we miss to break COW, the parent observes modifications by
+ * the child.
+ */
+ {
+ "vmsplice() + unmap in parent after fork()",
+ test_vmsplice_after_fork,
+ },
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+ /*
+ * Take a R/W longterm pin and then map the page R/O into the page
+ * table to trigger a write fault on next access. When modifying the
+ * page, the page content must be visible via the pin.
+ */
+ {
+ "R/O-mapping a page registered as iouring fixed buffer",
+ test_iouring_ro,
+ },
+ /*
+ * Take a R/W longterm pin and then fork() a child. When modifying the
+ * page, the page content must be visible via the pin. We expect the
+ * pinned page to not get shared with the child.
+ */
+ {
+ "fork() with an iouring fixed buffer",
+ test_iouring_fork,
+ },
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+ /*
+ * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
+ * When modifying the page via the page table, the page content change
+ * must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped shared page",
+ test_ro_pin_on_shared,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped shared page",
+ test_ro_fast_pin_on_shared,
+ },
+ /*
+ * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
+ * was previously shared. When modifying the page via the page table,
+ * the page content change must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped previously-shared page",
+ test_ro_pin_on_ro_previously_shared,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped previously-shared page",
+ test_ro_fast_pin_on_ro_previously_shared,
+ },
+ /*
+ * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
+ * When modifying the page via the page table, the page content change
+ * must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped exclusive page",
+ test_ro_pin_on_ro_exclusive,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped exclusive page",
+ test_ro_fast_pin_on_ro_exclusive,
+ },
+};
+
+static void run_anon_test_case(struct test_case const *test_case)
+{
+ int i;
+
+ run_with_base_page(test_case->fn, test_case->desc);
+ run_with_base_page_swap(test_case->fn, test_case->desc);
+ for (i = 0; i < nr_thpsizes; i++) {
+ size_t size = thpsizes[i];
+ struct thp_settings settings = *thp_current_settings();
+
+ settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER;
+ settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
+ thp_push_settings(&settings);
+
+ if (size == pmdsize) {
+ run_with_thp(test_case->fn, test_case->desc, size);
+ run_with_thp_swap(test_case->fn, test_case->desc, size);
+ }
+
+ run_with_pte_mapped_thp(test_case->fn, test_case->desc, size);
+ run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size);
+ run_with_single_pte_of_thp(test_case->fn, test_case->desc, size);
+ run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size);
+ run_with_partial_mremap_thp(test_case->fn, test_case->desc, size);
+ run_with_partial_shared_thp(test_case->fn, test_case->desc, size);
+
+ thp_pop_settings();
+ }
+ for (i = 0; i < nr_hugetlbsizes; i++)
+ run_with_hugetlb(test_case->fn, test_case->desc,
+ hugetlbsizes[i]);
+}
+
+static void run_anon_test_cases(void)
+{
+ int i;
+
+ ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
+
+ for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
+ run_anon_test_case(&anon_test_cases[i]);
+}
+
+static int tests_per_anon_test_case(void)
+{
+ int tests = 2 + nr_hugetlbsizes;
+
+ tests += 6 * nr_thpsizes;
+ if (pmdsize)
+ tests += 2;
+ return tests;
+}
+
+enum anon_thp_collapse_test {
+ ANON_THP_COLLAPSE_UNSHARED,
+ ANON_THP_COLLAPSE_FULLY_SHARED,
+ ANON_THP_COLLAPSE_LOWER_SHARED,
+ ANON_THP_COLLAPSE_UPPER_SHARED,
+};
+
+static void do_test_anon_thp_collapse(char *mem, size_t size,
+ enum anon_thp_collapse_test test)
+{
+ struct comm_pipes comm_pipes;
+ char buf;
+ int ret;
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ /*
+ * Trigger PTE-mapping the THP by temporarily mapping a single subpage
+ * R/O, such that we can try collapsing it later.
+ */
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ }
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_perror("mprotect() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ }
+
+ switch (test) {
+ case ANON_THP_COLLAPSE_UNSHARED:
+ /* Collapse before actually COW-sharing the page. */
+ ret = madvise(mem, size, MADV_COLLAPSE);
+ if (ret) {
+ ksft_perror("MADV_COLLAPSE failed");
+ log_test_result(KSFT_SKIP);
+ goto close_comm_pipes;
+ }
+ break;
+ case ANON_THP_COLLAPSE_FULLY_SHARED:
+ /* COW-share the full PTE-mapped THP. */
+ break;
+ case ANON_THP_COLLAPSE_LOWER_SHARED:
+ /* Don't COW-share the upper part of the THP. */
+ ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
+ if (ret) {
+ ksft_perror("MADV_DONTFORK failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ }
+ break;
+ case ANON_THP_COLLAPSE_UPPER_SHARED:
+ /* Don't COW-share the lower part of the THP. */
+ ret = madvise(mem, size / 2, MADV_DONTFORK);
+ if (ret) {
+ ksft_perror("MADV_DONTFORK failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ ret = fork();
+ if (ret < 0) {
+ ksft_perror("fork() failed");
+ log_test_result(KSFT_FAIL);
+ goto close_comm_pipes;
+ } else if (!ret) {
+ switch (test) {
+ case ANON_THP_COLLAPSE_UNSHARED:
+ case ANON_THP_COLLAPSE_FULLY_SHARED:
+ exit(child_memcmp_fn(mem, size, &comm_pipes));
+ break;
+ case ANON_THP_COLLAPSE_LOWER_SHARED:
+ exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
+ break;
+ case ANON_THP_COLLAPSE_UPPER_SHARED:
+ exit(child_memcmp_fn(mem + size / 2, size / 2,
+ &comm_pipes));
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+
+ switch (test) {
+ case ANON_THP_COLLAPSE_UNSHARED:
+ break;
+ case ANON_THP_COLLAPSE_UPPER_SHARED:
+ case ANON_THP_COLLAPSE_LOWER_SHARED:
+ /*
+ * Revert MADV_DONTFORK such that we merge the VMAs and are
+ * able to actually collapse.
+ */
+ ret = madvise(mem, size, MADV_DOFORK);
+ if (ret) {
+ ksft_perror("MADV_DOFORK failed");
+ log_test_result(KSFT_FAIL);
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ goto close_comm_pipes;
+ }
+ /* FALLTHROUGH */
+ case ANON_THP_COLLAPSE_FULLY_SHARED:
+ /* Collapse before anyone modified the COW-shared page. */
+ ret = madvise(mem, size, MADV_COLLAPSE);
+ if (ret) {
+ ksft_perror("MADV_COLLAPSE failed");
+ log_test_result(KSFT_SKIP);
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ goto close_comm_pipes;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+ write(comm_pipes.parent_ready[1], "0", 1);
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+
+ if (!ret)
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+}
+
+static void test_anon_thp_collapse_unshared(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ assert(!is_hugetlb);
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
+}
+
+static void test_anon_thp_collapse_fully_shared(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ assert(!is_hugetlb);
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
+}
+
+static void test_anon_thp_collapse_lower_shared(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ assert(!is_hugetlb);
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
+}
+
+static void test_anon_thp_collapse_upper_shared(char *mem, size_t size,
+ bool is_hugetlb)
+{
+ assert(!is_hugetlb);
+ do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
+}
+
+/*
+ * Test cases that are specific to anonymous THP: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+static const struct test_case anon_thp_test_cases[] = {
+ /*
+ * Basic COW test for fork() without any GUP when collapsing a THP
+ * before fork().
+ *
+ * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
+ * collapse") might easily get COW handling wrong when not collapsing
+ * exclusivity information properly.
+ */
+ {
+ "Basic COW after fork() when collapsing before fork()",
+ test_anon_thp_collapse_unshared,
+ },
+ /* Basic COW test, but collapse after COW-sharing a full THP. */
+ {
+ "Basic COW after fork() when collapsing after fork() (fully shared)",
+ test_anon_thp_collapse_fully_shared,
+ },
+ /*
+ * Basic COW test, but collapse after COW-sharing the lower half of a
+ * THP.
+ */
+ {
+ "Basic COW after fork() when collapsing after fork() (lower shared)",
+ test_anon_thp_collapse_lower_shared,
+ },
+ /*
+ * Basic COW test, but collapse after COW-sharing the upper half of a
+ * THP.
+ */
+ {
+ "Basic COW after fork() when collapsing after fork() (upper shared)",
+ test_anon_thp_collapse_upper_shared,
+ },
+};
+
+static void run_anon_thp_test_cases(void)
+{
+ int i;
+
+ if (!pmdsize)
+ return;
+
+ ksft_print_msg("[INFO] Anonymous THP tests\n");
+
+ for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
+ struct test_case const *test_case = &anon_thp_test_cases[i];
+
+ log_test_start("%s", test_case->desc);
+ do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
+ }
+}
+
+static int tests_per_anon_thp_test_case(void)
+{
+ return pmdsize ? 1 : 0;
+}
+
+typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
+
+static void test_cow(char *mem, const char *smem, size_t size)
+{
+ char *old = malloc(size);
+
+ /* Backup the original content. */
+ memcpy(old, smem, size);
+
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+
+ /* See if we still read the old values via the other mapping. */
+ if (!memcmp(smem, old, size))
+ log_test_result(KSFT_PASS);
+ else
+ log_test_result(KSFT_FAIL);
+ free(old);
+}
+
+static void test_ro_pin(char *mem, const char *smem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST, false);
+}
+
+static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST, true);
+}
+
+static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
+{
+ char *mem, *smem, tmp;
+
+ log_test_start("%s ... with shared zeropage", desc);
+
+ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (smem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /* Read from the page to populate the shared zeropage. */
+ tmp = *mem + *smem;
+ asm volatile("" : "+r" (tmp));
+
+ fn(mem, smem, pagesize);
+munmap:
+ munmap(mem, pagesize);
+ if (smem != MAP_FAILED)
+ munmap(smem, pagesize);
+}
+
+static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
+{
+ char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
+ size_t mmap_size;
+ int ret;
+
+ log_test_start("%s ... with huge zeropage", desc);
+
+ if (!has_huge_zeropage) {
+ ksft_print_msg("Huge zeropage not enabled\n");
+ log_test_result(KSFT_SKIP);
+ return;
+ }
+
+ /* For alignment purposes, we need twice the thp size. */
+ mmap_size = 2 * pmdsize;
+ mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mmap_mem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+ mmap_smem = mmap(NULL, mmap_size, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mmap_smem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /* We need a THP-aligned memory area. */
+ mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1));
+ smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));
+
+ ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
+ if (ret != 0) {
+ ksft_perror("madvise()");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+ ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
+ if (ret != 0) {
+ ksft_perror("madvise()");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /*
+ * Read from the memory to populate the huge shared zeropage. Read from
+ * the first sub-page and test if we get another sub-page populated
+ * automatically.
+ */
+ tmp = *mem + *smem;
+ asm volatile("" : "+r" (tmp));
+ if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
+ !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
+ ksft_test_result_skip("Did not get THPs populated\n");
+ goto munmap;
+ }
+
+ fn(mem, smem, pmdsize);
+munmap:
+ munmap(mmap_mem, mmap_size);
+ if (mmap_smem != MAP_FAILED)
+ munmap(mmap_smem, mmap_size);
+}
+
+static void run_with_memfd(non_anon_test_fn fn, const char *desc)
+{
+ char *mem, *smem, tmp;
+ int fd;
+
+ log_test_start("%s ... with memfd", desc);
+
+ fd = memfd_create("test", 0);
+ if (fd < 0) {
+ ksft_perror("memfd_create() failed");
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ /* File consists of a single page filled with zeroes. */
+ if (fallocate(fd, 0, 0, pagesize)) {
+ ksft_perror("fallocate() failed");
+ log_test_result(KSFT_FAIL);
+ goto close;
+ }
+
+ /* Create a private mapping of the memfd. */
+ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ if (mem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto close;
+ }
+ smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+ if (smem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /* Fault the page in. */
+ tmp = *mem + *smem;
+ asm volatile("" : "+r" (tmp));
+
+ fn(mem, smem, pagesize);
+munmap:
+ munmap(mem, pagesize);
+ if (smem != MAP_FAILED)
+ munmap(smem, pagesize);
+close:
+ close(fd);
+}
+
+static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
+{
+ char *mem, *smem, tmp;
+ FILE *file;
+ int fd;
+
+ log_test_start("%s ... with tmpfile", desc);
+
+ file = tmpfile();
+ if (!file) {
+ ksft_perror("tmpfile() failed");
+ log_test_result(KSFT_FAIL);
+ return;
+ }
+
+ fd = fileno(file);
+ if (fd < 0) {
+ ksft_perror("fileno() failed");
+ log_test_result(KSFT_SKIP);
+ return;
+ }
+
+ /* File consists of a single page filled with zeroes. */
+ if (fallocate(fd, 0, 0, pagesize)) {
+ ksft_perror("fallocate() failed");
+ log_test_result(KSFT_FAIL);
+ goto close;
+ }
+
+ /* Create a private mapping of the memfd. */
+ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ if (mem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto close;
+ }
+ smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+ if (smem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /* Fault the page in. */
+ tmp = *mem + *smem;
+ asm volatile("" : "+r" (tmp));
+
+ fn(mem, smem, pagesize);
+munmap:
+ munmap(mem, pagesize);
+ if (smem != MAP_FAILED)
+ munmap(smem, pagesize);
+close:
+ fclose(file);
+}
+
+static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
+ size_t hugetlbsize)
+{
+ int flags = MFD_HUGETLB;
+ char *mem, *smem, tmp;
+ int fd;
+
+ log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
+ hugetlbsize / 1024);
+
+ flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
+
+ fd = memfd_create("test", flags);
+ if (fd < 0) {
+ ksft_perror("memfd_create() failed");
+ log_test_result(KSFT_SKIP);
+ return;
+ }
+
+ /* File consists of a single page filled with zeroes. */
+ if (fallocate(fd, 0, 0, hugetlbsize)) {
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
+ goto close;
+ }
+
+ /* Create a private mapping of the memfd. */
+ mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
+ 0);
+ if (mem == MAP_FAILED) {
+ ksft_perror("need more free huge pages");
+ log_test_result(KSFT_SKIP);
+ goto close;
+ }
+ smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
+ if (smem == MAP_FAILED) {
+ ksft_perror("mmap() failed");
+ log_test_result(KSFT_FAIL);
+ goto munmap;
+ }
+
+ /* Fault the page in. */
+ tmp = *mem + *smem;
+ asm volatile("" : "+r" (tmp));
+
+ fn(mem, smem, hugetlbsize);
+munmap:
+ munmap(mem, hugetlbsize);
+ if (smem != MAP_FAILED)
+ munmap(smem, hugetlbsize);
+close:
+ close(fd);
+}
+
+struct non_anon_test_case {
+ const char *desc;
+ non_anon_test_fn fn;
+};
+
+/*
+ * Test cases that target any pages in private mappings that are not anonymous:
+ * pages that may get shared via COW ndependent of fork(). This includes
+ * the shared zeropage(s), pagecache pages, ...
+ */
+static const struct non_anon_test_case non_anon_test_cases[] = {
+ /*
+ * Basic COW test without any GUP. If we miss to break COW, changes are
+ * visible via other private/shared mappings.
+ */
+ {
+ "Basic COW",
+ test_cow,
+ },
+ /*
+ * Take a R/O longterm pin. When modifying the page via the page table,
+ * the page content change must be visible via the pin.
+ */
+ {
+ "R/O longterm GUP pin",
+ test_ro_pin,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O longterm GUP-fast pin",
+ test_ro_fast_pin,
+ },
+};
+
+static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
+{
+ int i;
+
+ run_with_zeropage(test_case->fn, test_case->desc);
+ run_with_memfd(test_case->fn, test_case->desc);
+ run_with_tmpfile(test_case->fn, test_case->desc);
+ if (pmdsize)
+ run_with_huge_zeropage(test_case->fn, test_case->desc);
+ for (i = 0; i < nr_hugetlbsizes; i++)
+ run_with_memfd_hugetlb(test_case->fn, test_case->desc,
+ hugetlbsizes[i]);
+}
+
+static void run_non_anon_test_cases(void)
+{
+ int i;
+
+ ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
+
+ for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
+ run_non_anon_test_case(&non_anon_test_cases[i]);
+}
+
+static int tests_per_non_anon_test_case(void)
+{
+ int tests = 3 + nr_hugetlbsizes;
+
+ if (pmdsize)
+ tests += 1;
+ return tests;
+}
+
+int main(int argc, char **argv)
+{
+ struct thp_settings default_settings;
+
+ ksft_print_header();
+
+ pagesize = getpagesize();
+ pmdsize = read_pmd_pagesize();
+ if (pmdsize) {
+ /* Only if THP is supported. */
+ thp_read_settings(&default_settings);
+ default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT;
+ thp_save_settings();
+ thp_push_settings(&default_settings);
+
+ ksft_print_msg("[INFO] detected PMD size: %zu KiB\n",
+ pmdsize / 1024);
+ nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
+ }
+ nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
+ ARRAY_SIZE(hugetlbsizes));
+ detect_huge_zeropage();
+
+ ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
+ ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
+ ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
+
+ gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+
+ run_anon_test_cases();
+ run_anon_thp_test_cases();
+ run_non_anon_test_cases();
+
+ if (pmdsize) {
+ /* Only if THP is supported. */
+ thp_restore_settings();
+ }
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c
new file mode 100644
index 000000000000..f3d9ecf96890
--- /dev/null
+++ b/tools/testing/selftests/mm/droppable.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+
+#include "../kselftest.h"
+
+int main(int argc, char *argv[])
+{
+ size_t alloc_size = 134217728;
+ size_t page_size = getpagesize();
+ void *alloc;
+ pid_t child;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
+ assert(alloc != MAP_FAILED);
+ memset(alloc, 'A', alloc_size);
+ for (size_t i = 0; i < alloc_size; i += page_size)
+ assert(*(uint8_t *)(alloc + i));
+
+ child = fork();
+ assert(child >= 0);
+ if (!child) {
+ for (;;)
+ *(char *)malloc(page_size) = 'B';
+ }
+
+ for (bool done = false; !done;) {
+ for (size_t i = 0; i < alloc_size; i += page_size) {
+ if (!*(uint8_t *)(alloc + i)) {
+ done = true;
+ break;
+ }
+ }
+ }
+ kill(child, SIGTERM);
+
+ ksft_test_result_pass("MAP_DROPPABLE: PASS\n");
+ exit(KSFT_PASS);
+}
diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c
new file mode 100644
index 000000000000..93af3d3760f9
--- /dev/null
+++ b/tools/testing/selftests/mm/guard-regions.c
@@ -0,0 +1,2148 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <asm-generic/mman.h> /* Force the import of the tools version. */
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+#include <linux/userfaultfd.h>
+#include <linux/fs.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include "vm_util.h"
+
+#include "../pidfd/pidfd.h"
+
+/*
+ * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
+ *
+ * "If the signal occurs other than as the result of calling the abort or raise
+ * function, the behavior is undefined if the signal handler refers to any
+ * object with static storage duration other than by assigning a value to an
+ * object declared as volatile sig_atomic_t"
+ */
+static volatile sig_atomic_t signal_jump_set;
+static sigjmp_buf signal_jmp_buf;
+
+/*
+ * Ignore the checkpatch warning, we must read from x but don't want to do
+ * anything with it in order to trigger a read page fault. We therefore must use
+ * volatile to stop the compiler from optimising this away.
+ */
+#define FORCE_READ(x) (*(volatile typeof(x) *)x)
+
+/*
+ * How is the test backing the mapping being tested?
+ */
+enum backing_type {
+ ANON_BACKED,
+ SHMEM_BACKED,
+ LOCAL_FILE_BACKED,
+};
+
+FIXTURE(guard_regions)
+{
+ unsigned long page_size;
+ char path[PATH_MAX];
+ int fd;
+};
+
+FIXTURE_VARIANT(guard_regions)
+{
+ enum backing_type backing;
+};
+
+FIXTURE_VARIANT_ADD(guard_regions, anon)
+{
+ .backing = ANON_BACKED,
+};
+
+FIXTURE_VARIANT_ADD(guard_regions, shmem)
+{
+ .backing = SHMEM_BACKED,
+};
+
+FIXTURE_VARIANT_ADD(guard_regions, file)
+{
+ .backing = LOCAL_FILE_BACKED,
+};
+
+static bool is_anon_backed(const FIXTURE_VARIANT(guard_regions) * variant)
+{
+ switch (variant->backing) {
+ case ANON_BACKED:
+ case SHMEM_BACKED:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void *mmap_(FIXTURE_DATA(guard_regions) * self,
+ const FIXTURE_VARIANT(guard_regions) * variant,
+ void *addr, size_t length, int prot, int extra_flags,
+ off_t offset)
+{
+ int fd;
+ int flags = extra_flags;
+
+ switch (variant->backing) {
+ case ANON_BACKED:
+ flags |= MAP_PRIVATE | MAP_ANON;
+ fd = -1;
+ break;
+ case SHMEM_BACKED:
+ case LOCAL_FILE_BACKED:
+ flags |= MAP_SHARED;
+ fd = self->fd;
+ break;
+ default:
+ ksft_exit_fail();
+ break;
+ }
+
+ return mmap(addr, length, prot, flags, fd, offset);
+}
+
+static int userfaultfd(int flags)
+{
+ return syscall(SYS_userfaultfd, flags);
+}
+
+static void handle_fatal(int c)
+{
+ if (!signal_jump_set)
+ return;
+
+ siglongjmp(signal_jmp_buf, c);
+}
+
+static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
+ size_t n, int advice, unsigned int flags)
+{
+ return syscall(__NR_process_madvise, pidfd, iovec, n, advice, flags);
+}
+
+/*
+ * Enable our signal catcher and try to read/write the specified buffer. The
+ * return value indicates whether the read/write succeeds without a fatal
+ * signal.
+ */
+static bool try_access_buf(char *ptr, bool write)
+{
+ bool failed;
+
+ /* Tell signal handler to jump back here on fatal signal. */
+ signal_jump_set = true;
+ /* If a fatal signal arose, we will jump back here and failed is set. */
+ failed = sigsetjmp(signal_jmp_buf, 0) != 0;
+
+ if (!failed) {
+ if (write)
+ *ptr = 'x';
+ else
+ FORCE_READ(ptr);
+ }
+
+ signal_jump_set = false;
+ return !failed;
+}
+
+/* Try and read from a buffer, return true if no fatal signal. */
+static bool try_read_buf(char *ptr)
+{
+ return try_access_buf(ptr, false);
+}
+
+/* Try and write to a buffer, return true if no fatal signal. */
+static bool try_write_buf(char *ptr)
+{
+ return try_access_buf(ptr, true);
+}
+
+/*
+ * Try and BOTH read from AND write to a buffer, return true if BOTH operations
+ * succeed.
+ */
+static bool try_read_write_buf(char *ptr)
+{
+ return try_read_buf(ptr) && try_write_buf(ptr);
+}
+
+static void setup_sighandler(void)
+{
+ struct sigaction act = {
+ .sa_handler = &handle_fatal,
+ .sa_flags = SA_NODEFER,
+ };
+
+ sigemptyset(&act.sa_mask);
+ if (sigaction(SIGSEGV, &act, NULL))
+ ksft_exit_fail_perror("sigaction");
+}
+
+static void teardown_sighandler(void)
+{
+ struct sigaction act = {
+ .sa_handler = SIG_DFL,
+ .sa_flags = SA_NODEFER,
+ };
+
+ sigemptyset(&act.sa_mask);
+ sigaction(SIGSEGV, &act, NULL);
+}
+
+static int open_file(const char *prefix, char *path)
+{
+ int fd;
+
+ snprintf(path, PATH_MAX, "%sguard_regions_test_file_XXXXXX", prefix);
+ fd = mkstemp(path);
+ if (fd < 0)
+ ksft_exit_fail_perror("mkstemp");
+
+ return fd;
+}
+
+/* Establish a varying pattern in a buffer. */
+static void set_pattern(char *ptr, size_t num_pages, size_t page_size)
+{
+ size_t i;
+
+ for (i = 0; i < num_pages; i++) {
+ char *ptr2 = &ptr[i * page_size];
+
+ memset(ptr2, 'a' + (i % 26), page_size);
+ }
+}
+
+/*
+ * Check that a buffer contains the pattern set by set_pattern(), starting at a
+ * page offset of pgoff within the buffer.
+ */
+static bool check_pattern_offset(char *ptr, size_t num_pages, size_t page_size,
+ size_t pgoff)
+{
+ size_t i;
+
+ for (i = 0; i < num_pages * page_size; i++) {
+ size_t offset = pgoff * page_size + i;
+ char actual = ptr[offset];
+ char expected = 'a' + ((offset / page_size) % 26);
+
+ if (actual != expected)
+ return false;
+ }
+
+ return true;
+}
+
+/* Check that a buffer contains the pattern set by set_pattern(). */
+static bool check_pattern(char *ptr, size_t num_pages, size_t page_size)
+{
+ return check_pattern_offset(ptr, num_pages, page_size, 0);
+}
+
+/* Determine if a buffer contains only repetitions of a specified char. */
+static bool is_buf_eq(char *buf, size_t size, char chr)
+{
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ if (buf[i] != chr)
+ return false;
+ }
+
+ return true;
+}
+
+FIXTURE_SETUP(guard_regions)
+{
+ self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+ setup_sighandler();
+
+ switch (variant->backing) {
+ case ANON_BACKED:
+ return;
+ case LOCAL_FILE_BACKED:
+ self->fd = open_file("", self->path);
+ break;
+ case SHMEM_BACKED:
+ self->fd = memfd_create(self->path, 0);
+ break;
+ }
+
+ /* We truncate file to at least 100 pages, tests can modify as needed. */
+ ASSERT_EQ(ftruncate(self->fd, 100 * self->page_size), 0);
+};
+
+FIXTURE_TEARDOWN_PARENT(guard_regions)
+{
+ teardown_sighandler();
+
+ if (variant->backing == ANON_BACKED)
+ return;
+
+ if (self->fd >= 0)
+ close(self->fd);
+
+ if (self->path[0] != '\0')
+ unlink(self->path);
+}
+
+TEST_F(guard_regions, basic)
+{
+ const unsigned long NUM_PAGES = 10;
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap_(self, variant, NULL, NUM_PAGES * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Trivially assert we can touch the first page. */
+ ASSERT_TRUE(try_read_write_buf(ptr));
+
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Establish that 1st page SIGSEGV's. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+
+ /* Ensure we can touch everything else.*/
+ for (i = 1; i < NUM_PAGES; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Establish a guard page at the end of the mapping. */
+ ASSERT_EQ(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size,
+ MADV_GUARD_INSTALL), 0);
+
+ /* Check that both guard pages result in SIGSEGV. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size]));
+
+ /* Remove the first guard page. */
+ ASSERT_FALSE(madvise(ptr, page_size, MADV_GUARD_REMOVE));
+
+ /* Make sure we can touch it. */
+ ASSERT_TRUE(try_read_write_buf(ptr));
+
+ /* Remove the last guard page. */
+ ASSERT_FALSE(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size,
+ MADV_GUARD_REMOVE));
+
+ /* Make sure we can touch it. */
+ ASSERT_TRUE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size]));
+
+ /*
+ * Test setting a _range_ of pages, namely the first 3. The first of
+ * these be faulted in, so this also tests that we can install guard
+ * pages over backed pages.
+ */
+ ASSERT_EQ(madvise(ptr, 3 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure they are all guard pages. */
+ for (i = 0; i < 3; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Make sure the rest are not. */
+ for (i = 3; i < NUM_PAGES; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Remove guard pages. */
+ ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Now make sure we can touch everything. */
+ for (i = 0; i < NUM_PAGES; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /*
+ * Now remove all guard pages, make sure we don't remove existing
+ * entries.
+ */
+ ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0);
+
+ for (i = 0; i < NUM_PAGES * page_size; i += page_size) {
+ char chr = ptr[i];
+
+ ASSERT_EQ(chr, 'x');
+ }
+
+ ASSERT_EQ(munmap(ptr, NUM_PAGES * page_size), 0);
+}
+
+/* Assert that operations applied across multiple VMAs work as expected. */
+TEST_F(guard_regions, multi_vma)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr_region, *ptr, *ptr1, *ptr2, *ptr3;
+ int i;
+
+ /* Reserve a 100 page region over which we can install VMAs. */
+ ptr_region = mmap_(self, variant, NULL, 100 * page_size,
+ PROT_NONE, 0, 0);
+ ASSERT_NE(ptr_region, MAP_FAILED);
+
+ /* Place a VMA of 10 pages size at the start of the region. */
+ ptr1 = mmap_(self, variant, ptr_region, 10 * page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr1, MAP_FAILED);
+
+ /* Place a VMA of 5 pages size 50 pages into the region. */
+ ptr2 = mmap_(self, variant, &ptr_region[50 * page_size], 5 * page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ /* Place a VMA of 20 pages size at the end of the region. */
+ ptr3 = mmap_(self, variant, &ptr_region[80 * page_size], 20 * page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr3, MAP_FAILED);
+
+ /* Unmap gaps. */
+ ASSERT_EQ(munmap(&ptr_region[10 * page_size], 40 * page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[55 * page_size], 25 * page_size), 0);
+
+ /*
+ * We end up with VMAs like this:
+ *
+ * 0 10 .. 50 55 .. 80 100
+ * [---] [---] [---]
+ */
+
+ /*
+ * Now mark the whole range as guard pages and make sure all VMAs are as
+ * such.
+ */
+
+ /*
+ * madvise() is certifiable and lets you perform operations over gaps,
+ * everything works, but it indicates an error and errno is set to
+ * -ENOMEM. Also if anything runs out of memory it is set to
+ * -ENOMEM. You are meant to guess which is which.
+ */
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), -1);
+ ASSERT_EQ(errno, ENOMEM);
+
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr1[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 5; i++) {
+ char *curr = &ptr2[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 20; i++) {
+ char *curr = &ptr3[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now remove guar pages over range and assert the opposite. */
+
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), -1);
+ ASSERT_EQ(errno, ENOMEM);
+
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr1[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 5; i++) {
+ char *curr = &ptr2[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ for (i = 0; i < 20; i++) {
+ char *curr = &ptr3[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Now map incompatible VMAs in the gaps. */
+ ptr = mmap_(self, variant, &ptr_region[10 * page_size], 40 * page_size,
+ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ ptr = mmap_(self, variant, &ptr_region[55 * page_size], 25 * page_size,
+ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * We end up with VMAs like this:
+ *
+ * 0 10 .. 50 55 .. 80 100
+ * [---][xxxx][---][xxxx][---]
+ *
+ * Where 'x' signifies VMAs that cannot be merged with those adjacent to
+ * them.
+ */
+
+ /* Multiple VMAs adjacent to one another should result in no error. */
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), 0);
+ for (i = 0; i < 100; i++) {
+ char *curr = &ptr_region[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), 0);
+ for (i = 0; i < 100; i++) {
+ char *curr = &ptr_region[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr_region, 100 * page_size), 0);
+}
+
+/*
+ * Assert that batched operations performed using process_madvise() work as
+ * expected.
+ */
+TEST_F(guard_regions, process_madvise)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr_region, *ptr1, *ptr2, *ptr3;
+ ssize_t count;
+ struct iovec vec[6];
+
+ /* Reserve region to map over. */
+ ptr_region = mmap_(self, variant, NULL, 100 * page_size,
+ PROT_NONE, 0, 0);
+ ASSERT_NE(ptr_region, MAP_FAILED);
+
+ /*
+ * 10 pages offset 1 page into reserve region. We MAP_POPULATE so we
+ * overwrite existing entries and test this code path against
+ * overwriting existing entries.
+ */
+ ptr1 = mmap_(self, variant, &ptr_region[page_size], 10 * page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED | MAP_POPULATE, 0);
+ ASSERT_NE(ptr1, MAP_FAILED);
+ /* We want guard markers at start/end of each VMA. */
+ vec[0].iov_base = ptr1;
+ vec[0].iov_len = page_size;
+ vec[1].iov_base = &ptr1[9 * page_size];
+ vec[1].iov_len = page_size;
+
+ /* 5 pages offset 50 pages into reserve region. */
+ ptr2 = mmap_(self, variant, &ptr_region[50 * page_size], 5 * page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+ vec[2].iov_base = ptr2;
+ vec[2].iov_len = page_size;
+ vec[3].iov_base = &ptr2[4 * page_size];
+ vec[3].iov_len = page_size;
+
+ /* 20 pages offset 79 pages into reserve region. */
+ ptr3 = mmap_(self, variant, &ptr_region[79 * page_size], 20 * page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr3, MAP_FAILED);
+ vec[4].iov_base = ptr3;
+ vec[4].iov_len = page_size;
+ vec[5].iov_base = &ptr3[19 * page_size];
+ vec[5].iov_len = page_size;
+
+ /* Free surrounding VMAs. */
+ ASSERT_EQ(munmap(ptr_region, page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[11 * page_size], 39 * page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[55 * page_size], 24 * page_size), 0);
+ ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0);
+
+ /* Now guard in one step. */
+ count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_INSTALL, 0);
+
+ /* OK we don't have permission to do this, skip. */
+ if (count == -1 && errno == EPERM)
+ ksft_exit_skip("No process_madvise() permissions, try running as root.\n");
+
+ /* Returns the number of bytes advised. */
+ ASSERT_EQ(count, 6 * page_size);
+
+ /* Now make sure the guarding was applied. */
+
+ ASSERT_FALSE(try_read_write_buf(ptr1));
+ ASSERT_FALSE(try_read_write_buf(&ptr1[9 * page_size]));
+
+ ASSERT_FALSE(try_read_write_buf(ptr2));
+ ASSERT_FALSE(try_read_write_buf(&ptr2[4 * page_size]));
+
+ ASSERT_FALSE(try_read_write_buf(ptr3));
+ ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size]));
+
+ /* Now do the same with unguard... */
+ count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_REMOVE, 0);
+
+ /* ...and everything should now succeed. */
+
+ ASSERT_TRUE(try_read_write_buf(ptr1));
+ ASSERT_TRUE(try_read_write_buf(&ptr1[9 * page_size]));
+
+ ASSERT_TRUE(try_read_write_buf(ptr2));
+ ASSERT_TRUE(try_read_write_buf(&ptr2[4 * page_size]));
+
+ ASSERT_TRUE(try_read_write_buf(ptr3));
+ ASSERT_TRUE(try_read_write_buf(&ptr3[19 * page_size]));
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr1, 10 * page_size), 0);
+ ASSERT_EQ(munmap(ptr2, 5 * page_size), 0);
+ ASSERT_EQ(munmap(ptr3, 20 * page_size), 0);
+}
+
+/* Assert that unmapping ranges does not leave guard markers behind. */
+TEST_F(guard_regions, munmap)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new1, *ptr_new2;
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard first and last pages. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[9 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Assert that they are guarded. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[9 * page_size]));
+
+ /* Unmap them. */
+ ASSERT_EQ(munmap(ptr, page_size), 0);
+ ASSERT_EQ(munmap(&ptr[9 * page_size], page_size), 0);
+
+ /* Map over them.*/
+ ptr_new1 = mmap_(self, variant, ptr, page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED, 0);
+ ASSERT_NE(ptr_new1, MAP_FAILED);
+ ptr_new2 = mmap_(self, variant, &ptr[9 * page_size], page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr_new2, MAP_FAILED);
+
+ /* Assert that they are now not guarded. */
+ ASSERT_TRUE(try_read_write_buf(ptr_new1));
+ ASSERT_TRUE(try_read_write_buf(ptr_new2));
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that mprotect() operations have no bearing on guard markers. */
+TEST_F(guard_regions, mprotect)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard the middle of the range. */
+ ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size,
+ MADV_GUARD_INSTALL), 0);
+
+ /* Assert that it is indeed guarded. */
+ ASSERT_FALSE(try_read_write_buf(&ptr[5 * page_size]));
+ ASSERT_FALSE(try_read_write_buf(&ptr[6 * page_size]));
+
+ /* Now make these pages read-only. */
+ ASSERT_EQ(mprotect(&ptr[5 * page_size], 2 * page_size, PROT_READ), 0);
+
+ /* Make sure the range is still guarded. */
+ ASSERT_FALSE(try_read_buf(&ptr[5 * page_size]));
+ ASSERT_FALSE(try_read_buf(&ptr[6 * page_size]));
+
+ /* Make sure we can guard again without issue.*/
+ ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size,
+ MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the range is, yet again, still guarded. */
+ ASSERT_FALSE(try_read_buf(&ptr[5 * page_size]));
+ ASSERT_FALSE(try_read_buf(&ptr[6 * page_size]));
+
+ /* Now unguard the whole range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Make sure the whole range is readable. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Split and merge VMAs and make sure guard pages still behave. */
+TEST_F(guard_regions, split_merge)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new;
+ int i;
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard the whole range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the whole range is guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now unmap some pages in the range so we split. */
+ ASSERT_EQ(munmap(&ptr[2 * page_size], page_size), 0);
+ ASSERT_EQ(munmap(&ptr[5 * page_size], page_size), 0);
+ ASSERT_EQ(munmap(&ptr[8 * page_size], page_size), 0);
+
+ /* Make sure the remaining ranges are guarded post-split. */
+ for (i = 0; i < 2; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ for (i = 2; i < 5; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ for (i = 6; i < 8; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+ for (i = 9; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now map them again - the unmap will have cleared the guards. */
+ ptr_new = mmap_(self, variant, &ptr[2 * page_size], page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+ ptr_new = mmap_(self, variant, &ptr[5 * page_size], page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+ ptr_new = mmap_(self, variant, &ptr[8 * page_size], page_size,
+ PROT_READ | PROT_WRITE, MAP_FIXED, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+
+ /* Now make sure guard pages are established. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+ bool expect_true = i == 2 || i == 5 || i == 8;
+
+ ASSERT_TRUE(expect_true ? result : !result);
+ }
+
+ /* Now guard everything again. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the whole range is guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now split the range into three. */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Make sure the whole range is guarded for read. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_buf(curr));
+ }
+
+ /* Now reset protection bits so we merge the whole thing. */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size,
+ PROT_READ | PROT_WRITE), 0);
+
+ /* Make sure the whole range is still guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Split range into 3 again... */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* ...and unguard the whole range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Make sure the whole range is remedied for read. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_buf(curr));
+ }
+
+ /* Merge them again. */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0);
+ ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size,
+ PROT_READ | PROT_WRITE), 0);
+
+ /* Now ensure the merged range is remedied for read/write. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that MADV_DONTNEED does not remove guard markers. */
+TEST_F(guard_regions, dontneed)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Back the whole range. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ *curr = 'y';
+ }
+
+ /* Guard every other page. */
+ for (i = 0; i < 10; i += 2) {
+ char *curr = &ptr[i * page_size];
+ int res = madvise(curr, page_size, MADV_GUARD_INSTALL);
+
+ ASSERT_EQ(res, 0);
+ }
+
+ /* Indicate that we don't need any of the range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_DONTNEED), 0);
+
+ /* Check to ensure guard markers are still in place. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_buf(curr);
+
+ if (i % 2 == 0) {
+ ASSERT_FALSE(result);
+ } else {
+ ASSERT_TRUE(result);
+ switch (variant->backing) {
+ case ANON_BACKED:
+ /* If anon, then we get a zero page. */
+ ASSERT_EQ(*curr, '\0');
+ break;
+ default:
+ /* Otherwise, we get the file data. */
+ ASSERT_EQ(*curr, 'y');
+ break;
+ }
+ }
+
+ /* Now write... */
+ result = try_write_buf(&ptr[i * page_size]);
+
+ /* ...and make sure same result. */
+ ASSERT_TRUE(i % 2 != 0 ? result : !result);
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Assert that mlock()'ed pages work correctly with guard markers. */
+TEST_F(guard_regions, mlock)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Populate. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ *curr = 'y';
+ }
+
+ /* Lock. */
+ ASSERT_EQ(mlock(ptr, 10 * page_size), 0);
+
+ /* Now try to guard, should fail with EINVAL. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), -1);
+ ASSERT_EQ(errno, EINVAL);
+
+ /* OK unlock. */
+ ASSERT_EQ(munlock(ptr, 10 * page_size), 0);
+
+ /* Guard first half of range, should now succeed. */
+ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure guard works. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ if (i < 5) {
+ ASSERT_FALSE(result);
+ } else {
+ ASSERT_TRUE(result);
+ ASSERT_EQ(*curr, 'x');
+ }
+ }
+
+ /*
+ * Now lock the latter part of the range. We can't lock the guard pages,
+ * as this would result in the pages being populated and the guarding
+ * would cause this to error out.
+ */
+ ASSERT_EQ(mlock(&ptr[5 * page_size], 5 * page_size), 0);
+
+ /*
+ * Now remove guard pages, we permit mlock()'d ranges to have guard
+ * pages removed as it is a non-destructive operation.
+ */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Now check that no guard pages remain. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * - Moving a mapping alone should retain markers as they are.
+ */
+TEST_F(guard_regions, mremap_move)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new;
+
+ /* Map 5 pages. */
+ ptr = mmap_(self, variant, NULL, 5 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Place guard markers at both ends of the 5 page span. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the guard pages are in effect. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /* Map a new region we will move this range into. Doing this ensures
+ * that we have reserved a range to map into.
+ */
+ ptr_new = mmap_(self, variant, NULL, 5 * page_size, PROT_NONE, 0, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+
+ ASSERT_EQ(mremap(ptr, 5 * page_size, 5 * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new), ptr_new);
+
+ /* Make sure the guard markers are retained. */
+ ASSERT_FALSE(try_read_write_buf(ptr_new));
+ ASSERT_FALSE(try_read_write_buf(&ptr_new[4 * page_size]));
+
+ /*
+ * Clean up - we only need reference the new pointer as we overwrote the
+ * PROT_NONE range and moved the existing one.
+ */
+ munmap(ptr_new, 5 * page_size);
+}
+
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * Expanding should retain guard pages, only now in different position. The user
+ * will have to remove guard pages manually to fix up (they'd have to do the
+ * same if it were a PROT_NONE mapping).
+ */
+TEST_F(guard_regions, mremap_expand)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr, *ptr_new;
+
+ /* Map 10 pages... */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /* ...But unmap the last 5 so we can ensure we can expand into them. */
+ ASSERT_EQ(munmap(&ptr[5 * page_size], 5 * page_size), 0);
+
+ /* Place guard markers at both ends of the 5 page span. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the guarding is in effect. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /* Now expand to 10 pages. */
+ ptr = mremap(ptr, 5 * page_size, 10 * page_size, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * Make sure the guard markers are retained in their original positions.
+ */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /* Reserve a region which we can move to and expand into. */
+ ptr_new = mmap_(self, variant, NULL, 20 * page_size, PROT_NONE, 0, 0);
+ ASSERT_NE(ptr_new, MAP_FAILED);
+
+ /* Now move and expand into it. */
+ ptr = mremap(ptr, 10 * page_size, 20 * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new);
+ ASSERT_EQ(ptr, ptr_new);
+
+ /*
+ * Again, make sure the guard markers are retained in their original positions.
+ */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /*
+ * A real user would have to remove guard markers, but would reasonably
+ * expect all characteristics of the mapping to be retained, including
+ * guard markers.
+ */
+
+ /* Cleanup. */
+ munmap(ptr, 20 * page_size);
+}
+/*
+ * Assert that moving, extending and shrinking memory via mremap() retains
+ * guard markers where possible.
+ *
+ * Shrinking will result in markers that are shrunk over being removed. Again,
+ * if the user were using a PROT_NONE mapping they'd have to manually fix this
+ * up also so this is OK.
+ */
+TEST_F(guard_regions, mremap_shrink)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ /* Map 5 pages. */
+ ptr = mmap_(self, variant, NULL, 5 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Place guard markers at both ends of the 5 page span. */
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Make sure the guarding is in effect. */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+ ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size]));
+
+ /* Now shrink to 3 pages. */
+ ptr = mremap(ptr, 5 * page_size, 3 * page_size, MREMAP_MAYMOVE);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* We expect the guard marker at the start to be retained... */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+
+ /* ...But remaining pages will not have guard markers. */
+ for (i = 1; i < 3; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /*
+ * As with expansion, a real user would have to remove guard pages and
+ * fixup. But you'd have to do similar manual things with PROT_NONE
+ * mappings too.
+ */
+
+ /*
+ * If we expand back to the original size, the end marker will, of
+ * course, no longer be present.
+ */
+ ptr = mremap(ptr, 3 * page_size, 5 * page_size, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Again, we expect the guard marker at the start to be retained... */
+ ASSERT_FALSE(try_read_write_buf(ptr));
+
+ /* ...But remaining pages will not have guard markers. */
+ for (i = 1; i < 5; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ munmap(ptr, 5 * page_size);
+}
+
+/*
+ * Assert that forking a process with VMAs that do not have VM_WIPEONFORK set
+ * retain guard pages.
+ */
+TEST_F(guard_regions, fork)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ pid_t pid;
+ int i;
+
+ /* Map 10 pages. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Establish guard pages in the first 5 pages. */
+ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+ pid = fork();
+ ASSERT_NE(pid, -1);
+ if (!pid) {
+ /* This is the child process now. */
+
+ /* Assert that the guarding is in effect. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ ASSERT_TRUE(i >= 5 ? result : !result);
+ }
+
+ /* Now unguard the range.*/
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ exit(0);
+ }
+
+ /* Parent process. */
+
+ /* Parent simply waits on child. */
+ waitpid(pid, NULL, 0);
+
+ /* Child unguard does not impact parent page table state. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ ASSERT_TRUE(i >= 5 ? result : !result);
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert expected behaviour after we fork populated ranges of anonymous memory
+ * and then guard and unguard the range.
+ */
+TEST_F(guard_regions, fork_cow)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ pid_t pid;
+ int i;
+
+ if (variant->backing != ANON_BACKED)
+ SKIP(return, "CoW only supported on anon mappings");
+
+ /* Map 10 pages. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Populate range. */
+ for (i = 0; i < 10 * page_size; i++) {
+ char chr = 'a' + (i % 26);
+
+ ptr[i] = chr;
+ }
+
+ pid = fork();
+ ASSERT_NE(pid, -1);
+ if (!pid) {
+ /* This is the child process now. */
+
+ /* Ensure the range is as expected. */
+ for (i = 0; i < 10 * page_size; i++) {
+ char expected = 'a' + (i % 26);
+ char actual = ptr[i];
+
+ ASSERT_EQ(actual, expected);
+ }
+
+ /* Establish guard pages across the whole range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+ /* Remove it. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /*
+ * By removing the guard pages, the page tables will be
+ * cleared. Assert that we are looking at the zero page now.
+ */
+ for (i = 0; i < 10 * page_size; i++) {
+ char actual = ptr[i];
+
+ ASSERT_EQ(actual, '\0');
+ }
+
+ exit(0);
+ }
+
+ /* Parent process. */
+
+ /* Parent simply waits on child. */
+ waitpid(pid, NULL, 0);
+
+ /* Ensure the range is unchanged in parent anon range. */
+ for (i = 0; i < 10 * page_size; i++) {
+ char expected = 'a' + (i % 26);
+ char actual = ptr[i];
+
+ ASSERT_EQ(actual, expected);
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that forking a process with VMAs that do have VM_WIPEONFORK set
+ * behave as expected.
+ */
+TEST_F(guard_regions, fork_wipeonfork)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ pid_t pid;
+ int i;
+
+ if (variant->backing != ANON_BACKED)
+ SKIP(return, "Wipe on fork only supported on anon mappings");
+
+ /* Map 10 pages. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Mark wipe on fork. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_WIPEONFORK), 0);
+
+ /* Guard the first 5 pages. */
+ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+ pid = fork();
+ ASSERT_NE(pid, -1);
+ if (!pid) {
+ /* This is the child process now. */
+
+ /* Guard will have been wiped. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_write_buf(curr));
+ }
+
+ exit(0);
+ }
+
+ /* Parent process. */
+
+ waitpid(pid, NULL, 0);
+
+ /* Guard markers should be in effect.*/
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+ bool result = try_read_write_buf(curr);
+
+ ASSERT_TRUE(i >= 5 ? result : !result);
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_FREE retains guard entries as expected. */
+TEST_F(guard_regions, lazyfree)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ if (variant->backing != ANON_BACKED)
+ SKIP(return, "MADV_FREE only supported on anon mappings");
+
+ /* Map 10 pages. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Ensure guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Lazyfree range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_FREE), 0);
+
+ /* This should leave the guard markers in place. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_POPULATE_READ, MADV_POPULATE_WRITE behave as expected. */
+TEST_F(guard_regions, populate)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+
+ /* Map 10 pages. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Populate read should error out... */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_READ), -1);
+ ASSERT_EQ(errno, EFAULT);
+
+ /* ...as should populate write. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_WRITE), -1);
+ ASSERT_EQ(errno, EFAULT);
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that MADV_COLD, MADV_PAGEOUT do not remove guard markers. */
+TEST_F(guard_regions, cold_pageout)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ /* Map 10 pages. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Guard range. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* Ensured guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Now mark cold. This should have no impact on guard markers. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_COLD), 0);
+
+ /* Should remain guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* OK, now page out. This should equally, have no effect on markers. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_PAGEOUT), 0);
+
+ /* Should remain guarded. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/* Ensure that guard pages do not break userfaultd. */
+TEST_F(guard_regions, uffd)
+{
+ const unsigned long page_size = self->page_size;
+ int uffd;
+ char *ptr;
+ int i;
+ struct uffdio_api api = {
+ .api = UFFD_API,
+ .features = 0,
+ };
+ struct uffdio_register reg;
+ struct uffdio_range range;
+
+ if (!is_anon_backed(variant))
+ SKIP(return, "uffd only works on anon backing");
+
+ /* Set up uffd. */
+ uffd = userfaultfd(0);
+ if (uffd == -1) {
+ switch (errno) {
+ case EPERM:
+ SKIP(return, "No userfaultfd permissions, try running as root.");
+ break;
+ case ENOSYS:
+ SKIP(return, "userfaultfd is not supported/not enabled.");
+ break;
+ default:
+ ksft_exit_fail_msg("userfaultfd failed with %s\n",
+ strerror(errno));
+ break;
+ }
+ }
+
+ ASSERT_NE(uffd, -1);
+
+ ASSERT_EQ(ioctl(uffd, UFFDIO_API, &api), 0);
+
+ /* Map 10 pages. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Register the range with uffd. */
+ range.start = (unsigned long)ptr;
+ range.len = 10 * page_size;
+ reg.range = range;
+ reg.mode = UFFDIO_REGISTER_MODE_MISSING;
+ ASSERT_EQ(ioctl(uffd, UFFDIO_REGISTER, &reg), 0);
+
+ /* Guard the range. This should not trigger the uffd. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /* The guarding should behave as usual with no uffd intervention. */
+ for (i = 0; i < 10; i++) {
+ char *curr = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_read_write_buf(curr));
+ }
+
+ /* Cleanup. */
+ ASSERT_EQ(ioctl(uffd, UFFDIO_UNREGISTER, &range), 0);
+ close(uffd);
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Mark a region within a file-backed mapping using MADV_SEQUENTIAL so we
+ * aggressively read-ahead, then install guard regions and assert that it
+ * behaves correctly.
+ *
+ * We page out using MADV_PAGEOUT before checking guard regions so we drop page
+ * cache folios, meaning we maximise the possibility of some broken readahead.
+ */
+TEST_F(guard_regions, madvise_sequential)
+{
+ char *ptr;
+ int i;
+ const unsigned long page_size = self->page_size;
+
+ if (variant->backing == ANON_BACKED)
+ SKIP(return, "MADV_SEQUENTIAL meaningful only for file-backed");
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Establish a pattern of data in the file. */
+ set_pattern(ptr, 10, page_size);
+ ASSERT_TRUE(check_pattern(ptr, 10, page_size));
+
+ /* Mark it as being accessed sequentially. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_SEQUENTIAL), 0);
+
+ /* Mark every other page a guard page. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr2 = &ptr[i * page_size];
+
+ ASSERT_EQ(madvise(ptr2, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /* Now page it out. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_PAGEOUT), 0);
+
+ /* Now make sure pages are as expected. */
+ for (i = 0; i < 10; i++) {
+ char *chrp = &ptr[i * page_size];
+
+ if (i % 2 == 0) {
+ bool result = try_read_write_buf(chrp);
+
+ ASSERT_FALSE(result);
+ } else {
+ ASSERT_EQ(*chrp, 'a' + i);
+ }
+ }
+
+ /* Now remove guard pages. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Now make sure all data is as expected. */
+ if (!check_pattern(ptr, 10, page_size))
+ ASSERT_TRUE(false);
+
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Check that file-backed mappings implement guard regions with MAP_PRIVATE
+ * correctly.
+ */
+TEST_F(guard_regions, map_private)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr_shared, *ptr_private;
+ int i;
+
+ if (variant->backing == ANON_BACKED)
+ SKIP(return, "MAP_PRIVATE test specific to file-backed");
+
+ ptr_shared = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr_shared, MAP_FAILED);
+
+ /* Manually mmap(), do not use mmap_() wrapper so we can force MAP_PRIVATE. */
+ ptr_private = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, self->fd, 0);
+ ASSERT_NE(ptr_private, MAP_FAILED);
+
+ /* Set pattern in shared mapping. */
+ set_pattern(ptr_shared, 10, page_size);
+
+ /* Install guard regions in every other page in the shared mapping. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr = &ptr_shared[i * page_size];
+
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ for (i = 0; i < 10; i++) {
+ /* Every even shared page should be guarded. */
+ ASSERT_EQ(try_read_buf(&ptr_shared[i * page_size]), i % 2 != 0);
+ /* Private mappings should always be readable. */
+ ASSERT_TRUE(try_read_buf(&ptr_private[i * page_size]));
+ }
+
+ /* Install guard regions in every other page in the private mapping. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr = &ptr_private[i * page_size];
+
+ ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ for (i = 0; i < 10; i++) {
+ /* Every even shared page should be guarded. */
+ ASSERT_EQ(try_read_buf(&ptr_shared[i * page_size]), i % 2 != 0);
+ /* Every odd private page should be guarded. */
+ ASSERT_EQ(try_read_buf(&ptr_private[i * page_size]), i % 2 != 0);
+ }
+
+ /* Remove guard regions from shared mapping. */
+ ASSERT_EQ(madvise(ptr_shared, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ for (i = 0; i < 10; i++) {
+ /* Shared mappings should always be readable. */
+ ASSERT_TRUE(try_read_buf(&ptr_shared[i * page_size]));
+ /* Every even private page should be guarded. */
+ ASSERT_EQ(try_read_buf(&ptr_private[i * page_size]), i % 2 != 0);
+ }
+
+ /* Remove guard regions from private mapping. */
+ ASSERT_EQ(madvise(ptr_private, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ for (i = 0; i < 10; i++) {
+ /* Shared mappings should always be readable. */
+ ASSERT_TRUE(try_read_buf(&ptr_shared[i * page_size]));
+ /* Private mappings should always be readable. */
+ ASSERT_TRUE(try_read_buf(&ptr_private[i * page_size]));
+ }
+
+ /* Ensure patterns are intact. */
+ ASSERT_TRUE(check_pattern(ptr_shared, 10, page_size));
+ ASSERT_TRUE(check_pattern(ptr_private, 10, page_size));
+
+ /* Now write out every other page to MAP_PRIVATE. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr = &ptr_private[i * page_size];
+
+ memset(ptr, 'a' + i, page_size);
+ }
+
+ /*
+ * At this point the mapping is:
+ *
+ * 0123456789
+ * SPSPSPSPSP
+ *
+ * Where S = shared, P = private mappings.
+ */
+
+ /* Now mark the beginning of the mapping guarded. */
+ ASSERT_EQ(madvise(ptr_private, 5 * page_size, MADV_GUARD_INSTALL), 0);
+
+ /*
+ * This renders the mapping:
+ *
+ * 0123456789
+ * xxxxxPSPSP
+ */
+
+ for (i = 0; i < 10; i++) {
+ char *ptr = &ptr_private[i * page_size];
+
+ /* Ensure guard regions as expected. */
+ ASSERT_EQ(try_read_buf(ptr), i >= 5);
+ /* The shared mapping should always succeed. */
+ ASSERT_TRUE(try_read_buf(&ptr_shared[i * page_size]));
+ }
+
+ /* Remove the guard regions altogether. */
+ ASSERT_EQ(madvise(ptr_private, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /*
+ *
+ * We now expect the mapping to be:
+ *
+ * 0123456789
+ * SSSSSPSPSP
+ *
+ * As we removed guard regions, the private pages from the first 5 will
+ * have been zapped, so on fault will reestablish the shared mapping.
+ */
+
+ for (i = 0; i < 10; i++) {
+ char *ptr = &ptr_private[i * page_size];
+
+ /*
+ * Assert that shared mappings in the MAP_PRIVATE mapping match
+ * the shared mapping.
+ */
+ if (i < 5 || i % 2 == 0) {
+ char *ptr_s = &ptr_shared[i * page_size];
+
+ ASSERT_EQ(memcmp(ptr, ptr_s, page_size), 0);
+ continue;
+ }
+
+ /* Everything else is a private mapping. */
+ ASSERT_TRUE(is_buf_eq(ptr, page_size, 'a' + i));
+ }
+
+ ASSERT_EQ(munmap(ptr_shared, 10 * page_size), 0);
+ ASSERT_EQ(munmap(ptr_private, 10 * page_size), 0);
+}
+
+/* Test that guard regions established over a read-only mapping function correctly. */
+TEST_F(guard_regions, readonly_file)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ if (variant->backing != LOCAL_FILE_BACKED)
+ SKIP(return, "Read-only test specific to file-backed");
+
+ /* Map shared so we can populate with pattern, populate it, unmap. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ set_pattern(ptr, 10, page_size);
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+ /* Close the fd so we can re-open read-only. */
+ ASSERT_EQ(close(self->fd), 0);
+
+ /* Re-open read-only. */
+ self->fd = open(self->path, O_RDONLY);
+ ASSERT_NE(self->fd, -1);
+ /* Re-map read-only. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Mark every other page guarded. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_pg = &ptr[i * page_size];
+
+ ASSERT_EQ(madvise(ptr_pg, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /* Assert that the guard regions are in place.*/
+ for (i = 0; i < 10; i++) {
+ char *ptr_pg = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_buf(ptr_pg), i % 2 != 0);
+ }
+
+ /* Remove guard regions. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Ensure the data is as expected. */
+ ASSERT_TRUE(check_pattern(ptr, 10, page_size));
+
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+TEST_F(guard_regions, fault_around)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ if (variant->backing == ANON_BACKED)
+ SKIP(return, "Fault-around test specific to file-backed");
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Establish a pattern in the backing file. */
+ set_pattern(ptr, 10, page_size);
+
+ /*
+ * Now drop it from the page cache so we get major faults when next we
+ * map it.
+ */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_PAGEOUT), 0);
+
+ /* Unmap and remap 'to be sure'. */
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Now make every even page guarded. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(madvise(ptr_p, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /* Now fault in every odd page. This should trigger fault-around. */
+ for (i = 1; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_buf(ptr_p));
+ }
+
+ /* Finally, ensure that guard regions are intact as expected. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_buf(ptr_p), i % 2 != 0);
+ }
+
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+TEST_F(guard_regions, truncation)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ if (variant->backing == ANON_BACKED)
+ SKIP(return, "Truncation test specific to file-backed");
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * Establish a pattern in the backing file, just so there is data
+ * there.
+ */
+ set_pattern(ptr, 10, page_size);
+
+ /* Now make every even page guarded. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(madvise(ptr_p, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /* Now assert things are as expected. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_write_buf(ptr_p), i % 2 != 0);
+ }
+
+ /* Now truncate to actually used size (initialised to 100). */
+ ASSERT_EQ(ftruncate(self->fd, 10 * page_size), 0);
+
+ /* Here the guard regions will remain intact. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_write_buf(ptr_p), i % 2 != 0);
+ }
+
+ /* Now truncate to half the size, then truncate again to the full size. */
+ ASSERT_EQ(ftruncate(self->fd, 5 * page_size), 0);
+ ASSERT_EQ(ftruncate(self->fd, 10 * page_size), 0);
+
+ /* Again, guard pages will remain intact. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_write_buf(ptr_p), i % 2 != 0);
+ }
+
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+TEST_F(guard_regions, hole_punch)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ if (variant->backing == ANON_BACKED)
+ SKIP(return, "Truncation test specific to file-backed");
+
+ /* Establish pattern in mapping. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ set_pattern(ptr, 10, page_size);
+
+ /* Install a guard region in the middle of the mapping. */
+ ASSERT_EQ(madvise(&ptr[3 * page_size], 4 * page_size,
+ MADV_GUARD_INSTALL), 0);
+
+ /*
+ * The buffer will now be:
+ *
+ * 0123456789
+ * ***xxxx***
+ *
+ * Where * is data and x is the guard region.
+ */
+
+ /* Ensure established. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_buf(ptr_p), i < 3 || i >= 7);
+ }
+
+ /* Now hole punch the guarded region. */
+ ASSERT_EQ(madvise(&ptr[3 * page_size], 4 * page_size,
+ MADV_REMOVE), 0);
+
+ /* Ensure guard regions remain. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_buf(ptr_p), i < 3 || i >= 7);
+ }
+
+ /* Now remove guard region throughout. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Check that the pattern exists in non-hole punched region. */
+ ASSERT_TRUE(check_pattern(ptr, 3, page_size));
+ /* Check that hole punched region is zeroed. */
+ ASSERT_TRUE(is_buf_eq(&ptr[3 * page_size], 4 * page_size, '\0'));
+ /* Check that the pattern exists in the remainder of the file. */
+ ASSERT_TRUE(check_pattern_offset(ptr, 3, page_size, 7));
+
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Ensure that a memfd works correctly with guard regions, that we can write
+ * seal it then open the mapping read-only and still establish guard regions
+ * within, remove those guard regions and have everything work correctly.
+ */
+TEST_F(guard_regions, memfd_write_seal)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ if (variant->backing != SHMEM_BACKED)
+ SKIP(return, "memfd write seal test specific to shmem");
+
+ /* OK, we need a memfd, so close existing one. */
+ ASSERT_EQ(close(self->fd), 0);
+
+ /* Create and truncate memfd. */
+ self->fd = memfd_create("guard_regions_memfd_seals_test",
+ MFD_ALLOW_SEALING);
+ ASSERT_NE(self->fd, -1);
+ ASSERT_EQ(ftruncate(self->fd, 10 * page_size), 0);
+
+ /* Map, set pattern, unmap. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ set_pattern(ptr, 10, page_size);
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+
+ /* Write-seal the memfd. */
+ ASSERT_EQ(fcntl(self->fd, F_ADD_SEALS, F_SEAL_WRITE), 0);
+
+ /* Now map the memfd readonly. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Ensure pattern is as expected. */
+ ASSERT_TRUE(check_pattern(ptr, 10, page_size));
+
+ /* Now make every even page guarded. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(madvise(ptr_p, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /* Now assert things are as expected. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_buf(ptr_p), i % 2 != 0);
+ }
+
+ /* Now remove guard regions. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Ensure pattern is as expected. */
+ ASSERT_TRUE(check_pattern(ptr, 10, page_size));
+
+ /* Ensure write seal intact. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_FALSE(try_write_buf(ptr_p));
+ }
+
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+
+/*
+ * Since we are now permitted to establish guard regions in read-only anonymous
+ * mappings, for the sake of thoroughness, though it probably has no practical
+ * use, test that guard regions function with a mapping to the anonymous zero
+ * page.
+ */
+TEST_F(guard_regions, anon_zeropage)
+{
+ const unsigned long page_size = self->page_size;
+ char *ptr;
+ int i;
+
+ if (!is_anon_backed(variant))
+ SKIP(return, "anon zero page test specific to anon/shmem");
+
+ /* Obtain a read-only i.e. anon zero page mapping. */
+ ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Now make every even page guarded. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(madvise(ptr_p, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /* Now assert things are as expected. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(try_read_buf(ptr_p), i % 2 != 0);
+ }
+
+ /* Now remove all guard regions. */
+ ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0);
+
+ /* Now assert things are as expected. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_TRUE(try_read_buf(ptr_p));
+ }
+
+ /* Ensure zero page...*/
+ ASSERT_TRUE(is_buf_eq(ptr, 10 * page_size, '\0'));
+
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that /proc/$pid/pagemap correctly identifies guard region ranges.
+ */
+TEST_F(guard_regions, pagemap)
+{
+ const unsigned long page_size = self->page_size;
+ int proc_fd;
+ char *ptr;
+ int i;
+
+ proc_fd = open("/proc/self/pagemap", O_RDONLY);
+ ASSERT_NE(proc_fd, -1);
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /* Read from pagemap, and assert no guard regions are detected. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+ unsigned long entry = pagemap_get_entry(proc_fd, ptr_p);
+ unsigned long masked = entry & PM_GUARD_REGION;
+
+ ASSERT_EQ(masked, 0);
+ }
+
+ /* Install a guard region in every other page. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(madvise(ptr_p, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /* Re-read from pagemap, and assert guard regions are detected. */
+ for (i = 0; i < 10; i++) {
+ char *ptr_p = &ptr[i * page_size];
+ unsigned long entry = pagemap_get_entry(proc_fd, ptr_p);
+ unsigned long masked = entry & PM_GUARD_REGION;
+
+ ASSERT_EQ(masked, i % 2 == 0 ? PM_GUARD_REGION : 0);
+ }
+
+ ASSERT_EQ(close(proc_fd), 0);
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+/*
+ * Assert that PAGEMAP_SCAN correctly reports guard region ranges.
+ */
+TEST_F(guard_regions, pagemap_scan)
+{
+ const unsigned long page_size = self->page_size;
+ struct page_region pm_regs[10];
+ struct pm_scan_arg pm_scan_args = {
+ .size = sizeof(struct pm_scan_arg),
+ .category_anyof_mask = PAGE_IS_GUARD,
+ .return_mask = PAGE_IS_GUARD,
+ .vec = (long)&pm_regs,
+ .vec_len = ARRAY_SIZE(pm_regs),
+ };
+ int proc_fd, i;
+ char *ptr;
+
+ proc_fd = open("/proc/self/pagemap", O_RDONLY);
+ ASSERT_NE(proc_fd, -1);
+
+ ptr = mmap_(self, variant, NULL, 10 * page_size,
+ PROT_READ | PROT_WRITE, 0, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ pm_scan_args.start = (long)ptr;
+ pm_scan_args.end = (long)ptr + 10 * page_size;
+ ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 0);
+ ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size);
+
+ /* Install a guard region in every other page. */
+ for (i = 0; i < 10; i += 2) {
+ char *ptr_p = &ptr[i * page_size];
+
+ ASSERT_EQ(syscall(__NR_madvise, ptr_p, page_size, MADV_GUARD_INSTALL), 0);
+ }
+
+ /*
+ * Assert ioctl() returns the count of located regions, where each
+ * region spans every other page within the range of 10 pages.
+ */
+ ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 5);
+ ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size);
+
+ /* Re-read from pagemap, and assert guard regions are detected. */
+ for (i = 0; i < 5; i++) {
+ long ptr_p = (long)&ptr[2 * i * page_size];
+
+ ASSERT_EQ(pm_regs[i].start, ptr_p);
+ ASSERT_EQ(pm_regs[i].end, ptr_p + page_size);
+ ASSERT_EQ(pm_regs[i].categories, PAGE_IS_GUARD);
+ }
+
+ ASSERT_EQ(close(proc_fd), 0);
+ ASSERT_EQ(munmap(ptr, 10 * page_size), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c
new file mode 100644
index 000000000000..29047d2e0c49
--- /dev/null
+++ b/tools/testing/selftests/mm/gup_longterm.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * GUP long-term page pinning tests.
+ *
+ * Copyright 2023, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/vfs.h>
+#include <linux/magic.h>
+#include <linux/memfd.h>
+
+#include "local_config.h"
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+#include <liburing.h>
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+#include "../../../../mm/gup_test.h"
+#include "../kselftest.h"
+#include "vm_util.h"
+
+static size_t pagesize;
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+static int gup_fd;
+
+static __fsword_t get_fs_type(int fd)
+{
+ struct statfs fs;
+ int ret;
+
+ do {
+ ret = fstatfs(fd, &fs);
+ } while (ret && errno == EINTR);
+
+ return ret ? 0 : fs.f_type;
+}
+
+static bool fs_is_unknown(__fsword_t fs_type)
+{
+ /*
+ * We only support some filesystems in our tests when dealing with
+ * R/W long-term pinning. For these filesystems, we can be fairly sure
+ * whether they support it or not.
+ */
+ switch (fs_type) {
+ case TMPFS_MAGIC:
+ case HUGETLBFS_MAGIC:
+ case BTRFS_SUPER_MAGIC:
+ case EXT4_SUPER_MAGIC:
+ case XFS_SUPER_MAGIC:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool fs_supports_writable_longterm_pinning(__fsword_t fs_type)
+{
+ assert(!fs_is_unknown(fs_type));
+ switch (fs_type) {
+ case TMPFS_MAGIC:
+ case HUGETLBFS_MAGIC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+enum test_type {
+ TEST_TYPE_RO,
+ TEST_TYPE_RO_FAST,
+ TEST_TYPE_RW,
+ TEST_TYPE_RW_FAST,
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+ TEST_TYPE_IOURING,
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+};
+
+static void do_test(int fd, size_t size, enum test_type type, bool shared)
+{
+ __fsword_t fs_type = get_fs_type(fd);
+ bool should_work;
+ char *mem;
+ int result = KSFT_PASS;
+ int ret;
+
+ if (fd < 0) {
+ result = KSFT_FAIL;
+ goto report;
+ }
+
+ if (ftruncate(fd, size)) {
+ if (errno == ENOENT) {
+ skip_test_dodgy_fs("ftruncate()");
+ } else {
+ ksft_print_msg("ftruncate() failed (%s)\n",
+ strerror(errno));
+ result = KSFT_FAIL;
+ goto report;
+ }
+ return;
+ }
+
+ if (fallocate(fd, 0, 0, size)) {
+ if (size == pagesize) {
+ ksft_print_msg("fallocate() failed (%s)\n", strerror(errno));
+ result = KSFT_FAIL;
+ } else {
+ ksft_print_msg("need more free huge pages\n");
+ result = KSFT_SKIP;
+ }
+ goto report;
+ }
+
+ mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ shared ? MAP_SHARED : MAP_PRIVATE, fd, 0);
+ if (mem == MAP_FAILED) {
+ if (size == pagesize || shared) {
+ ksft_print_msg("mmap() failed (%s)\n", strerror(errno));
+ result = KSFT_FAIL;
+ } else {
+ ksft_print_msg("need more free huge pages\n");
+ result = KSFT_SKIP;
+ }
+ goto report;
+ }
+
+ /* Fault in the page such that GUP-fast can pin it directly. */
+ memset(mem, 0, size);
+
+ switch (type) {
+ case TEST_TYPE_RO:
+ case TEST_TYPE_RO_FAST:
+ /*
+ * Cover more cases regarding unsharing decisions when
+ * long-term R/O pinning by mapping the page R/O.
+ */
+ ret = mprotect(mem, size, PROT_READ);
+ if (ret) {
+ ksft_print_msg("mprotect() failed (%s)\n", strerror(errno));
+ result = KSFT_FAIL;
+ goto munmap;
+ }
+ /* FALLTHROUGH */
+ case TEST_TYPE_RW:
+ case TEST_TYPE_RW_FAST: {
+ struct pin_longterm_test args;
+ const bool fast = type == TEST_TYPE_RO_FAST ||
+ type == TEST_TYPE_RW_FAST;
+ const bool rw = type == TEST_TYPE_RW ||
+ type == TEST_TYPE_RW_FAST;
+
+ if (gup_fd < 0) {
+ ksft_print_msg("gup_test not available\n");
+ result = KSFT_SKIP;
+ break;
+ }
+
+ if (rw && shared && fs_is_unknown(fs_type)) {
+ ksft_print_msg("Unknown filesystem\n");
+ result = KSFT_SKIP;
+ return;
+ }
+ /*
+ * R/O pinning or pinning in a private mapping is always
+ * expected to work. Otherwise, we expect long-term R/W pinning
+ * to only succeed for special filesystems.
+ */
+ should_work = !shared || !rw ||
+ fs_supports_writable_longterm_pinning(fs_type);
+
+ args.addr = (__u64)(uintptr_t)mem;
+ args.size = size;
+ args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
+ args.flags |= rw ? PIN_LONGTERM_TEST_FLAG_USE_WRITE : 0;
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
+ if (ret && errno == EINVAL) {
+ ksft_print_msg("PIN_LONGTERM_TEST_START failed (EINVAL)n");
+ result = KSFT_SKIP;
+ break;
+ } else if (ret && errno == EFAULT) {
+ if (should_work)
+ result = KSFT_FAIL;
+ else
+ result = KSFT_PASS;
+ break;
+ } else if (ret) {
+ ksft_print_msg("PIN_LONGTERM_TEST_START failed (%s)\n",
+ strerror(errno));
+ result = KSFT_FAIL;
+ break;
+ }
+
+ if (ioctl(gup_fd, PIN_LONGTERM_TEST_STOP))
+ ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed (%s)\n",
+ strerror(errno));
+
+ /*
+ * TODO: if the kernel ever supports long-term R/W pinning on
+ * some previously unsupported filesystems, we might want to
+ * perform some additional tests for possible data corruptions.
+ */
+ if (should_work)
+ result = KSFT_PASS;
+ else
+ result = KSFT_FAIL;
+ break;
+ }
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+ case TEST_TYPE_IOURING: {
+ struct io_uring ring;
+ struct iovec iov;
+
+ /* io_uring always pins pages writable. */
+ if (shared && fs_is_unknown(fs_type)) {
+ ksft_print_msg("Unknown filesystem\n");
+ result = KSFT_SKIP;
+ goto report;
+ }
+ should_work = !shared ||
+ fs_supports_writable_longterm_pinning(fs_type);
+
+ /* Skip on errors, as we might just lack kernel support. */
+ ret = io_uring_queue_init(1, &ring, 0);
+ if (ret < 0) {
+ ksft_print_msg("io_uring_queue_init() failed (%s)\n",
+ strerror(-ret));
+ result = KSFT_SKIP;
+ break;
+ }
+ /*
+ * Register the range as a fixed buffer. This will FOLL_WRITE |
+ * FOLL_PIN | FOLL_LONGTERM the range.
+ */
+ iov.iov_base = mem;
+ iov.iov_len = size;
+ ret = io_uring_register_buffers(&ring, &iov, 1);
+ /* Only new kernels return EFAULT. */
+ if (ret && (errno == ENOSPC || errno == EOPNOTSUPP ||
+ errno == EFAULT)) {
+ if (should_work) {
+ ksft_print_msg("Should have failed (%s)\n",
+ strerror(errno));
+ result = KSFT_FAIL;
+ } else {
+ result = KSFT_PASS;
+ }
+ } else if (ret) {
+ /*
+ * We might just lack support or have insufficient
+ * MEMLOCK limits.
+ */
+ ksft_print_msg("io_uring_register_buffers() failed (%s)\n",
+ strerror(-ret));
+ result = KSFT_SKIP;
+ } else {
+ if (should_work) {
+ result = KSFT_PASS;
+ } else {
+ ksft_print_msg("Should have worked\n");
+ result = KSFT_FAIL;
+ }
+ io_uring_unregister_buffers(&ring);
+ }
+
+ io_uring_queue_exit(&ring);
+ break;
+ }
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+ default:
+ assert(false);
+ }
+
+munmap:
+ munmap(mem, size);
+report:
+ log_test_result(result);
+}
+
+typedef void (*test_fn)(int fd, size_t size);
+
+static void run_with_memfd(test_fn fn, const char *desc)
+{
+ int fd;
+
+ log_test_start("%s ... with memfd", desc);
+
+ fd = memfd_create("test", 0);
+ if (fd < 0) {
+ ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
+ log_test_result(KSFT_SKIP);
+ return;
+ }
+
+ fn(fd, pagesize);
+ close(fd);
+}
+
+static void run_with_tmpfile(test_fn fn, const char *desc)
+{
+ FILE *file;
+ int fd;
+
+ log_test_start("%s ... with tmpfile", desc);
+
+ file = tmpfile();
+ if (!file) {
+ ksft_print_msg("tmpfile() failed (%s)\n", strerror(errno));
+ fd = -1;
+ } else {
+ fd = fileno(file);
+ if (fd < 0) {
+ ksft_print_msg("fileno() failed (%s)\n", strerror(errno));
+ }
+ }
+
+ fn(fd, pagesize);
+
+ if (file)
+ fclose(file);
+}
+
+static void run_with_local_tmpfile(test_fn fn, const char *desc)
+{
+ char filename[] = __FILE__"_tmpfile_XXXXXX";
+ int fd;
+
+ log_test_start("%s ... with local tmpfile", desc);
+
+ fd = mkstemp(filename);
+ if (fd < 0)
+ ksft_print_msg("mkstemp() failed (%s)\n", strerror(errno));
+
+ if (unlink(filename)) {
+ ksft_print_msg("unlink() failed (%s)\n", strerror(errno));
+ close(fd);
+ fd = -1;
+ }
+
+ fn(fd, pagesize);
+
+ if (fd >= 0)
+ close(fd);
+}
+
+static void run_with_memfd_hugetlb(test_fn fn, const char *desc,
+ size_t hugetlbsize)
+{
+ int flags = MFD_HUGETLB;
+ int fd;
+
+ log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
+ hugetlbsize / 1024);
+
+ flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
+
+ fd = memfd_create("test", flags);
+ if (fd < 0) {
+ ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
+ log_test_result(KSFT_SKIP);
+ return;
+ }
+
+ fn(fd, hugetlbsize);
+ close(fd);
+}
+
+struct test_case {
+ const char *desc;
+ test_fn fn;
+};
+
+static void test_shared_rw_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RW, true);
+}
+
+static void test_shared_rw_fast_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RW_FAST, true);
+}
+
+static void test_shared_ro_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RO, true);
+}
+
+static void test_shared_ro_fast_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RO_FAST, true);
+}
+
+static void test_private_rw_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RW, false);
+}
+
+static void test_private_rw_fast_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RW_FAST, false);
+}
+
+static void test_private_ro_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RO, false);
+}
+
+static void test_private_ro_fast_pin(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_RO_FAST, false);
+}
+
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+static void test_shared_iouring(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_IOURING, true);
+}
+
+static void test_private_iouring(int fd, size_t size)
+{
+ do_test(fd, size, TEST_TYPE_IOURING, false);
+}
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+static const struct test_case test_cases[] = {
+ {
+ "R/W longterm GUP pin in MAP_SHARED file mapping",
+ test_shared_rw_pin,
+ },
+ {
+ "R/W longterm GUP-fast pin in MAP_SHARED file mapping",
+ test_shared_rw_fast_pin,
+ },
+ {
+ "R/O longterm GUP pin in MAP_SHARED file mapping",
+ test_shared_ro_pin,
+ },
+ {
+ "R/O longterm GUP-fast pin in MAP_SHARED file mapping",
+ test_shared_ro_fast_pin,
+ },
+ {
+ "R/W longterm GUP pin in MAP_PRIVATE file mapping",
+ test_private_rw_pin,
+ },
+ {
+ "R/W longterm GUP-fast pin in MAP_PRIVATE file mapping",
+ test_private_rw_fast_pin,
+ },
+ {
+ "R/O longterm GUP pin in MAP_PRIVATE file mapping",
+ test_private_ro_pin,
+ },
+ {
+ "R/O longterm GUP-fast pin in MAP_PRIVATE file mapping",
+ test_private_ro_fast_pin,
+ },
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+ {
+ "io_uring fixed buffer with MAP_SHARED file mapping",
+ test_shared_iouring,
+ },
+ {
+ "io_uring fixed buffer with MAP_PRIVATE file mapping",
+ test_private_iouring,
+ },
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+};
+
+static void run_test_case(struct test_case const *test_case)
+{
+ int i;
+
+ run_with_memfd(test_case->fn, test_case->desc);
+ run_with_tmpfile(test_case->fn, test_case->desc);
+ run_with_local_tmpfile(test_case->fn, test_case->desc);
+ for (i = 0; i < nr_hugetlbsizes; i++)
+ run_with_memfd_hugetlb(test_case->fn, test_case->desc,
+ hugetlbsizes[i]);
+}
+
+static int tests_per_test_case(void)
+{
+ return 3 + nr_hugetlbsizes;
+}
+
+int main(int argc, char **argv)
+{
+ int i;
+
+ pagesize = getpagesize();
+ nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
+ ARRAY_SIZE(hugetlbsizes));
+
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(test_cases) * tests_per_test_case());
+
+ gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+
+ for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+ run_test_case(&test_cases[i]);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c
new file mode 100644
index 000000000000..bdeaac67ff9a
--- /dev/null
+++ b/tools/testing/selftests/mm/gup_test.c
@@ -0,0 +1,272 @@
+#define __SANE_USERSPACE_TYPES__ // Use ll64
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <assert.h>
+#include <mm/gup_test.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define MB (1UL << 20)
+
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
+#define FOLL_TOUCH 0x02 /* mark page accessed */
+
+#define GUP_TEST_FILE "/sys/kernel/debug/gup_test"
+
+static unsigned long cmd = GUP_FAST_BENCHMARK;
+static int gup_fd, repeats = 1;
+static unsigned long size = 128 * MB;
+/* Serialize prints */
+static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static char *cmd_to_str(unsigned long cmd)
+{
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ return "GUP_FAST_BENCHMARK";
+ case PIN_FAST_BENCHMARK:
+ return "PIN_FAST_BENCHMARK";
+ case PIN_LONGTERM_BENCHMARK:
+ return "PIN_LONGTERM_BENCHMARK";
+ case GUP_BASIC_TEST:
+ return "GUP_BASIC_TEST";
+ case PIN_BASIC_TEST:
+ return "PIN_BASIC_TEST";
+ case DUMP_USER_PAGES_TEST:
+ return "DUMP_USER_PAGES_TEST";
+ }
+ return "Unknown command";
+}
+
+void *gup_thread(void *data)
+{
+ struct gup_test gup = *(struct gup_test *)data;
+ int i, status;
+
+ /* Only report timing information on the *_BENCHMARK commands: */
+ if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
+ (cmd == PIN_LONGTERM_BENCHMARK)) {
+ for (i = 0; i < repeats; i++) {
+ gup.size = size;
+ status = ioctl(gup_fd, cmd, &gup);
+ if (status)
+ break;
+
+ pthread_mutex_lock(&print_mutex);
+ ksft_print_msg("%s: Time: get:%lld put:%lld us",
+ cmd_to_str(cmd), gup.get_delta_usec,
+ gup.put_delta_usec);
+ if (gup.size != size)
+ ksft_print_msg(", truncated (size: %lld)", gup.size);
+ ksft_print_msg("\n");
+ pthread_mutex_unlock(&print_mutex);
+ }
+ } else {
+ gup.size = size;
+ status = ioctl(gup_fd, cmd, &gup);
+ if (status)
+ goto return_;
+
+ pthread_mutex_lock(&print_mutex);
+ ksft_print_msg("%s: done\n", cmd_to_str(cmd));
+ if (gup.size != size)
+ ksft_print_msg("Truncated (size: %lld)\n", gup.size);
+ pthread_mutex_unlock(&print_mutex);
+ }
+
+return_:
+ ksft_test_result(!status, "ioctl status %d\n", status);
+ return NULL;
+}
+
+int main(int argc, char **argv)
+{
+ struct gup_test gup = { 0 };
+ int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret;
+ int flags = MAP_PRIVATE, touch = 0;
+ char *file = "/dev/zero";
+ pthread_t *tid;
+ char *p;
+
+ while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) {
+ switch (opt) {
+ case 'a':
+ cmd = PIN_FAST_BENCHMARK;
+ break;
+ case 'b':
+ cmd = PIN_BASIC_TEST;
+ break;
+ case 'L':
+ cmd = PIN_LONGTERM_BENCHMARK;
+ break;
+ case 'c':
+ cmd = DUMP_USER_PAGES_TEST;
+ /*
+ * Dump page 0 (index 1). May be overridden later, by
+ * user's non-option arguments.
+ *
+ * .which_pages is zero-based, so that zero can mean "do
+ * nothing".
+ */
+ gup.which_pages[0] = 1;
+ break;
+ case 'p':
+ /* works only with DUMP_USER_PAGES_TEST */
+ gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
+ break;
+ case 'F':
+ /* strtol, so you can pass flags in hex form */
+ gup.gup_flags = strtol(optarg, 0, 0);
+ break;
+ case 'j':
+ nthreads = atoi(optarg);
+ break;
+ case 'm':
+ size = atoi(optarg) * MB;
+ break;
+ case 'r':
+ repeats = atoi(optarg);
+ break;
+ case 'n':
+ nr_pages = atoi(optarg);
+ break;
+ case 't':
+ thp = 1;
+ break;
+ case 'T':
+ thp = 0;
+ break;
+ case 'U':
+ cmd = GUP_BASIC_TEST;
+ break;
+ case 'u':
+ cmd = GUP_FAST_BENCHMARK;
+ break;
+ case 'w':
+ write = 1;
+ break;
+ case 'W':
+ write = 0;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 'S':
+ flags &= ~MAP_PRIVATE;
+ flags |= MAP_SHARED;
+ break;
+ case 'H':
+ flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
+ break;
+ case 'z':
+ /* fault pages in gup, do not fault in userland */
+ touch = 1;
+ break;
+ default:
+ ksft_exit_fail_msg("Wrong argument\n");
+ }
+ }
+
+ if (optind < argc) {
+ int extra_arg_count = 0;
+ /*
+ * For example:
+ *
+ * ./gup_test -c 0 1 0x1001
+ *
+ * ...to dump pages 0, 1, and 4097
+ */
+
+ while ((optind < argc) &&
+ (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
+ /*
+ * Do the 1-based indexing here, so that the user can
+ * use normal 0-based indexing on the command line.
+ */
+ long page_index = strtol(argv[optind], 0, 0) + 1;
+
+ gup.which_pages[extra_arg_count] = page_index;
+ extra_arg_count++;
+ optind++;
+ }
+ }
+
+ ksft_print_header();
+ ksft_set_plan(nthreads);
+
+ filed = open(file, O_RDWR|O_CREAT, 0664);
+ if (filed < 0)
+ ksft_exit_fail_msg("Unable to open %s: %s\n", file, strerror(errno));
+
+ gup.nr_pages_per_call = nr_pages;
+ if (write)
+ gup.gup_flags |= FOLL_WRITE;
+
+ gup_fd = open(GUP_TEST_FILE, O_RDWR);
+ if (gup_fd == -1) {
+ switch (errno) {
+ case EACCES:
+ if (getuid())
+ ksft_print_msg("Please run this test as root\n");
+ break;
+ case ENOENT:
+ if (opendir("/sys/kernel/debug") == NULL)
+ ksft_print_msg("mount debugfs at /sys/kernel/debug\n");
+ ksft_print_msg("check if CONFIG_GUP_TEST is enabled in kernel config\n");
+ break;
+ default:
+ ksft_print_msg("failed to open %s: %s\n", GUP_TEST_FILE, strerror(errno));
+ break;
+ }
+ ksft_test_result_skip("Please run this test as root\n");
+ ksft_exit_pass();
+ }
+
+ p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
+ if (p == MAP_FAILED)
+ ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
+ gup.addr = (unsigned long)p;
+
+ if (thp == 1)
+ madvise(p, size, MADV_HUGEPAGE);
+ else if (thp == 0)
+ madvise(p, size, MADV_NOHUGEPAGE);
+
+ /*
+ * FOLL_TOUCH, in gup_test, is used as an either/or case: either
+ * fault pages in from the kernel via FOLL_TOUCH, or fault them
+ * in here, from user space. This allows comparison of performance
+ * between those two cases.
+ */
+ if (touch) {
+ gup.gup_flags |= FOLL_TOUCH;
+ } else {
+ for (; (unsigned long)p < gup.addr + size; p += psize())
+ p[0] = 0;
+ }
+
+ tid = malloc(sizeof(pthread_t) * nthreads);
+ assert(tid);
+ for (i = 0; i < nthreads; i++) {
+ ret = pthread_create(&tid[i], NULL, gup_thread, &gup);
+ assert(ret == 0);
+ }
+ for (i = 0; i < nthreads; i++) {
+ ret = pthread_join(tid[i], NULL);
+ assert(ret == 0);
+ }
+
+ free(tid);
+
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
new file mode 100644
index 000000000000..141bf63cbe05
--- /dev/null
+++ b/tools/testing/selftests/mm/hmm-tests.c
@@ -0,0 +1,2059 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HMM stands for Heterogeneous Memory Management, it is a helper layer inside
+ * the linux kernel to help device drivers mirror a process address space in
+ * the device. This allows the device to use the same address space which
+ * makes communication and data exchange a lot easier.
+ *
+ * This framework's sole purpose is to exercise various code paths inside
+ * the kernel to make sure that HMM performs as expected and to flush out any
+ * bugs.
+ */
+
+#include "../kselftest_harness.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <strings.h>
+#include <time.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+
+/*
+ * This is a private UAPI to the kernel test module so it isn't exported
+ * in the usual include/uapi/... directory.
+ */
+#include <lib/test_hmm_uapi.h>
+#include <mm/gup_test.h>
+
+struct hmm_buffer {
+ void *ptr;
+ void *mirror;
+ unsigned long size;
+ int fd;
+ uint64_t cpages;
+ uint64_t faults;
+};
+
+enum {
+ HMM_PRIVATE_DEVICE_ONE,
+ HMM_PRIVATE_DEVICE_TWO,
+ HMM_COHERENCE_DEVICE_ONE,
+ HMM_COHERENCE_DEVICE_TWO,
+};
+
+#define TWOMEG (1 << 21)
+#define HMM_BUFFER_SIZE (1024 << 12)
+#define HMM_PATH_MAX 64
+#define NTIMES 10
+
+#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+
+#ifndef FOLL_WRITE
+#define FOLL_WRITE 0x01 /* check pte is writable */
+#endif
+
+#ifndef FOLL_LONGTERM
+#define FOLL_LONGTERM 0x100 /* mapping lifetime is indefinite */
+#endif
+FIXTURE(hmm)
+{
+ int fd;
+ unsigned int page_size;
+ unsigned int page_shift;
+};
+
+FIXTURE_VARIANT(hmm)
+{
+ int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+ .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+ .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
+FIXTURE(hmm2)
+{
+ int fd0;
+ int fd1;
+ unsigned int page_size;
+ unsigned int page_shift;
+};
+
+FIXTURE_VARIANT(hmm2)
+{
+ int device_number0;
+ int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+ .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+ .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+ .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+ .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
+static int hmm_open(int unit)
+{
+ char pathname[HMM_PATH_MAX];
+ int fd;
+
+ snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit);
+ fd = open(pathname, O_RDWR, 0);
+ if (fd < 0)
+ fprintf(stderr, "could not open hmm dmirror driver (%s)\n",
+ pathname);
+ return fd;
+}
+
+static bool hmm_is_coherent_type(int dev_num)
+{
+ return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
+FIXTURE_SETUP(hmm)
+{
+ self->page_size = sysconf(_SC_PAGE_SIZE);
+ self->page_shift = ffs(self->page_size) - 1;
+
+ self->fd = hmm_open(variant->device_number);
+ if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+ SKIP(return, "DEVICE_COHERENT not available");
+ ASSERT_GE(self->fd, 0);
+}
+
+FIXTURE_SETUP(hmm2)
+{
+ self->page_size = sysconf(_SC_PAGE_SIZE);
+ self->page_shift = ffs(self->page_size) - 1;
+
+ self->fd0 = hmm_open(variant->device_number0);
+ if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+ SKIP(return, "DEVICE_COHERENT not available");
+ ASSERT_GE(self->fd0, 0);
+ self->fd1 = hmm_open(variant->device_number1);
+ ASSERT_GE(self->fd1, 0);
+}
+
+FIXTURE_TEARDOWN(hmm)
+{
+ int ret = close(self->fd);
+
+ ASSERT_EQ(ret, 0);
+ self->fd = -1;
+}
+
+FIXTURE_TEARDOWN(hmm2)
+{
+ int ret = close(self->fd0);
+
+ ASSERT_EQ(ret, 0);
+ self->fd0 = -1;
+
+ ret = close(self->fd1);
+ ASSERT_EQ(ret, 0);
+ self->fd1 = -1;
+}
+
+static int hmm_dmirror_cmd(int fd,
+ unsigned long request,
+ struct hmm_buffer *buffer,
+ unsigned long npages)
+{
+ struct hmm_dmirror_cmd cmd;
+ int ret;
+
+ /* Simulate a device reading system memory. */
+ cmd.addr = (__u64)buffer->ptr;
+ cmd.ptr = (__u64)buffer->mirror;
+ cmd.npages = npages;
+
+ for (;;) {
+ ret = ioctl(fd, request, &cmd);
+ if (ret == 0)
+ break;
+ if (errno == EINTR)
+ continue;
+ return -errno;
+ }
+ buffer->cpages = cmd.cpages;
+ buffer->faults = cmd.faults;
+
+ return 0;
+}
+
+static void hmm_buffer_free(struct hmm_buffer *buffer)
+{
+ if (buffer == NULL)
+ return;
+
+ if (buffer->ptr)
+ munmap(buffer->ptr, buffer->size);
+ free(buffer->mirror);
+ free(buffer);
+}
+
+/*
+ * Create a temporary file that will be deleted on close.
+ */
+static int hmm_create_file(unsigned long size)
+{
+ char path[HMM_PATH_MAX];
+ int fd;
+
+ strcpy(path, "/tmp");
+ fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600);
+ if (fd >= 0) {
+ int r;
+
+ do {
+ r = ftruncate(fd, size);
+ } while (r == -1 && errno == EINTR);
+ if (!r)
+ return fd;
+ close(fd);
+ }
+ return -1;
+}
+
+/*
+ * Return a random unsigned number.
+ */
+static unsigned int hmm_random(void)
+{
+ static int fd = -1;
+ unsigned int r;
+
+ if (fd < 0) {
+ fd = open("/dev/urandom", O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "%s:%d failed to open /dev/urandom\n",
+ __FILE__, __LINE__);
+ return ~0U;
+ }
+ }
+ read(fd, &r, sizeof(r));
+ return r;
+}
+
+static void hmm_nanosleep(unsigned int n)
+{
+ struct timespec t;
+
+ t.tv_sec = 0;
+ t.tv_nsec = n;
+ nanosleep(&t, NULL);
+}
+
+static int hmm_migrate_sys_to_dev(int fd,
+ struct hmm_buffer *buffer,
+ unsigned long npages)
+{
+ return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+ struct hmm_buffer *buffer,
+ unsigned long npages)
+{
+ return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
+/*
+ * Simple NULL test of device open/close.
+ */
+TEST_F(hmm, open_close)
+{
+}
+
+/*
+ * Read private anonymous memory.
+ */
+TEST_F(hmm, anon_read)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+ int val;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /*
+ * Initialize buffer in system memory but leave the first two pages
+ * zero (pte_none and pfn_zero).
+ */
+ i = 2 * self->page_size / sizeof(*ptr);
+ for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Set buffer permission to read-only. */
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Populate the CPU page table with a special zero page. */
+ val = *(int *)(buffer->ptr + self->page_size);
+ ASSERT_EQ(val, 0);
+
+ /* Simulate a device reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device read. */
+ ptr = buffer->mirror;
+ for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], 0);
+ for (; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Read private anonymous memory which has been protected with
+ * mprotect() PROT_NONE.
+ */
+TEST_F(hmm, anon_read_prot)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Initialize mirror buffer so we can verify it isn't written. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = -i;
+
+ /* Protect buffer from reading. */
+ ret = mprotect(buffer->ptr, size, PROT_NONE);
+ ASSERT_EQ(ret, 0);
+
+ /* Simulate a device reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, -EFAULT);
+
+ /* Allow CPU to read the buffer so we can check it. */
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], -i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory.
+ */
+TEST_F(hmm, anon_write)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize data that the device will write to buffer->ptr. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory which has been protected with
+ * mprotect() PROT_READ.
+ */
+TEST_F(hmm, anon_write_prot)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Simulate a device reading a zero page of memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, 1);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Initialize data that the device will write to buffer->ptr. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, -EPERM);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], 0);
+
+ /* Now allow writing and see that the zero page is replaced. */
+ ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Check that a device writing an anonymous private mapping
+ * will copy-on-write if a child process inherits the mapping.
+ */
+TEST_F(hmm, anon_write_child)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ pid_t pid;
+ int child_fd;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer->ptr so we can tell if it is written. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Initialize data that the device will write to buffer->ptr. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = -i;
+
+ pid = fork();
+ if (pid == -1)
+ ASSERT_EQ(pid, 0);
+ if (pid != 0) {
+ waitpid(pid, &ret, 0);
+ ASSERT_EQ(WIFEXITED(ret), 1);
+
+ /* Check that the parent's buffer did not change. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+ return;
+ }
+
+ /* Check that we see the parent's values. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], -i);
+
+ /* The child process needs its own mirror to its own mm. */
+ child_fd = hmm_open(0);
+ ASSERT_GE(child_fd, 0);
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], -i);
+
+ close(child_fd);
+ exit(0);
+}
+
+/*
+ * Check that a device writing an anonymous shared mapping
+ * will not copy-on-write if a child process inherits the mapping.
+ */
+TEST_F(hmm, anon_write_child_shared)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ pid_t pid;
+ int child_fd;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer->ptr so we can tell if it is written. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Initialize data that the device will write to buffer->ptr. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = -i;
+
+ pid = fork();
+ if (pid == -1)
+ ASSERT_EQ(pid, 0);
+ if (pid != 0) {
+ waitpid(pid, &ret, 0);
+ ASSERT_EQ(WIFEXITED(ret), 1);
+
+ /* Check that the parent's buffer did change. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], -i);
+ return;
+ }
+
+ /* Check that we see the parent's values. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], -i);
+
+ /* The child process needs its own mirror to its own mm. */
+ child_fd = hmm_open(0);
+ ASSERT_GE(child_fd, 0);
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], -i);
+
+ close(child_fd);
+ exit(0);
+}
+
+/*
+ * Write private anonymous huge page.
+ */
+TEST_F(hmm, anon_write_huge)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ void *old_ptr;
+ void *map;
+ int *ptr;
+ int ret;
+
+ size = 2 * TWOMEG;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ size = TWOMEG;
+ npages = size >> self->page_shift;
+ map = (void *)ALIGN((uintptr_t)buffer->ptr, size);
+ ret = madvise(map, size, MADV_HUGEPAGE);
+ ASSERT_EQ(ret, 0);
+ old_ptr = buffer->ptr;
+ buffer->ptr = map;
+
+ /* Initialize data that the device will write to buffer->ptr. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ buffer->ptr = old_ptr;
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Read numeric data from raw and tagged kernel status files. Used to read
+ * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
+ */
+static long file_read_ulong(char *file, const char *tag)
+{
+ int fd;
+ char buf[2048];
+ int len;
+ char *p, *q;
+ long val;
+
+ fd = open(file, O_RDONLY);
+ if (fd < 0) {
+ /* Error opening the file */
+ return -1;
+ }
+
+ len = read(fd, buf, sizeof(buf));
+ close(fd);
+ if (len < 0) {
+ /* Error in reading the file */
+ return -1;
+ }
+ if (len == sizeof(buf)) {
+ /* Error file is too large */
+ return -1;
+ }
+ buf[len] = '\0';
+
+ /* Search for a tag if provided */
+ if (tag) {
+ p = strstr(buf, tag);
+ if (!p)
+ return -1; /* looks like the line we want isn't there */
+ p += strlen(tag);
+ } else
+ p = buf;
+
+ val = strtol(p, &q, 0);
+ if (*q != ' ') {
+ /* Error parsing the file */
+ return -1;
+ }
+
+ return val;
+}
+
+/*
+ * Write huge TLBFS page.
+ */
+TEST_F(hmm, anon_write_hugetlbfs)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long default_hsize;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+ if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+ SKIP(return, "Huge page size could not be determined");
+ default_hsize = default_hsize*1024; /* KB to B */
+
+ size = ALIGN(TWOMEG, default_hsize);
+ npages = size >> self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (buffer->ptr == MAP_FAILED) {
+ free(buffer);
+ SKIP(return, "Huge page could not be allocated");
+ }
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Initialize data that the device will write to buffer->ptr. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ munmap(buffer->ptr, buffer->size);
+ buffer->ptr = NULL;
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Read mmap'ed file memory.
+ */
+TEST_F(hmm, file_read)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+ int fd;
+ ssize_t len;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ fd = hmm_create_file(size);
+ ASSERT_GE(fd, 0);
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = fd;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Write initial contents of the file. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+ len = pwrite(fd, buffer->mirror, size, 0);
+ ASSERT_EQ(len, size);
+ memset(buffer->mirror, 0, size);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ,
+ MAP_SHARED,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Simulate a device reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Write mmap'ed file memory.
+ */
+TEST_F(hmm, file_write)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+ int fd;
+ ssize_t len;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ fd = hmm_create_file(size);
+ ASSERT_GE(fd, 0);
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = fd;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize data that the device will write to buffer->ptr. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device wrote. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Check that the device also wrote the file. */
+ len = pread(fd, buffer->mirror, size, 0);
+ ASSERT_EQ(len, size);
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory.
+ */
+TEST_F(hmm, migrate)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory and fault some of it back
+ * to system memory, then try migrating the resulting mix of system and device
+ * private memory to the device.
+ */
+TEST_F(hmm, migrate_fault)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Fault half the pages back to system memory and check them. */
+ for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Migrate memory to the device again. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+TEST_F(hmm, migrate_release)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Release device memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+
+ /* Fault pages back to system memory and check them. */
+ for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous shared memory to device private memory.
+ */
+TEST_F(hmm, migrate_shared)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, -ENOENT);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Try to migrate various memory types to device private memory.
+ */
+TEST_F(hmm2, migrate_mixed)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ int *ptr;
+ unsigned char *p;
+ int ret;
+ int val;
+
+ npages = 6;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+ p = buffer->ptr;
+
+ /* Migrating a protected area should be an error. */
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
+ ASSERT_EQ(ret, -EINVAL);
+
+ /* Punch a hole after the first page address. */
+ ret = munmap(buffer->ptr + self->page_size, self->page_size);
+ ASSERT_EQ(ret, 0);
+
+ /* We expect an error if the vma doesn't cover the range. */
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3);
+ ASSERT_EQ(ret, -EINVAL);
+
+ /* Page 2 will be a read-only zero page. */
+ ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 2 * self->page_size);
+ val = *ptr + 3;
+ ASSERT_EQ(val, 3);
+
+ /* Page 3 will be read-only. */
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 3 * self->page_size);
+ *ptr = val;
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Page 4-5 will be read-write. */
+ ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 4 * self->page_size);
+ *ptr = val;
+ ptr = (int *)(buffer->ptr + 5 * self->page_size);
+ *ptr = val;
+
+ /* Now try to migrate pages 2-5 to device 1. */
+ buffer->ptr = p + 2 * self->page_size;
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, 4);
+
+ /* Page 5 won't be migrated to device 0 because it's on device 1. */
+ buffer->ptr = p + 5 * self->page_size;
+ ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
+ ASSERT_EQ(ret, -ENOENT);
+ buffer->ptr = p;
+
+ buffer->ptr = p;
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device memory and back to system memory
+ * multiple times. In case of private zone configuration, this is done
+ * through fault pages accessed by CPU. In case of coherent zone configuration,
+ * the pages from the device should be explicitly migrated back to system memory.
+ * The reason is Coherent device zone has coherent access by CPU, therefore
+ * it will not generate any page fault.
+ */
+TEST_F(hmm, migrate_multiple)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ unsigned long c;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ for (c = 0; c < NTIMES; c++) {
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Migrate back to system memory and check them. */
+ if (hmm_is_coherent_type(variant->device_number)) {
+ ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ }
+
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+ }
+}
+
+/*
+ * Read anonymous memory multiple times.
+ */
+TEST_F(hmm, anon_read_multiple)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ unsigned long c;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ for (c = 0; c < NTIMES; c++) {
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i + c;
+
+ /* Simulate a device reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
+ npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i + c);
+
+ hmm_buffer_free(buffer);
+ }
+}
+
+void *unmap_buffer(void *p)
+{
+ struct hmm_buffer *buffer = p;
+
+ /* Delay for a bit and then unmap buffer while it is being read. */
+ hmm_nanosleep(hmm_random() % 32000);
+ munmap(buffer->ptr + buffer->size / 2, buffer->size / 2);
+ buffer->ptr = NULL;
+
+ return NULL;
+}
+
+/*
+ * Try reading anonymous memory while it is being unmapped.
+ */
+TEST_F(hmm, anon_teardown)
+{
+ unsigned long npages;
+ unsigned long size;
+ unsigned long c;
+ void *ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ for (c = 0; c < NTIMES; ++c) {
+ pthread_t thread;
+ struct hmm_buffer *buffer;
+ unsigned long i;
+ int *ptr;
+ int rc;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i + c;
+
+ rc = pthread_create(&thread, NULL, unmap_buffer, buffer);
+ ASSERT_EQ(rc, 0);
+
+ /* Simulate a device reading system memory. */
+ rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
+ npages);
+ if (rc == 0) {
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror;
+ i < size / sizeof(*ptr);
+ ++i)
+ ASSERT_EQ(ptr[i], i + c);
+ }
+
+ pthread_join(thread, &ret);
+ hmm_buffer_free(buffer);
+ }
+}
+
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm, mixedmap)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned char *m;
+ int ret;
+
+ npages = 1;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(npages);
+ ASSERT_NE(buffer->mirror, NULL);
+
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE,
+ self->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device saw. */
+ m = buffer->mirror;
+ ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm2, snapshot)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ int *ptr;
+ unsigned char *p;
+ unsigned char *m;
+ int ret;
+ int val;
+
+ npages = 7;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(npages);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+ p = buffer->ptr;
+
+ /* Punch a hole after the first page address. */
+ ret = munmap(buffer->ptr + self->page_size, self->page_size);
+ ASSERT_EQ(ret, 0);
+
+ /* Page 2 will be read-only zero page. */
+ ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 2 * self->page_size);
+ val = *ptr + 3;
+ ASSERT_EQ(val, 3);
+
+ /* Page 3 will be read-only. */
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 3 * self->page_size);
+ *ptr = val;
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Page 4-6 will be read-write. */
+ ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 4 * self->page_size);
+ *ptr = val;
+
+ /* Page 5 will be migrated to device 0. */
+ buffer->ptr = p + 5 * self->page_size;
+ ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, 1);
+
+ /* Page 6 will be migrated to device 1. */
+ buffer->ptr = p + 6 * self->page_size;
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, 1);
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ buffer->ptr = p;
+ ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device saw. */
+ m = buffer->mirror;
+ ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
+ ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
+ ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
+ ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
+ ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
+ if (!hmm_is_coherent_type(variant->device_number0)) {
+ ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
+ HMM_DMIRROR_PROT_WRITE);
+ ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
+ } else {
+ ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL |
+ HMM_DMIRROR_PROT_WRITE);
+ ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE |
+ HMM_DMIRROR_PROT_WRITE);
+ }
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
+ * should be mapped by a large page table entry.
+ */
+TEST_F(hmm, compound)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long default_hsize;
+ int *ptr;
+ unsigned char *m;
+ int ret;
+ unsigned long i;
+
+ /* Skip test if we can't allocate a hugetlbfs page. */
+
+ default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+ if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+ SKIP(return, "Huge page size could not be determined");
+ default_hsize = default_hsize*1024; /* KB to B */
+
+ size = ALIGN(TWOMEG, default_hsize);
+ npages = size >> self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (buffer->ptr == MAP_FAILED) {
+ free(buffer);
+ return;
+ }
+
+ buffer->size = size;
+ buffer->mirror = malloc(npages);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Initialize the pages the device will snapshot in buffer->ptr. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device saw. */
+ m = buffer->mirror;
+ for (i = 0; i < npages; ++i)
+ ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
+ HMM_DMIRROR_PROT_PMD);
+
+ /* Make the region read-only. */
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device saw. */
+ m = buffer->mirror;
+ for (i = 0; i < npages; ++i)
+ ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
+ HMM_DMIRROR_PROT_PMD);
+
+ munmap(buffer->ptr, buffer->size);
+ buffer->ptr = NULL;
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Test two devices reading the same memory (double mapped).
+ */
+TEST_F(hmm2, double_map)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = 6;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Make region read-only. */
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Simulate device 0 reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Simulate device 1 reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Migrate pages to device 1 and try to read from device 0. */
+ ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what device 0 read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Basic check of exclusive faulting.
+ */
+TEST_F(hmm, exclusive)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Map memory exclusively for device access. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Fault pages back to system memory and check them. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i]++, i);
+
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i+1);
+
+ /* Check atomic access revoked */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+
+ hmm_buffer_free(buffer);
+}
+
+TEST_F(hmm, exclusive_mprotect)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Map memory exclusively for device access. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Simulate a device writing system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+ ASSERT_EQ(ret, -EPERM);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Check copy-on-write works.
+ */
+TEST_F(hmm, exclusive_cow)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Map memory exclusively for device access. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ fork();
+
+ /* Fault pages back to system memory and check them. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i]++, i);
+
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i+1);
+
+ hmm_buffer_free(buffer);
+}
+
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+ int npages, int size, int flags)
+{
+ struct gup_test gup = {
+ .nr_pages_per_call = npages,
+ .addr = addr,
+ .gup_flags = FOLL_WRITE | flags,
+ .size = size,
+ };
+
+ if (ioctl(gup_fd, cmd, &gup)) {
+ perror("ioctl on error\n");
+ return errno;
+ }
+
+ return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+ struct hmm_buffer *buffer;
+ int gup_fd;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+ unsigned char *m;
+
+ gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ if (gup_fd == -1)
+ SKIP(return, "Skipping test, could not find gup_test driver");
+
+ npages = 4;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ /* Check what the device read. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ ASSERT_EQ(gup_test_exec(gup_fd,
+ (unsigned long)buffer->ptr,
+ GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+ ASSERT_EQ(gup_test_exec(gup_fd,
+ (unsigned long)buffer->ptr + 1 * self->page_size,
+ GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+ ASSERT_EQ(gup_test_exec(gup_fd,
+ (unsigned long)buffer->ptr + 2 * self->page_size,
+ PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
+ ASSERT_EQ(gup_test_exec(gup_fd,
+ (unsigned long)buffer->ptr + 3 * self->page_size,
+ PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
+
+ /* Take snapshot to CPU pagetables */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ m = buffer->mirror;
+ if (hmm_is_coherent_type(variant->device_number)) {
+ ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
+ ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
+ } else {
+ ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+ ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+ }
+ ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
+ ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]);
+ /*
+ * Check again the content on the pages. Make sure there's no
+ * corrupted data.
+ */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ close(gup_fd);
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+ unsigned char *m;
+ pid_t pid;
+ int status;
+
+ npages = 4;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Migrate memory to device. */
+
+ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ pid = fork();
+ if (pid == -1)
+ ASSERT_EQ(pid, 0);
+ if (!pid) {
+ /* Child process waitd for SIGTERM from the parent. */
+ while (1) {
+ }
+ perror("Should not reach this\n");
+ exit(0);
+ }
+ /* Parent process writes to COW pages(s) and gets a
+ * new copy in system. In case of device private pages,
+ * this write causes a migration to system mem first.
+ */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Terminate child and wait */
+ EXPECT_EQ(0, kill(pid, SIGTERM));
+ EXPECT_EQ(pid, waitpid(pid, &status, 0));
+ EXPECT_NE(0, WIFSIGNALED(status));
+ EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+ /* Take snapshot to CPU pagetables */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+ m = buffer->mirror;
+ for (i = 0; i < npages; i++)
+ ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+ hmm_buffer_free(buffer);
+}
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c
new file mode 100644
index 000000000000..3b1b532f1cbb
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-mmap.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call. Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include "../kselftest.h"
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+static void check_bytes(char *addr)
+{
+ ksft_print_msg("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+ unsigned long i;
+
+ for (i = 0; i < LENGTH; i++)
+ *(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+ unsigned long i;
+
+ check_bytes(addr);
+ for (i = 0; i < LENGTH; i++)
+ if (*(addr + i) != (char)i) {
+ ksft_print_msg("Error: Mismatch at %lu\n", i);
+ return 1;
+ }
+ return 0;
+}
+
+int main(void)
+{
+ void *addr;
+ int fd, ret;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ fd = memfd_create("hugepage-mmap", MFD_HUGETLB);
+ if (fd < 0)
+ ksft_exit_fail_msg("memfd_create() failed: %s\n", strerror(errno));
+
+ addr = mmap(NULL, LENGTH, PROTECTION, MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED) {
+ close(fd);
+ ksft_exit_fail_msg("mmap(): %s\n", strerror(errno));
+ }
+
+ ksft_print_msg("Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr);
+ ret = read_bytes(addr);
+
+ munmap(addr, LENGTH);
+ close(fd);
+
+ ksft_test_result(!ret, "Read same data\n");
+
+ ksft_exit(!ret);
+}
diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
new file mode 100644
index 000000000000..c463d1c09c9b
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mremap:
+ *
+ * Example of remapping huge page memory in a user application using the
+ * mremap system call. The path to a file in a hugetlbfs filesystem must
+ * be passed as the last argument to this test. The amount of memory used
+ * by this test in MBs can optionally be passed as an argument. If no memory
+ * amount is passed, the default amount is 10MB.
+ *
+ * To make sure the test triggers pmd sharing and goes through the 'unshare'
+ * path in the mremap code use 1GB (1024) or more.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h> /* Definition of O_* constants */
+#include <sys/syscall.h> /* Definition of SYS_* constants */
+#include <linux/userfaultfd.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <stdbool.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define DEFAULT_LENGTH_MB 10UL
+#define MB_TO_BYTES(x) (x * 1024 * 1024)
+
+#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
+#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
+
+static void check_bytes(char *addr)
+{
+ ksft_print_msg("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr, size_t len)
+{
+ unsigned long i;
+
+ for (i = 0; i < len; i++)
+ *(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr, size_t len)
+{
+ unsigned long i;
+
+ check_bytes(addr);
+ for (i = 0; i < len; i++)
+ if (*(addr + i) != (char)i) {
+ ksft_print_msg("Mismatch at %lu\n", i);
+ return 1;
+ }
+ return 0;
+}
+
+static void register_region_with_uffd(char *addr, size_t len)
+{
+ long uffd; /* userfaultfd file descriptor */
+ struct uffdio_api uffdio_api;
+
+ /* Create and enable userfaultfd object. */
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd == -1)
+ ksft_exit_fail_msg("userfaultfd: %s\n", strerror(errno));
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
+ ksft_exit_fail_msg("ioctl-UFFDIO_API: %s\n", strerror(errno));
+
+ /* Create a private anonymous mapping. The memory will be
+ * demand-zero paged--that is, not yet allocated. When we
+ * actually touch the memory, it will be allocated via
+ * the userfaultfd.
+ */
+
+ addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
+
+ ksft_print_msg("Address returned by mmap() = %p\n", addr);
+
+ /* Register the memory range of the mapping we just created for
+ * handling by the userfaultfd object. In mode, we request to track
+ * missing pages (i.e., pages that have not yet been faulted in).
+ */
+ if (uffd_register(uffd, addr, len, true, false, false))
+ ksft_exit_fail_msg("ioctl-UFFDIO_REGISTER: %s\n", strerror(errno));
+}
+
+int main(int argc, char *argv[])
+{
+ size_t length = 0;
+ int ret = 0, fd;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ if (argc >= 2 && !strcmp(argv[1], "-h"))
+ ksft_exit_fail_msg("Usage: %s [length_in_MB]\n", argv[0]);
+
+ /* Read memory length as the first arg if valid, otherwise fallback to
+ * the default length.
+ */
+ if (argc >= 2)
+ length = (size_t)atoi(argv[1]);
+ else
+ length = DEFAULT_LENGTH_MB;
+
+ length = MB_TO_BYTES(length);
+ fd = memfd_create(argv[0], MFD_HUGETLB);
+ if (fd < 0)
+ ksft_exit_fail_msg("Open failed: %s\n", strerror(errno));
+
+ /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
+ unsigned long suggested_addr = 0x7eaa40000000;
+ void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
+ MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+ ksft_print_msg("Map haddr: Returned address is %p\n", haddr);
+ if (haddr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap1: %s\n", strerror(errno));
+
+ /* mmap again to a dummy address to hopefully trigger pmd sharing. */
+ suggested_addr = 0x7daa40000000;
+ void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
+ MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+ ksft_print_msg("Map daddr: Returned address is %p\n", daddr);
+ if (daddr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap3: %s\n", strerror(errno));
+
+ suggested_addr = 0x7faa40000000;
+ void *vaddr =
+ mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
+ ksft_print_msg("Map vaddr: Returned address is %p\n", vaddr);
+ if (vaddr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap2: %s\n", strerror(errno));
+
+ register_region_with_uffd(haddr, length);
+
+ void *addr = mremap(haddr, length, length,
+ MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mremap: %s\n", strerror(errno));
+
+ ksft_print_msg("Mremap: Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr, length);
+ ret = read_bytes(addr, length);
+
+ munmap(addr, length);
+
+ addr = mremap(addr, length, length, 0);
+ if (addr != MAP_FAILED)
+ ksft_exit_fail_msg("mremap: Expected failure, but call succeeded\n");
+
+ close(fd);
+
+ ksft_test_result(!ret, "Read same data\n");
+ ksft_exit(!ret);
+}
diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c
new file mode 100644
index 000000000000..ef06260802b5
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-shm.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls. In this example the app is requesting 256MB of
+ * memory that is backed by huge pages. The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x) printf(x)
+
+int main(void)
+{
+ int shmid;
+ unsigned long i;
+ char *shmaddr;
+
+ shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+ if (shmid < 0) {
+ perror("shmget");
+ exit(1);
+ }
+ printf("shmid: 0x%x\n", shmid);
+
+ shmaddr = shmat(shmid, NULL, 0);
+ if (shmaddr == (char *)-1) {
+ perror("Shared memory attach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(2);
+ }
+ printf("shmaddr: %p\n", shmaddr);
+
+ dprintf("Starting the writes:\n");
+ for (i = 0; i < LENGTH; i++) {
+ shmaddr[i] = (char)(i);
+ if (!(i % (1024 * 1024)))
+ dprintf(".");
+ }
+ dprintf("\n");
+
+ dprintf("Starting the Check...");
+ for (i = 0; i < LENGTH; i++)
+ if (shmaddr[i] != (char)i) {
+ printf("\nIndex %lu mismatched\n", i);
+ exit(3);
+ }
+ dprintf("Done.\n");
+
+ if (shmdt((const void *)shmaddr) != 0) {
+ perror("Detach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(4);
+ }
+
+ shmctl(shmid, IPC_RMID, NULL);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c
new file mode 100644
index 000000000000..df366a4d1b92
--- /dev/null
+++ b/tools/testing/selftests/mm/hugepage-vmemmap.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case of using hugepage memory in a user application using the
+ * mmap system call with MAP_HUGETLB flag. Before running this program
+ * make sure the administrator has allocated enough default sized huge
+ * pages to cover the 2 MB allocation.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include "vm_util.h"
+
+#define PAGE_COMPOUND_HEAD (1UL << 15)
+#define PAGE_COMPOUND_TAIL (1UL << 16)
+#define PAGE_HUGE (1UL << 17)
+
+#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE)
+#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE)
+
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1)
+
+static size_t pagesize;
+static size_t maplength;
+
+static void write_bytes(char *addr, size_t length)
+{
+ unsigned long i;
+
+ for (i = 0; i < length; i++)
+ *(addr + i) = (char)i;
+}
+
+static unsigned long virt_to_pfn(void *addr)
+{
+ int fd;
+ unsigned long pagemap;
+
+ fd = open("/proc/self/pagemap", O_RDONLY);
+ if (fd < 0)
+ return -1UL;
+
+ lseek(fd, (unsigned long)addr / pagesize * sizeof(pagemap), SEEK_SET);
+ read(fd, &pagemap, sizeof(pagemap));
+ close(fd);
+
+ return pagemap & ~PM_PFRAME_MASK;
+}
+
+static int check_page_flags(unsigned long pfn)
+{
+ int fd, i;
+ unsigned long pageflags;
+
+ fd = open("/proc/kpageflags", O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
+
+ read(fd, &pageflags, sizeof(pageflags));
+ if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
+ close(fd);
+ printf("Head page flags (%lx) is invalid\n", pageflags);
+ return -1;
+ }
+
+ /*
+ * pages other than the first page must be tail and shouldn't be head;
+ * this also verifies kernel has correctly set the fake page_head to tail
+ * while hugetlb_free_vmemmap is enabled.
+ */
+ for (i = 1; i < maplength / pagesize; i++) {
+ read(fd, &pageflags, sizeof(pageflags));
+ if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
+ (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
+ close(fd);
+ printf("Tail page flags (%lx) is invalid\n", pageflags);
+ return -1;
+ }
+ }
+
+ close(fd);
+
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ void *addr;
+ unsigned long pfn;
+
+ pagesize = psize();
+ maplength = default_huge_page_size();
+ if (!maplength) {
+ printf("Unable to determine huge page size\n");
+ exit(1);
+ }
+
+ addr = mmap(NULL, maplength, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ /* Trigger allocation of HugeTLB page. */
+ write_bytes(addr, maplength);
+
+ pfn = virt_to_pfn(addr);
+ if (pfn == -1UL) {
+ munmap(addr, maplength);
+ perror("virt_to_pfn");
+ exit(1);
+ }
+
+ printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+
+ if (check_page_flags(pfn) < 0) {
+ munmap(addr, maplength);
+ perror("check_page_flags");
+ exit(1);
+ }
+
+ /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+ if (munmap(addr, maplength)) {
+ perror("munmap");
+ exit(1);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
new file mode 100644
index 000000000000..e74107185324
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-madvise:
+ *
+ * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE
+ * on hugetlb mappings.
+ *
+ * Before running this test, make sure the administrator has pre-allocated
+ * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition,
+ * the test takes an argument that is the path to a file in a hugetlbfs
+ * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some
+ * directory.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define MIN_FREE_PAGES 20
+#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */
+
+#define validate_free_pages(exp_free) \
+ do { \
+ int fhp = get_free_hugepages(); \
+ if (fhp != (exp_free)) { \
+ printf("Unexpected number of free huge " \
+ "pages line %d\n", __LINE__); \
+ exit(1); \
+ } \
+ } while (0)
+
+unsigned long huge_page_size;
+unsigned long base_page_size;
+
+void write_fault_pages(void *addr, unsigned long nr_pages)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; i++)
+ *((unsigned long *)(addr + (i * huge_page_size))) = i;
+}
+
+void read_fault_pages(void *addr, unsigned long nr_pages)
+{
+ volatile unsigned long dummy = 0;
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; i++) {
+ dummy += *((unsigned long *)(addr + (i * huge_page_size)));
+
+ /* Prevent the compiler from optimizing out the entire loop: */
+ asm volatile("" : "+r" (dummy));
+ }
+}
+
+int main(int argc, char **argv)
+{
+ unsigned long free_hugepages;
+ void *addr, *addr2;
+ int fd;
+ int ret;
+
+ huge_page_size = default_huge_page_size();
+ if (!huge_page_size) {
+ printf("Unable to determine huge page size, exiting!\n");
+ exit(1);
+ }
+ base_page_size = sysconf(_SC_PAGE_SIZE);
+ if (!huge_page_size) {
+ printf("Unable to determine base page size, exiting!\n");
+ exit(1);
+ }
+
+ free_hugepages = get_free_hugepages();
+ if (free_hugepages < MIN_FREE_PAGES) {
+ printf("Not enough free huge pages to test, exiting!\n");
+ exit(KSFT_SKIP);
+ }
+
+ fd = memfd_create(argv[0], MFD_HUGETLB);
+ if (fd < 0) {
+ perror("memfd_create() failed");
+ exit(1);
+ }
+
+ /*
+ * Test validity of MADV_DONTNEED addr and length arguments. mmap
+ * size is NR_HUGE_PAGES + 2. One page at the beginning and end of
+ * the mapping will be unmapped so we KNOW there is nothing mapped
+ * there.
+ */
+ addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+ if (munmap(addr, huge_page_size) ||
+ munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
+ huge_page_size)) {
+ perror("munmap");
+ exit(1);
+ }
+ addr = addr + huge_page_size;
+
+ write_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* addr before mapping should fail */
+ ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
+ MADV_DONTNEED);
+ if (!ret) {
+ printf("Unexpected success of madvise call with invalid addr line %d\n",
+ __LINE__);
+ exit(1);
+ }
+
+ /* addr + length after mapping should fail */
+ ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
+ MADV_DONTNEED);
+ if (!ret) {
+ printf("Unexpected success of madvise call with invalid length line %d\n",
+ __LINE__);
+ exit(1);
+ }
+
+ (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+ /*
+ * Test alignment of MADV_DONTNEED addr and length arguments
+ */
+ addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+ write_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* addr is not huge page size aligned and should fail */
+ ret = madvise(addr + base_page_size,
+ NR_HUGE_PAGES * huge_page_size - base_page_size,
+ MADV_DONTNEED);
+ if (!ret) {
+ printf("Unexpected success of madvise call with unaligned start address %d\n",
+ __LINE__);
+ exit(1);
+ }
+
+ /* addr + length should be aligned down to huge page size */
+ if (madvise(addr,
+ ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
+ MADV_DONTNEED)) {
+ perror("madvise");
+ exit(1);
+ }
+
+ /* should free all but last page in mapping */
+ validate_free_pages(free_hugepages - 1);
+
+ (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+ validate_free_pages(free_hugepages);
+
+ /*
+ * Test MADV_DONTNEED on anonymous private mapping
+ */
+ addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+ write_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ exit(1);
+ }
+
+ /* should free all pages in mapping */
+ validate_free_pages(free_hugepages);
+
+ (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+ /*
+ * Test MADV_DONTNEED on private mapping of hugetlb file
+ */
+ if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+ perror("fallocate");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE, fd, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ /* read should not consume any pages */
+ read_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* madvise should not free any pages */
+ if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* writes should allocate private pages */
+ write_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+ /* madvise should free private pages */
+ if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* writes should allocate private pages */
+ write_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+ /*
+ * The fallocate below certainly should free the pages associated
+ * with the file. However, pages in the private mapping are also
+ * freed. This is not the 'correct' behavior, but is expected
+ * because this is how it has worked since the initial hugetlb
+ * implementation.
+ */
+ if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ 0, NR_HUGE_PAGES * huge_page_size)) {
+ perror("fallocate");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages);
+
+ (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+ /*
+ * Test MADV_DONTNEED on shared mapping of hugetlb file
+ */
+ if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+ perror("fallocate");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ /* write should not consume any pages */
+ write_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* madvise should not free any pages */
+ if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /*
+ * Test MADV_REMOVE on shared mapping of hugetlb file
+ *
+ * madvise is same as hole punch and should free all pages.
+ */
+ if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+ perror("madvise");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages);
+ (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+ /*
+ * Test MADV_REMOVE on shared and private mapping of hugetlb file
+ */
+ if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+ perror("fallocate");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ /* shared write should not consume any additional pages */
+ write_fault_pages(addr, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE, fd, 0);
+ if (addr2 == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ /* private read should not consume any pages */
+ read_fault_pages(addr2, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* private write should consume additional pages */
+ write_fault_pages(addr2, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+ /* madvise of shared mapping should not free any pages */
+ if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+ /* madvise of private mapping should free private pages */
+ if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+ perror("madvise");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+ /* private write should consume additional pages again */
+ write_fault_pages(addr2, NR_HUGE_PAGES);
+ validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+ /*
+ * madvise should free both file and private pages although this is
+ * not correct. private pages should not be freed, but this is
+ * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call.
+ */
+ if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+ perror("madvise");
+ exit(1);
+ }
+ validate_free_pages(free_hugepages);
+
+ (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+ (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
+
+ close(fd);
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
new file mode 100644
index 000000000000..ba6cc6f9cabc
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <linux/magic.h>
+#include <sys/mman.h>
+#include <sys/statfs.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#define PREFIX " ... "
+#define ERROR_PREFIX " !!! "
+
+#define MAX_WRITE_READ_CHUNK_SIZE (getpagesize() * 16)
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+enum test_status {
+ TEST_PASSED = 0,
+ TEST_FAILED = 1,
+ TEST_SKIPPED = 2,
+};
+
+static char *status_to_str(enum test_status status)
+{
+ switch (status) {
+ case TEST_PASSED:
+ return "TEST_PASSED";
+ case TEST_FAILED:
+ return "TEST_FAILED";
+ case TEST_SKIPPED:
+ return "TEST_SKIPPED";
+ default:
+ return "TEST_???";
+ }
+}
+
+static int setup_filemap(char *filemap, size_t len, size_t wr_chunk_size)
+{
+ char iter = 0;
+
+ for (size_t offset = 0; offset < len;
+ offset += wr_chunk_size) {
+ iter++;
+ memset(filemap + offset, iter, wr_chunk_size);
+ }
+
+ return 0;
+}
+
+static bool verify_chunk(char *buf, size_t len, char val)
+{
+ size_t i;
+
+ for (i = 0; i < len; ++i) {
+ if (buf[i] != val) {
+ printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n",
+ i, buf[i], val);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size,
+ off_t offset, size_t expected)
+{
+ char buf[MAX_WRITE_READ_CHUNK_SIZE];
+ ssize_t ret_count = 0;
+ ssize_t total_ret_count = 0;
+ char val = offset / wr_chunk_size + offset % wr_chunk_size;
+
+ printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset);
+ printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+ expected);
+ if (lseek(fd, offset, SEEK_SET) < 0) {
+ perror(PREFIX ERROR_PREFIX "seek failed");
+ return false;
+ }
+
+ while (offset + total_ret_count < len) {
+ ret_count = read(fd, buf, wr_chunk_size);
+ if (ret_count == 0) {
+ printf(PREFIX PREFIX "read reach end of the file\n");
+ break;
+ } else if (ret_count < 0) {
+ perror(PREFIX ERROR_PREFIX "read failed");
+ break;
+ }
+ ++val;
+ if (!verify_chunk(buf, ret_count, val))
+ return false;
+
+ total_ret_count += ret_count;
+ }
+ printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+ total_ret_count);
+
+ return total_ret_count == expected;
+}
+
+static bool read_hugepage_filemap(int fd, size_t len,
+ size_t wr_chunk_size, size_t expected)
+{
+ char buf[MAX_WRITE_READ_CHUNK_SIZE];
+ ssize_t ret_count = 0;
+ ssize_t total_ret_count = 0;
+ char val = 0;
+
+ printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+ expected);
+ while (total_ret_count < len) {
+ ret_count = read(fd, buf, wr_chunk_size);
+ if (ret_count == 0) {
+ printf(PREFIX PREFIX "read reach end of the file\n");
+ break;
+ } else if (ret_count < 0) {
+ perror(PREFIX ERROR_PREFIX "read failed");
+ break;
+ }
+ ++val;
+ if (!verify_chunk(buf, ret_count, val))
+ return false;
+
+ total_ret_count += ret_count;
+ }
+ printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+ total_ret_count);
+
+ return total_ret_count == expected;
+}
+
+static enum test_status
+test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size)
+{
+ enum test_status status = TEST_SKIPPED;
+ char *filemap = NULL;
+
+ if (ftruncate(fd, len) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate failed");
+ return status;
+ }
+
+ filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (filemap == MAP_FAILED) {
+ perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+ goto done;
+ }
+
+ setup_filemap(filemap, len, wr_chunk_size);
+ status = TEST_FAILED;
+
+ if (read_hugepage_filemap(fd, len, wr_chunk_size, len))
+ status = TEST_PASSED;
+
+ munmap(filemap, len);
+done:
+ if (ftruncate(fd, 0) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+ status = TEST_FAILED;
+ }
+
+ return status;
+}
+
+static enum test_status
+test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size,
+ bool skip_hwpoison_page)
+{
+ enum test_status status = TEST_SKIPPED;
+ char *filemap = NULL;
+ char *hwp_addr = NULL;
+ const unsigned long pagesize = getpagesize();
+
+ if (ftruncate(fd, len) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate failed");
+ return status;
+ }
+
+ filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (filemap == MAP_FAILED) {
+ perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+ goto done;
+ }
+
+ setup_filemap(filemap, len, wr_chunk_size);
+ status = TEST_FAILED;
+
+ /*
+ * Poisoned hugetlb page layout (assume hugepagesize=2MB):
+ * |<---------------------- 1MB ---------------------->|
+ * |<---- healthy page ---->|<---- HWPOISON page ----->|
+ * |<------------------- (1MB - 8KB) ----------------->|
+ */
+ hwp_addr = filemap + len / 2 + pagesize;
+ if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) {
+ perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed");
+ goto unmap;
+ }
+
+ if (!skip_hwpoison_page) {
+ /*
+ * Userspace should be able to read (1MB + 1 page) from
+ * the beginning of the HWPOISONed hugepage.
+ */
+ if (read_hugepage_filemap(fd, len, wr_chunk_size,
+ len / 2 + pagesize))
+ status = TEST_PASSED;
+ } else {
+ /*
+ * Userspace should be able to read (1MB - 2 pages) from
+ * HWPOISONed hugepage.
+ */
+ if (seek_read_hugepage_filemap(fd, len, wr_chunk_size,
+ len / 2 + MAX(2 * pagesize, wr_chunk_size),
+ len / 2 - MAX(2 * pagesize, wr_chunk_size)))
+ status = TEST_PASSED;
+ }
+
+unmap:
+ munmap(filemap, len);
+done:
+ if (ftruncate(fd, 0) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+ status = TEST_FAILED;
+ }
+
+ return status;
+}
+
+static int create_hugetlbfs_file(struct statfs *file_stat)
+{
+ int fd;
+
+ fd = memfd_create("hugetlb_tmp", MFD_HUGETLB);
+ if (fd < 0) {
+ perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file");
+ return -1;
+ }
+
+ memset(file_stat, 0, sizeof(*file_stat));
+ if (fstatfs(fd, file_stat)) {
+ perror(PREFIX ERROR_PREFIX "fstatfs failed");
+ goto close;
+ }
+ if (file_stat->f_type != HUGETLBFS_MAGIC) {
+ printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n");
+ goto close;
+ }
+
+ return fd;
+close:
+ close(fd);
+ return -1;
+}
+
+int main(void)
+{
+ int fd;
+ struct statfs file_stat;
+ enum test_status status;
+ /* Test read() in different granularity. */
+ size_t wr_chunk_sizes[] = {
+ getpagesize() / 2, getpagesize(),
+ getpagesize() * 2, getpagesize() * 4
+ };
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) {
+ printf("Write/read chunk size=0x%lx\n",
+ wr_chunk_sizes[i]);
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB read regression test...\n");
+ status = test_hugetlb_read(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i]);
+ printf(PREFIX "HugeTLB read regression test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB read HWPOISON test...\n");
+ status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i], false);
+ printf(PREFIX "HugeTLB read HWPOISON test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB seek then read HWPOISON test...\n");
+ status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i], true);
+ printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+ }
+
+ return 0;
+
+create_failure:
+ printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n");
+ return -1;
+}
diff --git a/tools/testing/selftests/mm/hugetlb-soft-offline.c b/tools/testing/selftests/mm/hugetlb-soft-offline.c
new file mode 100644
index 000000000000..f086f0e04756
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb-soft-offline.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test soft offline behavior for HugeTLB pages:
+ * - if enable_soft_offline = 0, hugepages should stay intact and soft
+ * offlining failed with EOPNOTSUPP.
+ * - if enable_soft_offline = 1, a hugepage should be dissolved and
+ * nr_hugepages/free_hugepages should be reduced by 1.
+ *
+ * Before running, make sure more than 2 hugepages of default_hugepagesz
+ * are allocated. For example, if /proc/meminfo/Hugepagesize is 2048kB:
+ * echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/magic.h>
+#include <linux/memfd.h>
+#include <sys/mman.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+
+#include "../kselftest.h"
+
+#ifndef MADV_SOFT_OFFLINE
+#define MADV_SOFT_OFFLINE 101
+#endif
+
+#define EPREFIX " !!! "
+
+static int do_soft_offline(int fd, size_t len, int expect_errno)
+{
+ char *filemap = NULL;
+ char *hwp_addr = NULL;
+ const unsigned long pagesize = getpagesize();
+ int ret = 0;
+
+ if (ftruncate(fd, len) < 0) {
+ ksft_perror(EPREFIX "ftruncate to len failed");
+ return -1;
+ }
+
+ filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (filemap == MAP_FAILED) {
+ ksft_perror(EPREFIX "mmap failed");
+ ret = -1;
+ goto untruncate;
+ }
+
+ memset(filemap, 0xab, len);
+ ksft_print_msg("Allocated %#lx bytes of hugetlb pages\n", len);
+
+ hwp_addr = filemap + len / 2;
+ ret = madvise(hwp_addr, pagesize, MADV_SOFT_OFFLINE);
+ ksft_print_msg("MADV_SOFT_OFFLINE %p ret=%d, errno=%d\n",
+ hwp_addr, ret, errno);
+ if (ret != 0)
+ ksft_perror(EPREFIX "madvise failed");
+
+ if (errno == expect_errno)
+ ret = 0;
+ else {
+ ksft_print_msg("MADV_SOFT_OFFLINE should ret %d\n",
+ expect_errno);
+ ret = -1;
+ }
+
+ munmap(filemap, len);
+untruncate:
+ if (ftruncate(fd, 0) < 0)
+ ksft_perror(EPREFIX "ftruncate back to 0 failed");
+
+ return ret;
+}
+
+static int set_enable_soft_offline(int value)
+{
+ char cmd[256] = {0};
+ FILE *cmdfile = NULL;
+
+ if (value != 0 && value != 1)
+ return -EINVAL;
+
+ sprintf(cmd, "echo %d > /proc/sys/vm/enable_soft_offline", value);
+ cmdfile = popen(cmd, "r");
+
+ if (cmdfile)
+ ksft_print_msg("enable_soft_offline => %d\n", value);
+ else {
+ ksft_perror(EPREFIX "failed to set enable_soft_offline");
+ return errno;
+ }
+
+ pclose(cmdfile);
+ return 0;
+}
+
+static int read_nr_hugepages(unsigned long hugepage_size,
+ unsigned long *nr_hugepages)
+{
+ char buffer[256] = {0};
+ char cmd[256] = {0};
+
+ sprintf(cmd, "cat /sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages",
+ hugepage_size);
+ FILE *cmdfile = popen(cmd, "r");
+
+ if (cmdfile == NULL) {
+ ksft_perror(EPREFIX "failed to popen nr_hugepages");
+ return -1;
+ }
+
+ if (!fgets(buffer, sizeof(buffer), cmdfile)) {
+ ksft_perror(EPREFIX "failed to read nr_hugepages");
+ pclose(cmdfile);
+ return -1;
+ }
+
+ *nr_hugepages = atoll(buffer);
+ pclose(cmdfile);
+ return 0;
+}
+
+static int create_hugetlbfs_file(struct statfs *file_stat)
+{
+ int fd;
+
+ fd = memfd_create("hugetlb_tmp", MFD_HUGETLB);
+ if (fd < 0) {
+ ksft_perror(EPREFIX "could not open hugetlbfs file");
+ return -1;
+ }
+
+ memset(file_stat, 0, sizeof(*file_stat));
+ if (fstatfs(fd, file_stat)) {
+ ksft_perror(EPREFIX "fstatfs failed");
+ goto close;
+ }
+ if (file_stat->f_type != HUGETLBFS_MAGIC) {
+ ksft_print_msg(EPREFIX "not hugetlbfs file\n");
+ goto close;
+ }
+
+ return fd;
+close:
+ close(fd);
+ return -1;
+}
+
+static void test_soft_offline_common(int enable_soft_offline)
+{
+ int fd;
+ int expect_errno = enable_soft_offline ? 0 : EOPNOTSUPP;
+ struct statfs file_stat;
+ unsigned long hugepagesize_kb = 0;
+ unsigned long nr_hugepages_before = 0;
+ unsigned long nr_hugepages_after = 0;
+ int ret;
+
+ ksft_print_msg("Test soft-offline when enabled_soft_offline=%d\n",
+ enable_soft_offline);
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ ksft_exit_fail_msg("Failed to create hugetlbfs file\n");
+
+ hugepagesize_kb = file_stat.f_bsize / 1024;
+ ksft_print_msg("Hugepagesize is %ldkB\n", hugepagesize_kb);
+
+ if (set_enable_soft_offline(enable_soft_offline) != 0) {
+ close(fd);
+ ksft_exit_fail_msg("Failed to set enable_soft_offline\n");
+ }
+
+ if (read_nr_hugepages(hugepagesize_kb, &nr_hugepages_before) != 0) {
+ close(fd);
+ ksft_exit_fail_msg("Failed to read nr_hugepages\n");
+ }
+
+ ksft_print_msg("Before MADV_SOFT_OFFLINE nr_hugepages=%ld\n",
+ nr_hugepages_before);
+
+ ret = do_soft_offline(fd, 2 * file_stat.f_bsize, expect_errno);
+
+ if (read_nr_hugepages(hugepagesize_kb, &nr_hugepages_after) != 0) {
+ close(fd);
+ ksft_exit_fail_msg("Failed to read nr_hugepages\n");
+ }
+
+ ksft_print_msg("After MADV_SOFT_OFFLINE nr_hugepages=%ld\n",
+ nr_hugepages_after);
+
+ // No need for the hugetlbfs file from now on.
+ close(fd);
+
+ if (enable_soft_offline) {
+ if (nr_hugepages_before != nr_hugepages_after + 1) {
+ ksft_test_result_fail("MADV_SOFT_OFFLINE should reduced 1 hugepage\n");
+ return;
+ }
+ } else {
+ if (nr_hugepages_before != nr_hugepages_after) {
+ ksft_test_result_fail("MADV_SOFT_OFFLINE reduced %lu hugepages\n",
+ nr_hugepages_before - nr_hugepages_after);
+ return;
+ }
+ }
+
+ ksft_test_result(ret == 0,
+ "Test soft-offline when enabled_soft_offline=%d\n",
+ enable_soft_offline);
+}
+
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(2);
+
+ test_soft_offline_common(1);
+ test_soft_offline_common(0);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
new file mode 100644
index 000000000000..db63abe5ee5e
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program tests for hugepage leaks after DIO writes to a file using a
+ * hugepage as the user buffer. During DIO, the user buffer is pinned and
+ * should be properly unpinned upon completion. This patch verifies that the
+ * kernel correctly unpins the buffer at DIO completion for both aligned and
+ * unaligned user buffer offsets (w.r.t page boundary), ensuring the hugepage
+ * is freed upon unmapping.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
+{
+ int fd;
+ char *buffer = NULL;
+ char *orig_buffer = NULL;
+ size_t h_pagesize = 0;
+ size_t writesize;
+ int free_hpage_b = 0;
+ int free_hpage_a = 0;
+ const int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+ const int mmap_prot = PROT_READ | PROT_WRITE;
+
+ writesize = end_off - start_off;
+
+ /* Get the default huge page size */
+ h_pagesize = default_huge_page_size();
+ if (!h_pagesize)
+ ksft_exit_fail_msg("Unable to determine huge page size\n");
+
+ /* Open the file to DIO */
+ fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
+ if (fd < 0)
+ ksft_exit_fail_perror("Error opening file\n");
+
+ /* Get the free huge pages before allocation */
+ free_hpage_b = get_free_hugepages();
+ if (free_hpage_b == 0) {
+ close(fd);
+ ksft_exit_skip("No free hugepage, exiting!\n");
+ }
+
+ /* Allocate a hugetlb page */
+ orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
+ if (orig_buffer == MAP_FAILED) {
+ close(fd);
+ ksft_exit_fail_perror("Error mapping memory\n");
+ }
+ buffer = orig_buffer;
+ buffer += start_off;
+
+ memset(buffer, 'A', writesize);
+
+ /* Write the buffer to the file */
+ if (write(fd, buffer, writesize) != (writesize)) {
+ munmap(orig_buffer, h_pagesize);
+ close(fd);
+ ksft_exit_fail_perror("Error writing to file\n");
+ }
+
+ /* unmap the huge page */
+ munmap(orig_buffer, h_pagesize);
+ close(fd);
+
+ /* Get the free huge pages after unmap*/
+ free_hpage_a = get_free_hugepages();
+
+ ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b);
+ ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a);
+
+ /*
+ * If the no. of free hugepages before allocation and after unmap does
+ * not match - that means there could still be a page which is pinned.
+ */
+ ksft_test_result(free_hpage_a == free_hpage_b,
+ "free huge pages from %u-%u\n", start_off, end_off);
+}
+
+int main(void)
+{
+ size_t pagesize = 0;
+ int fd;
+
+ ksft_print_header();
+
+ /* Open the file to DIO */
+ fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
+ if (fd < 0)
+ ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno));
+ close(fd);
+
+ /* Check if huge pages are free */
+ if (!get_free_hugepages())
+ ksft_exit_skip("No free hugepage, exiting\n");
+
+ ksft_set_plan(4);
+
+ /* Get base page size */
+ pagesize = psize();
+
+ /* start and end is aligned to pagesize */
+ run_dio_using_hugetlb(0, (pagesize * 3));
+
+ /* start is aligned but end is not aligned */
+ run_dio_using_hugetlb(0, (pagesize * 3) - (pagesize / 2));
+
+ /* start is unaligned and end is aligned */
+ run_dio_using_hugetlb(pagesize / 2, (pagesize * 3));
+
+ /* both start and end are unaligned */
+ run_dio_using_hugetlb(pagesize / 2, (pagesize * 3) + (pagesize / 2));
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
new file mode 100644
index 000000000000..e2640529dbb2
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <setjmp.h>
+#include <signal.h>
+
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define INLOOP_ITER 100
+
+static char *huge_ptr;
+static size_t huge_page_size;
+
+static sigjmp_buf sigbuf;
+static bool sigbus_triggered;
+
+static void signal_handler(int signal)
+{
+ if (signal == SIGBUS) {
+ sigbus_triggered = true;
+ siglongjmp(sigbuf, 1);
+ }
+}
+
+/* Touch the memory while it is being madvised() */
+void *touch(void *unused)
+{
+ char *ptr = (char *)huge_ptr;
+
+ if (sigsetjmp(sigbuf, 1))
+ return NULL;
+
+ for (int i = 0; i < INLOOP_ITER; i++)
+ ptr[0] = '.';
+
+ return NULL;
+}
+
+void *madv(void *unused)
+{
+ usleep(rand() % 10);
+
+ for (int i = 0; i < INLOOP_ITER; i++)
+ madvise(huge_ptr, huge_page_size, MADV_DONTNEED);
+
+ return NULL;
+}
+
+int main(void)
+{
+ unsigned long free_hugepages;
+ pthread_t thread1, thread2;
+ /*
+ * On kernel 6.4, we are able to reproduce the problem with ~1000
+ * interactions
+ */
+ int max = 10000;
+ int err;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ srand(getpid());
+
+ if (signal(SIGBUS, signal_handler) == SIG_ERR)
+ ksft_exit_skip("Could not register signal handler.");
+
+ huge_page_size = default_huge_page_size();
+ if (!huge_page_size)
+ ksft_exit_skip("Could not detect default hugetlb page size.");
+
+ ksft_print_msg("[INFO] detected default hugetlb page size: %zu KiB\n",
+ huge_page_size / 1024);
+
+ free_hugepages = get_free_hugepages();
+ if (free_hugepages != 1) {
+ ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
+ free_hugepages);
+ }
+
+ while (max--) {
+ huge_ptr = mmap(NULL, huge_page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+
+ if ((unsigned long)huge_ptr == -1)
+ ksft_exit_skip("Failed to allocated huge page\n");
+
+ pthread_create(&thread1, NULL, madv, NULL);
+ pthread_create(&thread2, NULL, touch, NULL);
+
+ pthread_join(thread1, NULL);
+ pthread_join(thread2, NULL);
+ munmap(huge_ptr, huge_page_size);
+ }
+
+ ksft_test_result(!sigbus_triggered, "SIGBUS behavior\n");
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
new file mode 100644
index 000000000000..8f122a0f0828
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case that must run on a system with one and only one huge page available.
+ * # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ *
+ * During setup, the test allocates the only available page, and starts three threads:
+ * - thread1:
+ * * madvise(MADV_DONTNEED) on the allocated huge page
+ * - thread 2:
+ * * Write to the allocated huge page
+ * - thread 3:
+ * * Try to allocated an extra huge page (which must not available)
+ *
+ * The test fails if thread3 is able to allocate a page.
+ *
+ * Touching the first page after thread3's allocation will raise a SIGBUS
+ *
+ * Author: Breno Leitao <leitao@debian.org>
+ */
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define INLOOP_ITER 100
+
+size_t mmap_size;
+char *huge_ptr;
+
+/* Touch the memory while it is being madvised() */
+void *touch(void *unused)
+{
+ for (int i = 0; i < INLOOP_ITER; i++)
+ huge_ptr[0] = '.';
+
+ return NULL;
+}
+
+void *madv(void *unused)
+{
+ for (int i = 0; i < INLOOP_ITER; i++)
+ madvise(huge_ptr, mmap_size, MADV_DONTNEED);
+
+ return NULL;
+}
+
+/*
+ * We got here, and there must be no huge page available for mapping
+ * The other hugepage should be flipping from used <-> reserved, because
+ * of madvise(DONTNEED).
+ */
+void *map_extra(void *unused)
+{
+ void *ptr;
+
+ for (int i = 0; i < INLOOP_ITER; i++) {
+ ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+
+ if ((long)ptr != -1) {
+ /* Touching the other page now will cause a SIGBUG
+ * huge_ptr[0] = '1';
+ */
+ return ptr;
+ }
+ }
+
+ return NULL;
+}
+
+int main(void)
+{
+ pthread_t thread1, thread2, thread3;
+ unsigned long free_hugepages;
+ void *ret;
+
+ /*
+ * On kernel 6.7, we are able to reproduce the problem with ~10
+ * interactions
+ */
+ int max = 10;
+
+ free_hugepages = get_free_hugepages();
+
+ if (free_hugepages != 1) {
+ ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n",
+ free_hugepages);
+ }
+
+ mmap_size = default_huge_page_size();
+
+ while (max--) {
+ huge_ptr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ -1, 0);
+
+ if ((unsigned long)huge_ptr == -1) {
+ ksft_test_result_fail("Failed to allocate huge page\n");
+ return KSFT_FAIL;
+ }
+
+ pthread_create(&thread1, NULL, madv, NULL);
+ pthread_create(&thread2, NULL, touch, NULL);
+ pthread_create(&thread3, NULL, map_extra, NULL);
+
+ pthread_join(thread1, NULL);
+ pthread_join(thread2, NULL);
+ pthread_join(thread3, &ret);
+
+ if (ret) {
+ ksft_test_result_fail("Unexpected huge page allocation\n");
+ return KSFT_FAIL;
+ }
+
+ /* Unmap and restart */
+ munmap(huge_ptr, mmap_size);
+ }
+
+ return KSFT_PASS;
+}
diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
new file mode 100755
index 000000000000..0dd31892ff67
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -0,0 +1,243 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+ echo "This test must be run as root. Skipping..."
+ exit $ksft_skip
+fi
+
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+usage_file=usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+ cgroup2=1
+ usage_file=current
+fi
+
+
+if [[ $cgroup2 ]]; then
+ CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk '{print $3}')
+ if [[ -z "$CGROUP_ROOT" ]]; then
+ CGROUP_ROOT=$(mktemp -d)
+ mount -t cgroup2 none $CGROUP_ROOT
+ do_umount=1
+ fi
+ echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
+else
+ CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
+ if [[ -z "$CGROUP_ROOT" ]]; then
+ CGROUP_ROOT=/dev/cgroup/memory
+ mount -t cgroup memory,hugetlb $CGROUP_ROOT
+ do_umount=1
+ fi
+fi
+MNT='/mnt/huge'
+
+function get_machine_hugepage_size() {
+ hpz=$(grep -i hugepagesize /proc/meminfo)
+ kb=${hpz:14:-3}
+ mb=$(($kb / 1024))
+ echo $mb
+}
+
+MB=$(get_machine_hugepage_size)
+
+function cleanup() {
+ echo cleanup
+ set +e
+ rm -rf "$MNT"/* 2>/dev/null
+ umount "$MNT" 2>/dev/null
+ rmdir "$MNT" 2>/dev/null
+ rmdir "$CGROUP_ROOT"/a/b 2>/dev/null
+ rmdir "$CGROUP_ROOT"/a 2>/dev/null
+ rmdir "$CGROUP_ROOT"/test1 2>/dev/null
+ echo $nr_hugepgs >/proc/sys/vm/nr_hugepages
+ set -e
+}
+
+function assert_with_retry() {
+ local actual_path="$1"
+ local expected="$2"
+ local tolerance=$((7 * 1024 * 1024))
+ local timeout=20
+ local interval=1
+ local start_time
+ local now
+ local elapsed
+ local actual
+
+ start_time=$(date +%s)
+
+ while true; do
+ actual="$(cat "$actual_path")"
+
+ if [[ $actual -ge $(($expected - $tolerance)) ]] &&
+ [[ $actual -le $(($expected + $tolerance)) ]]; then
+ return 0
+ fi
+
+ now=$(date +%s)
+ elapsed=$((now - start_time))
+
+ if [[ $elapsed -ge $timeout ]]; then
+ echo "actual = $((${actual%% *} / 1024 / 1024)) MB"
+ echo "expected = $((${expected%% *} / 1024 / 1024)) MB"
+ cleanup
+ exit 1
+ fi
+
+ sleep $interval
+ done
+}
+
+function assert_state() {
+ local expected_a="$1"
+ local expected_a_hugetlb="$2"
+ local expected_b=""
+ local expected_b_hugetlb=""
+
+ if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then
+ expected_b="$3"
+ expected_b_hugetlb="$4"
+ fi
+
+ assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a"
+ assert_with_retry "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb"
+
+ if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then
+ assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b"
+ assert_with_retry "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb"
+ fi
+}
+
+function setup() {
+ echo 100 >/proc/sys/vm/nr_hugepages
+ mkdir "$CGROUP_ROOT"/a
+ sleep 1
+ if [[ $cgroup2 ]]; then
+ echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control
+ else
+ echo 0 >$CGROUP_ROOT/a/cpuset.mems
+ echo 0 >$CGROUP_ROOT/a/cpuset.cpus
+ fi
+
+ mkdir "$CGROUP_ROOT"/a/b
+
+ if [[ ! $cgroup2 ]]; then
+ echo 0 >$CGROUP_ROOT/a/b/cpuset.mems
+ echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus
+ fi
+
+ mkdir -p "$MNT"
+ mount -t hugetlbfs none "$MNT"
+}
+
+write_hugetlbfs() {
+ local cgroup="$1"
+ local path="$2"
+ local size="$3"
+
+ if [[ $cgroup2 ]]; then
+ echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs
+ else
+ echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems
+ echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus
+ echo $$ >"$CGROUP_ROOT/$cgroup/tasks"
+ fi
+ ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o
+ if [[ $cgroup2 ]]; then
+ echo $$ >$CGROUP_ROOT/cgroup.procs
+ else
+ echo $$ >"$CGROUP_ROOT/tasks"
+ fi
+ echo
+}
+
+set -e
+
+size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
+
+cleanup
+
+echo
+echo Test charge, rmdir, uncharge
+setup
+echo mkdir
+mkdir $CGROUP_ROOT/test1
+
+echo write
+write_hugetlbfs test1 "$MNT"/test $size
+
+echo rmdir
+rmdir $CGROUP_ROOT/test1
+mkdir $CGROUP_ROOT/test1
+
+echo uncharge
+rm -rf /mnt/huge/*
+
+cleanup
+
+echo done
+echo
+if [[ ! $cgroup2 ]]; then
+ echo "Test parent and child hugetlb usage"
+ setup
+
+ echo write
+ write_hugetlbfs a "$MNT"/test $size
+
+ echo Assert memory charged correctly for parent use.
+ assert_state 0 $size 0 0
+
+ write_hugetlbfs a/b "$MNT"/test2 $size
+
+ echo Assert memory charged correctly for child use.
+ assert_state 0 $(($size * 2)) 0 $size
+
+ rmdir "$CGROUP_ROOT"/a/b
+ echo Assert memory reparent correctly.
+ assert_state 0 $(($size * 2))
+
+ rm -rf "$MNT"/*
+ umount "$MNT"
+ echo Assert memory uncharged correctly.
+ assert_state 0 0
+
+ cleanup
+fi
+
+echo
+echo "Test child only hugetlb usage"
+echo setup
+setup
+
+echo write
+write_hugetlbfs a/b "$MNT"/test2 $size
+
+echo Assert memory charged correctly for child only use.
+assert_state 0 $(($size)) 0 $size
+
+rmdir "$CGROUP_ROOT"/a/b
+echo Assert memory reparent correctly.
+assert_state 0 $size
+
+rm -rf "$MNT"/*
+umount "$MNT"
+echo Assert memory uncharged correctly.
+assert_state 0 0
+
+cleanup
+
+echo ALL PASS
+
+if [[ $do_umount ]]; then
+ umount $CGROUP_ROOT
+ rm -rf $CGROUP_ROOT
+fi
+
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
new file mode 100644
index 000000000000..8a4d34cce36b
--- /dev/null
+++ b/tools/testing/selftests/mm/khugepaged.c
@@ -0,0 +1,1287 @@
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <dirent.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+
+#include "linux/magic.h"
+
+#include "vm_util.h"
+#include "thp_settings.h"
+
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+static int anon_order;
+
+#define PID_SMAPS "/proc/self/smaps"
+#define TEST_FILE "collapse_test_file"
+
+#define MAX_LINE_LENGTH 500
+
+enum vma_type {
+ VMA_ANON,
+ VMA_FILE,
+ VMA_SHMEM,
+};
+
+struct mem_ops {
+ void *(*setup_area)(int nr_hpages);
+ void (*cleanup_area)(void *p, unsigned long size);
+ void (*fault)(void *p, unsigned long start, unsigned long end);
+ bool (*check_huge)(void *addr, int nr_hpages);
+ const char *name;
+};
+
+static struct mem_ops *file_ops;
+static struct mem_ops *anon_ops;
+static struct mem_ops *shmem_ops;
+
+struct collapse_context {
+ void (*collapse)(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect);
+ bool enforce_pte_scan_limits;
+ const char *name;
+};
+
+static struct collapse_context *khugepaged_context;
+static struct collapse_context *madvise_context;
+
+struct file_info {
+ const char *dir;
+ char path[PATH_MAX];
+ enum vma_type type;
+ int fd;
+ char dev_queue_read_ahead_path[PATH_MAX];
+};
+
+static struct file_info finfo;
+static bool skip_settings_restore;
+static int exit_status;
+
+static void success(const char *msg)
+{
+ printf(" \e[32m%s\e[0m\n", msg);
+}
+
+static void fail(const char *msg)
+{
+ printf(" \e[31m%s\e[0m\n", msg);
+ exit_status++;
+}
+
+static void skip(const char *msg)
+{
+ printf(" \e[33m%s\e[0m\n", msg);
+}
+
+static void restore_settings_atexit(void)
+{
+ if (skip_settings_restore)
+ return;
+
+ printf("Restore THP and khugepaged settings...");
+ thp_restore_settings();
+ success("OK");
+
+ skip_settings_restore = true;
+}
+
+static void restore_settings(int sig)
+{
+ /* exit() will invoke the restore_settings_atexit handler. */
+ exit(sig ? EXIT_FAILURE : exit_status);
+}
+
+static void save_settings(void)
+{
+ printf("Save THP and khugepaged settings...");
+ if (file_ops && finfo.type == VMA_FILE)
+ thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
+ thp_save_settings();
+
+ success("OK");
+
+ atexit(restore_settings_atexit);
+ signal(SIGTERM, restore_settings);
+ signal(SIGINT, restore_settings);
+ signal(SIGHUP, restore_settings);
+ signal(SIGQUIT, restore_settings);
+}
+
+static void get_finfo(const char *dir)
+{
+ struct stat path_stat;
+ struct statfs fs;
+ char buf[1 << 10];
+ char path[PATH_MAX];
+ char *str, *end;
+
+ finfo.dir = dir;
+ stat(finfo.dir, &path_stat);
+ if (!S_ISDIR(path_stat.st_mode)) {
+ printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
+ exit(EXIT_FAILURE);
+ }
+ if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
+ finfo.dir) >= sizeof(finfo.path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ if (statfs(finfo.dir, &fs)) {
+ perror("statfs()");
+ exit(EXIT_FAILURE);
+ }
+ finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
+ if (finfo.type == VMA_SHMEM)
+ return;
+
+ /* Find owning device's queue/read_ahead_kb control */
+ if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
+ major(path_stat.st_dev), minor(path_stat.st_dev))
+ >= sizeof(path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ if (read_file(path, buf, sizeof(buf)) < 0) {
+ perror("read_file(read_num)");
+ exit(EXIT_FAILURE);
+ }
+ if (strstr(buf, "DEVTYPE=disk")) {
+ /* Found it */
+ if (snprintf(finfo.dev_queue_read_ahead_path,
+ sizeof(finfo.dev_queue_read_ahead_path),
+ "/sys/dev/block/%d:%d/queue/read_ahead_kb",
+ major(path_stat.st_dev), minor(path_stat.st_dev))
+ >= sizeof(finfo.dev_queue_read_ahead_path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ return;
+ }
+ if (!strstr(buf, "DEVTYPE=partition")) {
+ printf("%s: Unknown device type: %s\n", __func__, path);
+ exit(EXIT_FAILURE);
+ }
+ /*
+ * Partition of block device - need to find actual device.
+ * Using naming convention that devnameN is partition of
+ * device devname.
+ */
+ str = strstr(buf, "DEVNAME=");
+ if (!str) {
+ printf("%s: Could not read: %s", __func__, path);
+ exit(EXIT_FAILURE);
+ }
+ str += 8;
+ end = str;
+ while (*end) {
+ if (isdigit(*end)) {
+ *end = '\0';
+ if (snprintf(finfo.dev_queue_read_ahead_path,
+ sizeof(finfo.dev_queue_read_ahead_path),
+ "/sys/block/%s/queue/read_ahead_kb",
+ str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ return;
+ }
+ ++end;
+ }
+ printf("%s: Could not read: %s\n", __func__, path);
+ exit(EXIT_FAILURE);
+}
+
+static bool check_swap(void *addr, unsigned long size)
+{
+ bool swap = false;
+ int ret;
+ FILE *fp;
+ char buffer[MAX_LINE_LENGTH];
+ char addr_pattern[MAX_LINE_LENGTH];
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+ (unsigned long) addr);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+
+ fp = fopen(PID_SMAPS, "r");
+ if (!fp) {
+ printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+ exit(EXIT_FAILURE);
+ }
+ if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
+ goto err_out;
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
+ size >> 10);
+ if (ret >= MAX_LINE_LENGTH) {
+ printf("%s: Pattern is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ /*
+ * Fetch the Swap: in the same block and check whether it got
+ * the expected number of hugeepages next.
+ */
+ if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
+ goto err_out;
+
+ if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+ goto err_out;
+
+ swap = true;
+err_out:
+ fclose(fp);
+ return swap;
+}
+
+static void *alloc_mapping(int nr)
+{
+ void *p;
+
+ p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (p != BASE_ADDR) {
+ printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+ exit(EXIT_FAILURE);
+ }
+
+ return p;
+}
+
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+ int i;
+
+ for (i = start / page_size; i < end / page_size; i++)
+ p[i * page_size / sizeof(*p)] = i + 0xdead0000;
+}
+
+/*
+ * MADV_COLLAPSE is a best-effort request and may fail if an internal
+ * resource is temporarily unavailable, in which case it will set errno to
+ * EAGAIN. In such a case, immediately reattempt the operation one more
+ * time.
+ */
+static int madvise_collapse_retry(void *p, unsigned long size)
+{
+ bool retry = true;
+ int ret;
+
+retry:
+ ret = madvise(p, size, MADV_COLLAPSE);
+ if (ret && errno == EAGAIN && retry) {
+ retry = false;
+ goto retry;
+ }
+ return ret;
+}
+
+/*
+ * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
+ * validate_memory()'able contents.
+ */
+static void *alloc_hpage(struct mem_ops *ops)
+{
+ void *p = ops->setup_area(1);
+
+ ops->fault(p, 0, hpage_pmd_size);
+
+ /*
+ * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
+ * The latter is ineligible for collapse by MADV_COLLAPSE
+ * while the former might cause MADV_COLLAPSE to race with
+ * khugepaged on low-load system (like a test machine), which
+ * would cause MADV_COLLAPSE to fail with EAGAIN.
+ */
+ printf("Allocate huge page...");
+ if (madvise_collapse_retry(p, hpage_pmd_size)) {
+ perror("madvise(MADV_COLLAPSE)");
+ exit(EXIT_FAILURE);
+ }
+ if (!ops->check_huge(p, 1)) {
+ perror("madvise(MADV_COLLAPSE)");
+ exit(EXIT_FAILURE);
+ }
+ if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
+ perror("madvise(MADV_HUGEPAGE)");
+ exit(EXIT_FAILURE);
+ }
+ success("OK");
+ return p;
+}
+
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+ int i;
+
+ for (i = start / page_size; i < end / page_size; i++) {
+ if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
+ printf("Page %d is corrupted: %#x\n",
+ i, p[i * page_size / sizeof(*p)]);
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+static void *anon_setup_area(int nr_hpages)
+{
+ return alloc_mapping(nr_hpages);
+}
+
+static void anon_cleanup_area(void *p, unsigned long size)
+{
+ munmap(p, size);
+}
+
+static void anon_fault(void *p, unsigned long start, unsigned long end)
+{
+ fill_memory(p, start, end);
+}
+
+static bool anon_check_huge(void *addr, int nr_hpages)
+{
+ return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
+}
+
+static void *file_setup_area(int nr_hpages)
+{
+ int fd;
+ void *p;
+ unsigned long size;
+
+ unlink(finfo.path); /* Cleanup from previous failed tests */
+ printf("Creating %s for collapse%s...", finfo.path,
+ finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
+ fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
+ 777);
+ if (fd < 0) {
+ perror("open()");
+ exit(EXIT_FAILURE);
+ }
+
+ size = nr_hpages * hpage_pmd_size;
+ p = alloc_mapping(nr_hpages);
+ fill_memory(p, 0, size);
+ write(fd, p, size);
+ close(fd);
+ munmap(p, size);
+ success("OK");
+
+ printf("Opening %s read only for collapse...", finfo.path);
+ finfo.fd = open(finfo.path, O_RDONLY, 777);
+ if (finfo.fd < 0) {
+ perror("open()");
+ exit(EXIT_FAILURE);
+ }
+ p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
+ MAP_PRIVATE, finfo.fd, 0);
+ if (p == MAP_FAILED || p != BASE_ADDR) {
+ perror("mmap()");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Drop page cache */
+ write_file("/proc/sys/vm/drop_caches", "3", 2);
+ success("OK");
+ return p;
+}
+
+static void file_cleanup_area(void *p, unsigned long size)
+{
+ munmap(p, size);
+ close(finfo.fd);
+ unlink(finfo.path);
+}
+
+static void file_fault(void *p, unsigned long start, unsigned long end)
+{
+ if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
+ perror("madvise(MADV_POPULATE_READ");
+ exit(EXIT_FAILURE);
+ }
+}
+
+static bool file_check_huge(void *addr, int nr_hpages)
+{
+ switch (finfo.type) {
+ case VMA_FILE:
+ return check_huge_file(addr, nr_hpages, hpage_pmd_size);
+ case VMA_SHMEM:
+ return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
+ default:
+ exit(EXIT_FAILURE);
+ return false;
+ }
+}
+
+static void *shmem_setup_area(int nr_hpages)
+{
+ void *p;
+ unsigned long size = nr_hpages * hpage_pmd_size;
+
+ finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
+ if (finfo.fd < 0) {
+ perror("memfd_create()");
+ exit(EXIT_FAILURE);
+ }
+ if (ftruncate(finfo.fd, size)) {
+ perror("ftruncate()");
+ exit(EXIT_FAILURE);
+ }
+ p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
+ 0);
+ if (p != BASE_ADDR) {
+ perror("mmap()");
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
+
+static void shmem_cleanup_area(void *p, unsigned long size)
+{
+ munmap(p, size);
+ close(finfo.fd);
+}
+
+static bool shmem_check_huge(void *addr, int nr_hpages)
+{
+ return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
+}
+
+static struct mem_ops __anon_ops = {
+ .setup_area = &anon_setup_area,
+ .cleanup_area = &anon_cleanup_area,
+ .fault = &anon_fault,
+ .check_huge = &anon_check_huge,
+ .name = "anon",
+};
+
+static struct mem_ops __file_ops = {
+ .setup_area = &file_setup_area,
+ .cleanup_area = &file_cleanup_area,
+ .fault = &file_fault,
+ .check_huge = &file_check_huge,
+ .name = "file",
+};
+
+static struct mem_ops __shmem_ops = {
+ .setup_area = &shmem_setup_area,
+ .cleanup_area = &shmem_cleanup_area,
+ .fault = &anon_fault,
+ .check_huge = &shmem_check_huge,
+ .name = "shmem",
+};
+
+static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect)
+{
+ int ret;
+ struct thp_settings settings = *thp_current_settings();
+
+ printf("%s...", msg);
+
+ /*
+ * Prevent khugepaged interference and tests that MADV_COLLAPSE
+ * ignores /sys/kernel/mm/transparent_hugepage/enabled
+ */
+ settings.thp_enabled = THP_NEVER;
+ settings.shmem_enabled = SHMEM_NEVER;
+ thp_push_settings(&settings);
+
+ /* Clear VM_NOHUGEPAGE */
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+ ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
+ if (((bool)ret) == expect)
+ fail("Fail: Bad return value");
+ else if (!ops->check_huge(p, expect ? nr_hpages : 0))
+ fail("Fail: check_huge()");
+ else
+ success("OK");
+
+ thp_pop_settings();
+}
+
+static void madvise_collapse(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect)
+{
+ /* Sanity check */
+ if (!ops->check_huge(p, 0)) {
+ printf("Unexpected huge page\n");
+ exit(EXIT_FAILURE);
+ }
+ __madvise_collapse(msg, p, nr_hpages, ops, expect);
+}
+
+#define TICK 500000
+static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops)
+{
+ int full_scans;
+ int timeout = 6; /* 3 seconds */
+
+ /* Sanity check */
+ if (!ops->check_huge(p, 0)) {
+ printf("Unexpected huge page\n");
+ exit(EXIT_FAILURE);
+ }
+
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+
+ /* Wait until the second full_scan completed */
+ full_scans = thp_read_num("khugepaged/full_scans") + 2;
+
+ printf("%s...", msg);
+ while (timeout--) {
+ if (ops->check_huge(p, nr_hpages))
+ break;
+ if (thp_read_num("khugepaged/full_scans") >= full_scans)
+ break;
+ printf(".");
+ usleep(TICK);
+ }
+
+ madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
+
+ return timeout == -1;
+}
+
+static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
+ struct mem_ops *ops, bool expect)
+{
+ if (wait_for_scan(msg, p, nr_hpages, ops)) {
+ if (expect)
+ fail("Timeout");
+ else
+ success("OK");
+ return;
+ }
+
+ /*
+ * For file and shmem memory, khugepaged only retracts pte entries after
+ * putting the new hugepage in the page cache. The hugepage must be
+ * subsequently refaulted to install the pmd mapping for the mm.
+ */
+ if (ops != &__anon_ops)
+ ops->fault(p, 0, nr_hpages * hpage_pmd_size);
+
+ if (ops->check_huge(p, expect ? nr_hpages : 0))
+ success("OK");
+ else
+ fail("Fail");
+}
+
+static struct collapse_context __khugepaged_context = {
+ .collapse = &khugepaged_collapse,
+ .enforce_pte_scan_limits = true,
+ .name = "khugepaged",
+};
+
+static struct collapse_context __madvise_context = {
+ .collapse = &madvise_collapse,
+ .enforce_pte_scan_limits = false,
+ .name = "madvise",
+};
+
+static bool is_tmpfs(struct mem_ops *ops)
+{
+ return ops == &__file_ops && finfo.type == VMA_SHMEM;
+}
+
+static bool is_anon(struct mem_ops *ops)
+{
+ return ops == &__anon_ops;
+}
+
+static void alloc_at_fault(void)
+{
+ struct thp_settings settings = *thp_current_settings();
+ char *p;
+
+ settings.thp_enabled = THP_ALWAYS;
+ thp_push_settings(&settings);
+
+ p = alloc_mapping(1);
+ *p = 1;
+ printf("Allocate huge page on fault...");
+ if (check_huge_anon(p, 1, hpage_pmd_size))
+ success("OK");
+ else
+ fail("Fail");
+
+ thp_pop_settings();
+
+ madvise(p, page_size, MADV_DONTNEED);
+ printf("Split huge PMD on MADV_DONTNEED...");
+ if (check_huge_anon(p, 0, hpage_pmd_size))
+ success("OK");
+ else
+ fail("Fail");
+ munmap(p, hpage_pmd_size);
+}
+
+static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+ int nr_hpages = 4;
+ unsigned long size = nr_hpages * hpage_pmd_size;
+
+ p = ops->setup_area(nr_hpages);
+ ops->fault(p, 0, size);
+ c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
+ ops, true);
+ validate_memory(p, 0, size);
+ ops->cleanup_area(p, size);
+}
+
+static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, page_size);
+ c->collapse("Collapse PTE table with single PTE entry present", p,
+ 1, ops, true);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
+{
+ int max_ptes_none = hpage_pmd_nr / 2;
+ struct thp_settings settings = *thp_current_settings();
+ void *p;
+ int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;
+
+ settings.khugepaged.max_ptes_none = max_ptes_none;
+ thp_push_settings(&settings);
+
+ p = ops->setup_area(1);
+
+ if (is_tmpfs(ops)) {
+ /* shmem pages always in the page cache */
+ printf("tmpfs...");
+ skip("Skip");
+ goto skip;
+ }
+
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
+ c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
+ ops, !c->enforce_pte_scan_limits);
+ validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
+
+ if (c->enforce_pte_scan_limits) {
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+ c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
+ true);
+ validate_memory(p, 0,
+ (hpage_pmd_nr - max_ptes_none) * page_size);
+ }
+skip:
+ ops->cleanup_area(p, hpage_pmd_size);
+ thp_pop_settings();
+}
+
+static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, hpage_pmd_size);
+
+ printf("Swapout one page...");
+ if (madvise(p, page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
+ true);
+ validate_memory(p, 0, hpage_pmd_size);
+out:
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
+{
+ int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, hpage_pmd_size);
+
+ printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+ if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
+ !c->enforce_pte_scan_limits);
+ validate_memory(p, 0, hpage_pmd_size);
+
+ if (c->enforce_pte_scan_limits) {
+ ops->fault(p, 0, hpage_pmd_size);
+ printf("Swapout %d of %d pages...", max_ptes_swap,
+ hpage_pmd_nr);
+ if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+ perror("madvise(MADV_PAGEOUT)");
+ exit(EXIT_FAILURE);
+ }
+ if (check_swap(p, max_ptes_swap * page_size)) {
+ success("OK");
+ } else {
+ fail("Fail");
+ goto out;
+ }
+
+ c->collapse("Collapse with max_ptes_swap pages swapped out", p,
+ 1, ops, true);
+ validate_memory(p, 0, hpage_pmd_size);
+ }
+out:
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = alloc_hpage(ops);
+
+ if (is_tmpfs(ops)) {
+ /* MADV_DONTNEED won't evict tmpfs pages */
+ printf("tmpfs...");
+ skip("Skip");
+ goto skip;
+ }
+
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ printf("Split huge page leaving single PTE mapping compound page...");
+ madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse PTE table with single PTE mapping compound page",
+ p, 1, ops, true);
+ validate_memory(p, 0, page_size);
+skip:
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+
+ p = alloc_hpage(ops);
+ printf("Split huge page leaving single PTE page table full of compound pages...");
+ madvise(p, page_size, MADV_NOHUGEPAGE);
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
+ true);
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
+{
+ void *p;
+ int i;
+
+ p = ops->setup_area(1);
+ for (i = 0; i < hpage_pmd_nr; i++) {
+ printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+ i + 1, hpage_pmd_nr);
+
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+ ops->fault(BASE_ADDR, 0, hpage_pmd_size);
+ if (!ops->check_huge(BASE_ADDR, 1)) {
+ printf("Failed to allocate huge page\n");
+ exit(EXIT_FAILURE);
+ }
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+ p = mremap(BASE_ADDR - i * page_size,
+ i * page_size + hpage_pmd_size,
+ (i + 1) * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR + 2 * hpage_pmd_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+unmap");
+ exit(EXIT_FAILURE);
+ }
+
+ p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+ (i + 1) * page_size,
+ (i + 1) * page_size + hpage_pmd_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR - (i + 1) * page_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+alloc");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
+ ops->fault(p, 0, hpage_pmd_size);
+ if (!ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse PTE table full of different compound pages", p, 1,
+ ops, true);
+
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
+{
+ int wstatus;
+ void *p;
+
+ p = ops->setup_area(1);
+
+ printf("Allocate small page...");
+ ops->fault(p, 0, page_size);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Share small page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ ops->fault(p, page_size, 2 * page_size);
+ c->collapse("Collapse PTE table with single page shared with parent process",
+ p, 1, ops, true);
+
+ validate_memory(p, 0, page_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has small page...");
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, page_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+ int wstatus;
+ void *p;
+
+ p = alloc_hpage(ops);
+ printf("Share huge page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Split huge page PMD in child process...");
+ madvise(p, page_size, MADV_NOHUGEPAGE);
+ madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+ ops->fault(p, 0, page_size);
+
+ thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
+ c->collapse("Collapse PTE table full of compound pages in child",
+ p, 1, ops, true);
+ thp_write_num("khugepaged/max_ptes_shared",
+ thp_current_settings()->khugepaged.max_ptes_shared);
+
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has huge page...");
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
+{
+ int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
+ int wstatus;
+ void *p;
+
+ p = alloc_hpage(ops);
+ printf("Share huge page over fork()...");
+ if (!fork()) {
+ /* Do not touch settings on child exit */
+ skip_settings_restore = true;
+ exit_status = 0;
+
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+
+ printf("Trigger CoW on page %d of %d...",
+ hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
+ 1, ops, !c->enforce_pte_scan_limits);
+
+ if (c->enforce_pte_scan_limits) {
+ printf("Trigger CoW on page %d of %d...",
+ hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+ ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
+ page_size);
+ if (ops->check_huge(p, 0))
+ success("OK");
+ else
+ fail("Fail");
+
+ c->collapse("Collapse with max_ptes_shared PTEs shared",
+ p, 1, ops, true);
+ }
+
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+ exit(exit_status);
+ }
+
+ wait(&wstatus);
+ exit_status += WEXITSTATUS(wstatus);
+
+ printf("Check if parent still has huge page...");
+ if (ops->check_huge(p, 1))
+ success("OK");
+ else
+ fail("Fail");
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+static void madvise_collapse_existing_thps(struct collapse_context *c,
+ struct mem_ops *ops)
+{
+ void *p;
+
+ p = ops->setup_area(1);
+ ops->fault(p, 0, hpage_pmd_size);
+ c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
+ validate_memory(p, 0, hpage_pmd_size);
+
+ /* c->collapse() will find a hugepage and complain - call directly. */
+ __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
+ validate_memory(p, 0, hpage_pmd_size);
+ ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Test race with khugepaged where page tables have been retracted and
+ * pmd cleared.
+ */
+static void madvise_retracted_page_tables(struct collapse_context *c,
+ struct mem_ops *ops)
+{
+ void *p;
+ int nr_hpages = 1;
+ unsigned long size = nr_hpages * hpage_pmd_size;
+
+ p = ops->setup_area(nr_hpages);
+ ops->fault(p, 0, size);
+
+ /* Let khugepaged collapse and leave pmd cleared */
+ if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
+ ops)) {
+ fail("Timeout");
+ return;
+ }
+ success("OK");
+ c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
+ true);
+ validate_memory(p, 0, size);
+ ops->cleanup_area(p, size);
+}
+
+static void usage(void)
+{
+ fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n");
+ fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
+ fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
+ fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
+ fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
+ fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
+ fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
+ fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
+ fprintf(stderr, "\tmounted with huge=advise option for khugepaged tests to work\n");
+ fprintf(stderr, "\n\tSupported Options:\n");
+ fprintf(stderr, "\t\t-h: This help message.\n");
+ fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n");
+ fprintf(stderr, "\t\t Defaults to 0. Use this size for anon or shmem allocations.\n");
+ exit(1);
+}
+
+static void parse_test_type(int argc, char **argv)
+{
+ int opt;
+ char *buf;
+ const char *token;
+
+ while ((opt = getopt(argc, argv, "s:h")) != -1) {
+ switch (opt) {
+ case 's':
+ anon_order = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ usage();
+ }
+ }
+
+ argv += optind;
+ argc -= optind;
+
+ if (argc == 0) {
+ /* Backwards compatibility */
+ khugepaged_context = &__khugepaged_context;
+ madvise_context = &__madvise_context;
+ anon_ops = &__anon_ops;
+ return;
+ }
+
+ buf = strdup(argv[0]);
+ token = strsep(&buf, ":");
+
+ if (!strcmp(token, "all")) {
+ khugepaged_context = &__khugepaged_context;
+ madvise_context = &__madvise_context;
+ } else if (!strcmp(token, "khugepaged")) {
+ khugepaged_context = &__khugepaged_context;
+ } else if (!strcmp(token, "madvise")) {
+ madvise_context = &__madvise_context;
+ } else {
+ usage();
+ }
+
+ if (!buf)
+ usage();
+
+ if (!strcmp(buf, "all")) {
+ file_ops = &__file_ops;
+ anon_ops = &__anon_ops;
+ shmem_ops = &__shmem_ops;
+ } else if (!strcmp(buf, "anon")) {
+ anon_ops = &__anon_ops;
+ } else if (!strcmp(buf, "file")) {
+ file_ops = &__file_ops;
+ } else if (!strcmp(buf, "shmem")) {
+ shmem_ops = &__shmem_ops;
+ } else {
+ usage();
+ }
+
+ if (!file_ops)
+ return;
+
+ if (argc != 2)
+ usage();
+
+ get_finfo(argv[1]);
+}
+
+int main(int argc, char **argv)
+{
+ int hpage_pmd_order;
+ struct thp_settings default_settings = {
+ .thp_enabled = THP_MADVISE,
+ .thp_defrag = THP_DEFRAG_ALWAYS,
+ .shmem_enabled = SHMEM_ADVISE,
+ .use_zero_page = 0,
+ .khugepaged = {
+ .defrag = 1,
+ .alloc_sleep_millisecs = 10,
+ .scan_sleep_millisecs = 10,
+ },
+ /*
+ * When testing file-backed memory, the collapse path
+ * looks at how many pages are found in the page cache, not
+ * what pages are mapped. Disable read ahead optimization so
+ * pages don't find their way into the page cache unless
+ * we mem_ops->fault() them in.
+ */
+ .read_ahead_kb = 0,
+ };
+
+ parse_test_type(argc, argv);
+
+ setbuf(stdout, NULL);
+
+ page_size = getpagesize();
+ hpage_pmd_size = read_pmd_pagesize();
+ if (!hpage_pmd_size) {
+ printf("Reading PMD pagesize failed");
+ exit(EXIT_FAILURE);
+ }
+ hpage_pmd_nr = hpage_pmd_size / page_size;
+ hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);
+
+ default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
+ default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+ default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
+ default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
+ default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
+ default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
+ default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
+ default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;
+
+ save_settings();
+ thp_push_settings(&default_settings);
+
+ alloc_at_fault();
+
+#define TEST(t, c, o) do { \
+ if (c && o) { \
+ printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
+ t(c, o); \
+ } \
+ } while (0)
+
+ TEST(collapse_full, khugepaged_context, anon_ops);
+ TEST(collapse_full, khugepaged_context, file_ops);
+ TEST(collapse_full, khugepaged_context, shmem_ops);
+ TEST(collapse_full, madvise_context, anon_ops);
+ TEST(collapse_full, madvise_context, file_ops);
+ TEST(collapse_full, madvise_context, shmem_ops);
+
+ TEST(collapse_empty, khugepaged_context, anon_ops);
+ TEST(collapse_empty, madvise_context, anon_ops);
+
+ TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
+ TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
+ TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
+ TEST(collapse_single_pte_entry, madvise_context, anon_ops);
+ TEST(collapse_single_pte_entry, madvise_context, file_ops);
+ TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
+
+ TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
+ TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
+ TEST(collapse_max_ptes_none, madvise_context, anon_ops);
+ TEST(collapse_max_ptes_none, madvise_context, file_ops);
+
+ TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
+ TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
+ TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
+ TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
+
+ TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
+ TEST(collapse_full_of_compound, khugepaged_context, file_ops);
+ TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
+ TEST(collapse_full_of_compound, madvise_context, anon_ops);
+ TEST(collapse_full_of_compound, madvise_context, file_ops);
+ TEST(collapse_full_of_compound, madvise_context, shmem_ops);
+
+ TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
+ TEST(collapse_compound_extreme, madvise_context, anon_ops);
+
+ TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
+ TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
+
+ TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
+ TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
+
+ TEST(collapse_fork, khugepaged_context, anon_ops);
+ TEST(collapse_fork, madvise_context, anon_ops);
+
+ TEST(collapse_fork_compound, khugepaged_context, anon_ops);
+ TEST(collapse_fork_compound, madvise_context, anon_ops);
+
+ TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
+ TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
+
+ TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
+ TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
+ TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
+
+ TEST(madvise_retracted_page_tables, madvise_context, file_ops);
+ TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
+
+ restore_settings(0);
+}
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
new file mode 100644
index 000000000000..b61803e36d1c
--- /dev/null
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -0,0 +1,719 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KSM functional tests
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <linux/userfaultfd.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define KiB 1024u
+#define MiB (1024 * KiB)
+#define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child"
+
+#define MAP_MERGE_FAIL ((void *)-1)
+#define MAP_MERGE_SKIP ((void *)-2)
+
+enum ksm_merge_mode {
+ KSM_MERGE_PRCTL,
+ KSM_MERGE_MADVISE,
+ KSM_MERGE_NONE, /* PRCTL already set */
+};
+
+static int mem_fd;
+static int ksm_fd;
+static int ksm_full_scans_fd;
+static int proc_self_ksm_stat_fd;
+static int proc_self_ksm_merging_pages_fd;
+static int ksm_use_zero_pages_fd;
+static int pagemap_fd;
+static size_t pagesize;
+
+static bool range_maps_duplicates(char *addr, unsigned long size)
+{
+ unsigned long offs_a, offs_b, pfn_a, pfn_b;
+
+ /*
+ * There is no easy way to check if there are KSM pages mapped into
+ * this range. We only check that the range does not map the same PFN
+ * twice by comparing each pair of mapped pages.
+ */
+ for (offs_a = 0; offs_a < size; offs_a += pagesize) {
+ pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
+ /* Page not present or PFN not exposed by the kernel. */
+ if (pfn_a == -1ul || !pfn_a)
+ continue;
+
+ for (offs_b = offs_a + pagesize; offs_b < size;
+ offs_b += pagesize) {
+ pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
+ if (pfn_b == -1ul || !pfn_b)
+ continue;
+ if (pfn_a == pfn_b)
+ return true;
+ }
+ }
+ return false;
+}
+
+static long get_my_ksm_zero_pages(void)
+{
+ char buf[200];
+ char *substr_ksm_zero;
+ size_t value_pos;
+ ssize_t read_size;
+ unsigned long my_ksm_zero_pages;
+
+ if (!proc_self_ksm_stat_fd)
+ return 0;
+
+ read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0);
+ if (read_size < 0)
+ return -errno;
+
+ buf[read_size] = 0;
+
+ substr_ksm_zero = strstr(buf, "ksm_zero_pages");
+ if (!substr_ksm_zero)
+ return 0;
+
+ value_pos = strcspn(substr_ksm_zero, "0123456789");
+ my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10);
+
+ return my_ksm_zero_pages;
+}
+
+static long get_my_merging_pages(void)
+{
+ char buf[10];
+ ssize_t ret;
+
+ if (proc_self_ksm_merging_pages_fd < 0)
+ return proc_self_ksm_merging_pages_fd;
+
+ ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0);
+ if (ret <= 0)
+ return -errno;
+ buf[ret] = 0;
+
+ return strtol(buf, NULL, 10);
+}
+
+static long ksm_get_full_scans(void)
+{
+ char buf[10];
+ ssize_t ret;
+
+ ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
+ if (ret <= 0)
+ return -errno;
+ buf[ret] = 0;
+
+ return strtol(buf, NULL, 10);
+}
+
+static int ksm_merge(void)
+{
+ long start_scans, end_scans;
+
+ /* Wait for two full scans such that any possible merging happened. */
+ start_scans = ksm_get_full_scans();
+ if (start_scans < 0)
+ return start_scans;
+ if (write(ksm_fd, "1", 1) != 1)
+ return -errno;
+ do {
+ end_scans = ksm_get_full_scans();
+ if (end_scans < 0)
+ return end_scans;
+ } while (end_scans < start_scans + 2);
+
+ return 0;
+}
+
+static int ksm_unmerge(void)
+{
+ if (write(ksm_fd, "2", 1) != 1)
+ return -errno;
+ return 0;
+}
+
+static char *__mmap_and_merge_range(char val, unsigned long size, int prot,
+ enum ksm_merge_mode mode)
+{
+ char *map;
+ char *err_map = MAP_MERGE_FAIL;
+ int ret;
+
+ /* Stabilize accounting by disabling KSM completely. */
+ if (ksm_unmerge()) {
+ ksft_print_msg("Disabling (unmerging) KSM failed\n");
+ return err_map;
+ }
+
+ if (get_my_merging_pages() > 0) {
+ ksft_print_msg("Still pages merged\n");
+ return err_map;
+ }
+
+ map = mmap(NULL, size, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (map == MAP_FAILED) {
+ ksft_print_msg("mmap() failed\n");
+ return err_map;
+ }
+
+ /* Don't use THP. Ignore if THP are not around on a kernel. */
+ if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
+ ksft_print_msg("MADV_NOHUGEPAGE failed\n");
+ goto unmap;
+ }
+
+ /* Make sure each page contains the same values to merge them. */
+ memset(map, val, size);
+
+ if (mprotect(map, size, prot)) {
+ ksft_print_msg("mprotect() failed\n");
+ err_map = MAP_MERGE_SKIP;
+ goto unmap;
+ }
+
+ switch (mode) {
+ case KSM_MERGE_PRCTL:
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL) {
+ ksft_print_msg("PR_SET_MEMORY_MERGE not supported\n");
+ err_map = MAP_MERGE_SKIP;
+ goto unmap;
+ } else if (ret) {
+ ksft_print_msg("PR_SET_MEMORY_MERGE=1 failed\n");
+ goto unmap;
+ }
+ break;
+ case KSM_MERGE_MADVISE:
+ if (madvise(map, size, MADV_MERGEABLE)) {
+ ksft_print_msg("MADV_MERGEABLE failed\n");
+ goto unmap;
+ }
+ break;
+ case KSM_MERGE_NONE:
+ break;
+ }
+
+ /* Run KSM to trigger merging and wait. */
+ if (ksm_merge()) {
+ ksft_print_msg("Running KSM failed\n");
+ goto unmap;
+ }
+
+ /*
+ * Check if anything was merged at all. Ignore the zero page that is
+ * accounted differently (depending on kernel support).
+ */
+ if (val && !get_my_merging_pages()) {
+ ksft_print_msg("No pages got merged\n");
+ goto unmap;
+ }
+
+ return map;
+unmap:
+ munmap(map, size);
+ return err_map;
+}
+
+static char *mmap_and_merge_range(char val, unsigned long size, int prot,
+ enum ksm_merge_mode mode)
+{
+ char *map;
+ char *ret = MAP_FAILED;
+
+ map = __mmap_and_merge_range(val, size, prot, mode);
+ if (map == MAP_MERGE_FAIL)
+ ksft_test_result_fail("Merging memory failed");
+ else if (map == MAP_MERGE_SKIP)
+ ksft_test_result_skip("Merging memory skipped");
+ else
+ ret = map;
+
+ return ret;
+}
+
+static void test_unmerge(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
+ if (map == MAP_FAILED)
+ return;
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+static void test_unmerge_zero_pages(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+ unsigned int offs;
+ unsigned long pages_expected;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ if (proc_self_ksm_stat_fd < 0) {
+ ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n");
+ return;
+ }
+ if (ksm_use_zero_pages_fd < 0) {
+ ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
+ return;
+ }
+ if (write(ksm_use_zero_pages_fd, "1", 1) != 1) {
+ ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n");
+ return;
+ }
+
+ /* Let KSM deduplicate zero pages. */
+ map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
+ if (map == MAP_FAILED)
+ return;
+
+ /* Check if ksm_zero_pages is updated correctly after KSM merging */
+ pages_expected = size / pagesize;
+ if (pages_expected != get_my_ksm_zero_pages()) {
+ ksft_test_result_fail("'ksm_zero_pages' updated after merging\n");
+ goto unmap;
+ }
+
+ /* Try to unmerge half of the region */
+ if (madvise(map, size / 2, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ /* Check if ksm_zero_pages is updated correctly after unmerging */
+ pages_expected /= 2;
+ if (pages_expected != get_my_ksm_zero_pages()) {
+ ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n");
+ goto unmap;
+ }
+
+ /* Trigger unmerging of the other half by writing to the pages. */
+ for (offs = size / 2; offs < size; offs += pagesize)
+ *((unsigned int *)&map[offs]) = offs;
+
+ /* Now we should have no zeropages remaining. */
+ if (get_my_ksm_zero_pages()) {
+ ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n");
+ goto unmap;
+ }
+
+ /* Check if ksm zero pages are really unmerged */
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "KSM zero pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+static void test_unmerge_discarded(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
+ if (map == MAP_FAILED)
+ return;
+
+ /* Discard half of all mapped pages so we have pte_none() entries. */
+ if (madvise(map, size / 2, MADV_DONTNEED)) {
+ ksft_test_result_fail("MADV_DONTNEED failed\n");
+ goto unmap;
+ }
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+#ifdef __NR_userfaultfd
+static void test_unmerge_uffd_wp(void)
+{
+ struct uffdio_writeprotect uffd_writeprotect;
+ const unsigned int size = 2 * MiB;
+ struct uffdio_api uffdio_api;
+ char *map;
+ int uffd;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
+ if (map == MAP_FAILED)
+ return;
+
+ /* See if UFFD is around. */
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ ksft_test_result_skip("__NR_userfaultfd failed\n");
+ goto unmap;
+ }
+
+ /* See if UFFD-WP is around. */
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
+ ksft_test_result_fail("UFFDIO_API failed\n");
+ goto close_uffd;
+ }
+ if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
+ ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
+ goto close_uffd;
+ }
+
+ /* Register UFFD-WP, no need for an actual handler. */
+ if (uffd_register(uffd, map, size, false, true, false)) {
+ ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
+ goto close_uffd;
+ }
+
+ /* Write-protect the range using UFFD-WP. */
+ uffd_writeprotect.range.start = (unsigned long) map;
+ uffd_writeprotect.range.len = size;
+ uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+ if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
+ ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
+ goto close_uffd;
+ }
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto close_uffd;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+close_uffd:
+ close(uffd);
+unmap:
+ munmap(map, size);
+}
+#endif
+
+/* Verify that KSM can be enabled / queried with prctl. */
+static void test_prctl(void)
+{
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL) {
+ ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
+ return;
+ } else if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
+ return;
+ }
+
+ ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0);
+ if (ret < 0) {
+ ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n");
+ return;
+ } else if (ret != 1) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 not effective\n");
+ return;
+ }
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0);
+ if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
+ return;
+ }
+
+ ret = prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0);
+ if (ret < 0) {
+ ksft_test_result_fail("PR_GET_MEMORY_MERGE failed\n");
+ return;
+ } else if (ret != 0) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 not effective\n");
+ return;
+ }
+
+ ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n");
+}
+
+static int test_child_ksm(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ /* Test if KSM is enabled for the process. */
+ if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1)
+ return -1;
+
+ /* Test if merge could really happen. */
+ map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE);
+ if (map == MAP_MERGE_FAIL)
+ return -2;
+ else if (map == MAP_MERGE_SKIP)
+ return -3;
+
+ munmap(map, size);
+ return 0;
+}
+
+static void test_child_ksm_err(int status)
+{
+ if (status == -1)
+ ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n");
+ else if (status == -2)
+ ksft_test_result_fail("Merge in child failed\n");
+ else if (status == -3)
+ ksft_test_result_skip("Merge in child skipped\n");
+}
+
+/* Verify that prctl ksm flag is inherited. */
+static void test_prctl_fork(void)
+{
+ int ret, status;
+ pid_t child_pid;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL) {
+ ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
+ return;
+ } else if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
+ return;
+ }
+
+ child_pid = fork();
+ if (!child_pid) {
+ exit(test_child_ksm());
+ } else if (child_pid < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ return;
+ }
+
+ if (waitpid(child_pid, &status, 0) < 0) {
+ ksft_test_result_fail("waitpid() failed\n");
+ return;
+ }
+
+ status = WEXITSTATUS(status);
+ if (status) {
+ test_child_ksm_err(status);
+ return;
+ }
+
+ if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
+ return;
+ }
+
+ ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n");
+}
+
+static void test_prctl_fork_exec(void)
+{
+ int ret, status;
+ pid_t child_pid;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL) {
+ ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
+ return;
+ } else if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
+ return;
+ }
+
+ child_pid = fork();
+ if (child_pid == -1) {
+ ksft_test_result_skip("fork() failed\n");
+ return;
+ } else if (child_pid == 0) {
+ char *prg_name = "./ksm_functional_tests";
+ char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME };
+
+ execv(prg_name, argv_for_program);
+ return;
+ }
+
+ if (waitpid(child_pid, &status, 0) > 0) {
+ if (WIFEXITED(status)) {
+ status = WEXITSTATUS(status);
+ if (status) {
+ test_child_ksm_err(status);
+ return;
+ }
+ } else {
+ ksft_test_result_fail("program didn't terminate normally\n");
+ return;
+ }
+ } else {
+ ksft_test_result_fail("waitpid() failed\n");
+ return;
+ }
+
+ if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
+ return;
+ }
+
+ ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n");
+}
+
+static void test_prctl_unmerge(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_PRCTL);
+ if (map == MAP_FAILED)
+ return;
+
+ if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+static void test_prot_none(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+ int i;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0x11, size, PROT_NONE, KSM_MERGE_MADVISE);
+ if (map == MAP_FAILED)
+ goto unmap;
+
+ /* Store a unique value in each page on one half using ptrace */
+ for (i = 0; i < size / 2; i += pagesize) {
+ lseek(mem_fd, (uintptr_t) map + i, SEEK_SET);
+ if (write(mem_fd, &i, sizeof(i)) != sizeof(i)) {
+ ksft_test_result_fail("ptrace write failed\n");
+ goto unmap;
+ }
+ }
+
+ /* Trigger unsharing on the other half. */
+ if (madvise(map + size / 2, size / 2, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+static void init_global_file_handles(void)
+{
+ mem_fd = open("/proc/self/mem", O_RDWR);
+ if (mem_fd < 0)
+ ksft_exit_fail_msg("opening /proc/self/mem failed\n");
+ ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
+ if (ksm_fd < 0)
+ ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
+ ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
+ if (ksm_full_scans_fd < 0)
+ ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
+ proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY);
+ proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages",
+ O_RDONLY);
+ ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR);
+}
+
+int main(int argc, char **argv)
+{
+ unsigned int tests = 8;
+ int err;
+
+ if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) {
+ init_global_file_handles();
+ exit(test_child_ksm());
+ }
+
+#ifdef __NR_userfaultfd
+ tests++;
+#endif
+
+ ksft_print_header();
+ ksft_set_plan(tests);
+
+ pagesize = getpagesize();
+
+ init_global_file_handles();
+
+ test_unmerge();
+ test_unmerge_zero_pages();
+ test_unmerge_discarded();
+#ifdef __NR_userfaultfd
+ test_unmerge_uffd_wp();
+#endif
+
+ test_prot_none();
+
+ test_prctl();
+ test_prctl_fork();
+ test_prctl_fork_exec();
+ test_prctl_unmerge();
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
new file mode 100644
index 000000000000..e80deac1436b
--- /dev/null
+++ b/tools/testing/selftests/mm/ksm_tests.c
@@ -0,0 +1,920 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <time.h>
+#include <string.h>
+#include <numa.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <err.h>
+
+#include "../kselftest.h"
+#include <include/vdso/time64.h>
+#include "vm_util.h"
+
+#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
+#define KSM_FP(s) (KSM_SYSFS_PATH s)
+#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
+#define KSM_PAGE_COUNT_DEFAULT 10l
+#define KSM_PROT_STR_DEFAULT "rw"
+#define KSM_USE_ZERO_PAGES_DEFAULT false
+#define KSM_MERGE_ACROSS_NODES_DEFAULT true
+#define KSM_MERGE_TYPE_DEFAULT 0
+#define MB (1ul << 20)
+
+struct ksm_sysfs {
+ unsigned long max_page_sharing;
+ unsigned long merge_across_nodes;
+ unsigned long pages_to_scan;
+ unsigned long run;
+ unsigned long sleep_millisecs;
+ unsigned long stable_node_chains_prune_millisecs;
+ unsigned long use_zero_pages;
+};
+
+enum ksm_merge_type {
+ KSM_MERGE_MADVISE,
+ KSM_MERGE_PRCTL,
+ KSM_MERGE_LAST = KSM_MERGE_PRCTL
+};
+
+enum ksm_test_name {
+ CHECK_KSM_MERGE,
+ CHECK_KSM_UNMERGE,
+ CHECK_KSM_GET_MERGE_TYPE,
+ CHECK_KSM_ZERO_PAGE_MERGE,
+ CHECK_KSM_NUMA_MERGE,
+ KSM_MERGE_TIME,
+ KSM_MERGE_TIME_HUGE_PAGES,
+ KSM_UNMERGE_TIME,
+ KSM_COW_TIME
+};
+
+int debug;
+
+static int ksm_write_sysfs(const char *file_path, unsigned long val)
+{
+ return write_sysfs(file_path, val);
+}
+
+static int ksm_read_sysfs(const char *file_path, unsigned long *val)
+{
+ return read_sysfs(file_path, val);
+}
+
+static void ksm_print_sysfs(void)
+{
+ unsigned long max_page_sharing, pages_sharing, pages_shared;
+ unsigned long full_scans, pages_unshared, pages_volatile;
+ unsigned long stable_node_chains, stable_node_dups;
+ long general_profit;
+
+ if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
+ ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
+ ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing) ||
+ ksm_read_sysfs(KSM_FP("full_scans"), &full_scans) ||
+ ksm_read_sysfs(KSM_FP("pages_unshared"), &pages_unshared) ||
+ ksm_read_sysfs(KSM_FP("pages_volatile"), &pages_volatile) ||
+ ksm_read_sysfs(KSM_FP("stable_node_chains"), &stable_node_chains) ||
+ ksm_read_sysfs(KSM_FP("stable_node_dups"), &stable_node_dups) ||
+ ksm_read_sysfs(KSM_FP("general_profit"), (unsigned long *)&general_profit))
+ return;
+
+ printf("pages_shared : %lu\n", pages_shared);
+ printf("pages_sharing : %lu\n", pages_sharing);
+ printf("max_page_sharing : %lu\n", max_page_sharing);
+ printf("full_scans : %lu\n", full_scans);
+ printf("pages_unshared : %lu\n", pages_unshared);
+ printf("pages_volatile : %lu\n", pages_volatile);
+ printf("stable_node_chains: %lu\n", stable_node_chains);
+ printf("stable_node_dups : %lu\n", stable_node_dups);
+ printf("general_profit : %ld\n", general_profit);
+}
+
+static void ksm_print_procfs(void)
+{
+ const char *file_name = "/proc/self/ksm_stat";
+ char buffer[512];
+ FILE *f = fopen(file_name, "r");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_name);
+ perror("fopen");
+ return;
+ }
+
+ while (fgets(buffer, sizeof(buffer), f))
+ printf("%s", buffer);
+
+ fclose(f);
+}
+
+static int str_to_prot(char *prot_str)
+{
+ int prot = 0;
+
+ if ((strchr(prot_str, 'r')) != NULL)
+ prot |= PROT_READ;
+ if ((strchr(prot_str, 'w')) != NULL)
+ prot |= PROT_WRITE;
+ if ((strchr(prot_str, 'x')) != NULL)
+ prot |= PROT_EXEC;
+
+ return prot;
+}
+
+static void print_help(void)
+{
+ printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
+ "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
+
+ printf("Supported <test type>:\n"
+ " -M (page merging)\n"
+ " -Z (zero pages merging)\n"
+ " -N (merging of pages in different NUMA nodes)\n"
+ " -U (page unmerging)\n"
+ " -P evaluate merging time and speed.\n"
+ " For this test, the size of duplicated memory area (in MiB)\n"
+ " must be provided using -s option\n"
+ " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+ " For this test, the size of duplicated memory area (in MiB)\n"
+ " must be provided using -s option\n"
+ " -D evaluate unmerging time and speed when disabling KSM.\n"
+ " For this test, the size of duplicated memory area (in MiB)\n"
+ " must be provided using -s option\n"
+ " -C evaluate the time required to break COW of merged pages.\n\n");
+
+ printf(" -a: specify the access protections of pages.\n"
+ " <prot> must be of the form [rwx].\n"
+ " Default: %s\n", KSM_PROT_STR_DEFAULT);
+ printf(" -p: specify the number of pages to test.\n"
+ " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
+ printf(" -l: limit the maximum running time (in seconds) for a test.\n"
+ " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
+ printf(" -z: change use_zero_pages tunable\n"
+ " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
+ printf(" -m: change merge_across_nodes tunable\n"
+ " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
+ printf(" -d: turn debugging output on\n");
+ printf(" -s: the size of duplicated memory area (in MiB)\n");
+ printf(" -t: KSM merge type\n"
+ " Default: 0\n"
+ " 0: madvise merging\n"
+ " 1: prctl merging\n");
+
+ exit(0);
+}
+
+static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
+{
+ void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
+
+ if (!map_ptr) {
+ perror("mmap");
+ return NULL;
+ }
+ memset(map_ptr, data, map_size);
+ if (mprotect(map_ptr, map_size, prot)) {
+ perror("mprotect");
+ munmap(map_ptr, map_size);
+ return NULL;
+ }
+
+ return map_ptr;
+}
+
+static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
+{
+ struct timespec cur_time;
+ unsigned long cur_scan, init_scan;
+
+ if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
+ return 1;
+ cur_scan = init_scan;
+
+ while (cur_scan < init_scan + scan_count) {
+ if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
+ return 1;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
+ perror("clock_gettime");
+ return 1;
+ }
+ if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
+ printf("Scan time limit exceeded\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int ksm_merge_pages(int merge_type, void *addr, size_t size,
+ struct timespec start_time, int timeout)
+{
+ if (merge_type == KSM_MERGE_MADVISE) {
+ if (madvise(addr, size, MADV_MERGEABLE)) {
+ perror("madvise");
+ return 1;
+ }
+ } else if (merge_type == KSM_MERGE_PRCTL) {
+ if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0)) {
+ perror("prctl");
+ return 1;
+ }
+ }
+
+ if (ksm_write_sysfs(KSM_FP("run"), 1))
+ return 1;
+
+ /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
+ if (ksm_do_scan(2, start_time, timeout))
+ return 1;
+
+ return 0;
+}
+
+static int ksm_unmerge_pages(void *addr, size_t size,
+ struct timespec start_time, int timeout)
+{
+ if (madvise(addr, size, MADV_UNMERGEABLE)) {
+ perror("madvise");
+ return 1;
+ }
+ return 0;
+}
+
+static bool assert_ksm_pages_count(long dupl_page_count)
+{
+ unsigned long max_page_sharing, pages_sharing, pages_shared;
+
+ if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
+ ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
+ ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
+ return false;
+
+ if (debug) {
+ ksm_print_sysfs();
+ ksm_print_procfs();
+ }
+
+ /*
+ * Since there must be at least 2 pages for merging and 1 page can be
+ * shared with the limited number of pages (max_page_sharing), sometimes
+ * there are 'leftover' pages that cannot be merged. For example, if there
+ * are 11 pages and max_page_sharing = 10, then only 10 pages will be
+ * merged and the 11th page won't be affected. As a result, when the number
+ * of duplicate pages is divided by max_page_sharing and the remainder is 1,
+ * pages_shared and pages_sharing values will be equal between dupl_page_count
+ * and dupl_page_count - 1.
+ */
+ if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
+ if (pages_shared == dupl_page_count / max_page_sharing &&
+ pages_sharing == pages_shared * (max_page_sharing - 1))
+ return true;
+ } else {
+ if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
+ pages_sharing == dupl_page_count - pages_shared)
+ return true;
+ }
+
+ return false;
+}
+
+static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
+{
+ if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
+ numa_available() ? 0 :
+ ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
+ ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
+ ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
+ ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
+ ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+ &ksm_sysfs->stable_node_chains_prune_millisecs) ||
+ ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
+ return 1;
+
+ return 0;
+}
+
+static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
+{
+ if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
+ numa_available() ? 0 :
+ ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
+ ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
+ ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
+ ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
+ ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+ ksm_sysfs->stable_node_chains_prune_millisecs) ||
+ ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
+ return 1;
+
+ return 0;
+}
+
+static int check_ksm_merge(int merge_type, int mapping, int prot,
+ long page_count, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ /* fill pages with the same data and merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /* verify that the right number of pages are merged */
+ if (assert_ksm_pages_count(page_count)) {
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ if (merge_type == KSM_MERGE_PRCTL)
+ prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0);
+ return KSFT_PASS;
+ }
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_unmerge(int merge_type, int mapping, int prot, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+ int page_count = 2;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ /* fill pages with the same data and merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
+ memset(map_ptr, '-', 1);
+ memset(map_ptr + page_size, '+', 1);
+
+ /* get at least 1 scan, so KSM can detect that the pages were modified */
+ if (ksm_do_scan(1, start_time, timeout))
+ goto err_out;
+
+ /* check that unmerging was successful and 0 pages are currently merged */
+ if (assert_ksm_pages_count(0)) {
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+ }
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int check_ksm_zero_page_merge(int merge_type, int mapping, int prot, long page_count,
+ int timeout, bool use_zero_pages, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
+ return KSFT_FAIL;
+
+ /* fill pages with zero and try to merge them */
+ map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ /*
+ * verify that the right number of pages are merged:
+ * 1) if use_zero_pages is set to 1, empty pages are merged
+ * with the kernel zero page instead of with each other;
+ * 2) if use_zero_pages is set to 0, empty pages are not treated specially
+ * and merged as usual.
+ */
+ if (use_zero_pages && !assert_ksm_pages_count(0))
+ goto err_out;
+ else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
+ goto err_out;
+
+ printf("OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+static int get_next_mem_node(int node)
+{
+
+ long node_size;
+ int mem_node = 0;
+ int i, max_node = numa_max_node();
+
+ for (i = node + 1; i <= max_node + node; i++) {
+ mem_node = i % (max_node + 1);
+ node_size = numa_node_size(mem_node, NULL);
+ if (node_size > 0)
+ break;
+ }
+ return mem_node;
+}
+
+static int get_first_mem_node(void)
+{
+ return get_next_mem_node(numa_max_node());
+}
+
+static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeout,
+ bool merge_across_nodes, size_t page_size)
+{
+ void *numa1_map_ptr, *numa2_map_ptr;
+ struct timespec start_time;
+ int page_count = 2;
+ int first_node;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ if (numa_available() < 0) {
+ perror("NUMA support not enabled");
+ return KSFT_SKIP;
+ }
+ if (numa_num_configured_nodes() <= 1) {
+ printf("At least 2 NUMA nodes must be available\n");
+ return KSFT_SKIP;
+ }
+ if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
+ return KSFT_FAIL;
+
+ /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
+ first_node = get_first_mem_node();
+ numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
+ numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
+ if (!numa1_map_ptr || !numa2_map_ptr) {
+ perror("numa_alloc_onnode");
+ return KSFT_FAIL;
+ }
+
+ memset(numa1_map_ptr, '*', page_size);
+ memset(numa2_map_ptr, '*', page_size);
+
+ /* try to merge the pages */
+ if (ksm_merge_pages(merge_type, numa1_map_ptr, page_size, start_time, timeout) ||
+ ksm_merge_pages(merge_type, numa2_map_ptr, page_size, start_time, timeout))
+ goto err_out;
+
+ /*
+ * verify that the right number of pages are merged:
+ * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
+ * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
+ * only 1 unique page in each node and they can't be shared.
+ */
+ if (merge_across_nodes && !assert_ksm_pages_count(page_count))
+ goto err_out;
+ else if (!merge_across_nodes && !assert_ksm_pages_count(0))
+ goto err_out;
+
+ numa_free(numa1_map_ptr, page_size);
+ numa_free(numa2_map_ptr, page_size);
+ printf("OK\n");
+ return KSFT_PASS;
+
+err_out:
+ numa_free(numa1_map_ptr, page_size);
+ numa_free(numa2_map_ptr, page_size);
+ printf("Not OK\n");
+ return KSFT_FAIL;
+}
+
+static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot,
+ int timeout, size_t map_size)
+{
+ void *map_ptr, *map_ptr_orig;
+ struct timespec start_time, end_time;
+ unsigned long scan_time_ns;
+ int pagemap_fd, n_normal_pages, n_huge_pages;
+
+ map_size *= MB;
+ size_t len = map_size;
+
+ len -= len % HPAGE_SIZE;
+ map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+ map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE;
+
+ if (map_ptr_orig == MAP_FAILED)
+ err(2, "initial mmap");
+
+ if (madvise(map_ptr, len, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ err(2, "open pagemap");
+
+ n_normal_pages = 0;
+ n_huge_pages = 0;
+ for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) {
+ if (allocate_transhuge(p, pagemap_fd) < 0)
+ n_normal_pages++;
+ else
+ n_huge_pages++;
+ }
+ printf("Number of normal pages: %d\n", n_normal_pages);
+ printf("Number of huge pages: %d\n", n_huge_pages);
+
+ memset(map_ptr, '*', len);
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout))
+ goto err_out;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n", map_size / MB);
+ printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+ scan_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
+ ((double)scan_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr_orig, len + HPAGE_SIZE);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr_orig, len + HPAGE_SIZE);
+ return KSFT_FAIL;
+}
+
+static int ksm_merge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size)
+{
+ void *map_ptr;
+ struct timespec start_time, end_time;
+ unsigned long scan_time_ns;
+
+ map_size *= MB;
+
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout))
+ goto err_out;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n", map_size / MB);
+ printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+ scan_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
+ ((double)scan_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr, map_size);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, map_size);
+ return KSFT_FAIL;
+}
+
+static int ksm_unmerge_time(int merge_type, int mapping, int prot, int timeout, size_t map_size)
+{
+ void *map_ptr;
+ struct timespec start_time, end_time;
+ unsigned long scan_time_ns;
+
+ map_size *= MB;
+
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+ if (!map_ptr)
+ return KSFT_FAIL;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout))
+ goto err_out;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
+ goto err_out;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n", map_size / MB);
+ printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+ scan_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
+ ((double)scan_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr, map_size);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, map_size);
+ return KSFT_FAIL;
+}
+
+static int ksm_cow_time(int merge_type, int mapping, int prot, int timeout, size_t page_size)
+{
+ void *map_ptr;
+ struct timespec start_time, end_time;
+ unsigned long cow_time_ns;
+
+ /* page_count must be less than 2*page_size */
+ size_t page_count = 4000;
+
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+ if (!map_ptr)
+ return KSFT_FAIL;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+ for (size_t i = 0; i < page_count - 1; i = i + 2)
+ memset(map_ptr + page_size * i, '-', 1);
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ return KSFT_FAIL;
+ }
+
+ cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB);
+ printf("Not merged pages:\n");
+ printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+ cow_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) /
+ ((double)cow_time_ns / NSEC_PER_SEC));
+
+ /* Create 2000 pairs of duplicate pages */
+ for (size_t i = 0; i < page_count - 1; i = i + 2) {
+ memset(map_ptr + page_size * i, '+', i / 2 + 1);
+ memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
+ }
+ if (ksm_merge_pages(merge_type, map_ptr, page_size * page_count, start_time, timeout))
+ goto err_out;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ for (size_t i = 0; i < page_count - 1; i = i + 2)
+ memset(map_ptr + page_size * i, '-', 1);
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Merged pages:\n");
+ printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+ cow_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) /
+ ((double)cow_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, page_size * page_count);
+ return KSFT_FAIL;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret = 0, opt;
+ int prot = 0;
+ int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
+ int merge_type = KSM_MERGE_TYPE_DEFAULT;
+ long page_count = KSM_PAGE_COUNT_DEFAULT;
+ size_t page_size = sysconf(_SC_PAGESIZE);
+ struct ksm_sysfs ksm_sysfs_old;
+ int test_name = CHECK_KSM_MERGE;
+ bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
+ bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
+ long size_MB = 0;
+
+ while ((opt = getopt(argc, argv, "dha:p:l:z:m:s:t:MUZNPCHD")) != -1) {
+ switch (opt) {
+ case 'a':
+ prot = str_to_prot(optarg);
+ break;
+ case 'p':
+ page_count = atol(optarg);
+ if (page_count <= 0) {
+ printf("The number of pages must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ break;
+ case 'l':
+ ksm_scan_limit_sec = atoi(optarg);
+ if (ksm_scan_limit_sec <= 0) {
+ printf("Timeout value must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ break;
+ case 'h':
+ print_help();
+ break;
+ case 'z':
+ if (strcmp(optarg, "0") == 0)
+ use_zero_pages = 0;
+ else
+ use_zero_pages = 1;
+ break;
+ case 'm':
+ if (strcmp(optarg, "0") == 0)
+ merge_across_nodes = 0;
+ else
+ merge_across_nodes = 1;
+ break;
+ case 'd':
+ debug = 1;
+ break;
+ case 's':
+ size_MB = atoi(optarg);
+ if (size_MB <= 0) {
+ printf("Size must be greater than 0\n");
+ return KSFT_FAIL;
+ }
+ break;
+ case 't':
+ {
+ int tmp = atoi(optarg);
+
+ if (tmp < 0 || tmp > KSM_MERGE_LAST) {
+ printf("Invalid merge type\n");
+ return KSFT_FAIL;
+ }
+ merge_type = tmp;
+ }
+ break;
+ case 'M':
+ break;
+ case 'U':
+ test_name = CHECK_KSM_UNMERGE;
+ break;
+ case 'Z':
+ test_name = CHECK_KSM_ZERO_PAGE_MERGE;
+ break;
+ case 'N':
+ test_name = CHECK_KSM_NUMA_MERGE;
+ break;
+ case 'P':
+ test_name = KSM_MERGE_TIME;
+ break;
+ case 'H':
+ test_name = KSM_MERGE_TIME_HUGE_PAGES;
+ break;
+ case 'D':
+ test_name = KSM_UNMERGE_TIME;
+ break;
+ case 'C':
+ test_name = KSM_COW_TIME;
+ break;
+ default:
+ return KSFT_FAIL;
+ }
+ }
+
+ if (prot == 0)
+ prot = str_to_prot(KSM_PROT_STR_DEFAULT);
+
+ if (access(KSM_SYSFS_PATH, F_OK)) {
+ printf("Config KSM not enabled\n");
+ return KSFT_SKIP;
+ }
+
+ if (ksm_save_def(&ksm_sysfs_old)) {
+ printf("Cannot save default tunables\n");
+ return KSFT_FAIL;
+ }
+
+ if (ksm_write_sysfs(KSM_FP("run"), 2) ||
+ ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
+ numa_available() ? 0 :
+ ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
+ ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
+ return KSFT_FAIL;
+
+ switch (test_name) {
+ case CHECK_KSM_MERGE:
+ ret = check_ksm_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+ ksm_scan_limit_sec, page_size);
+ break;
+ case CHECK_KSM_UNMERGE:
+ ret = check_ksm_unmerge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, page_size);
+ break;
+ case CHECK_KSM_ZERO_PAGE_MERGE:
+ ret = check_ksm_zero_page_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ page_count, ksm_scan_limit_sec, use_zero_pages,
+ page_size);
+ break;
+ case CHECK_KSM_NUMA_MERGE:
+ ret = check_ksm_numa_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, merge_across_nodes, page_size);
+ break;
+ case KSM_MERGE_TIME:
+ if (size_MB == 0) {
+ printf("Option '-s' is required.\n");
+ return KSFT_FAIL;
+ }
+ ret = ksm_merge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, size_MB);
+ break;
+ case KSM_MERGE_TIME_HUGE_PAGES:
+ if (size_MB == 0) {
+ printf("Option '-s' is required.\n");
+ return KSFT_FAIL;
+ }
+ ret = ksm_merge_hugepages_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, size_MB);
+ break;
+ case KSM_UNMERGE_TIME:
+ if (size_MB == 0) {
+ printf("Option '-s' is required.\n");
+ return KSFT_FAIL;
+ }
+ ret = ksm_unmerge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, size_MB);
+ break;
+ case KSM_COW_TIME:
+ ret = ksm_cow_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, page_size);
+ break;
+ }
+
+ if (ksm_restore(&ksm_sysfs_old)) {
+ printf("Cannot restore default tunables\n");
+ return KSFT_FAIL;
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c
new file mode 100644
index 000000000000..b6fabd5c27ed
--- /dev/null
+++ b/tools/testing/selftests/mm/madv_populate.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+ *
+ * Copyright 2021, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+/*
+ * For now, we're using 2 MiB of private anonymous memory for all tests.
+ */
+#define SIZE (2 * 1024 * 1024)
+
+static size_t pagesize;
+
+static void sense_support(void)
+{
+ char *addr;
+ int ret;
+
+ addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (!addr)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ ret = madvise(addr, pagesize, MADV_POPULATE_READ);
+ if (ret)
+ ksft_exit_skip("MADV_POPULATE_READ is not available\n");
+
+ ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
+ if (ret)
+ ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
+
+ munmap(addr, pagesize);
+}
+
+static void test_prot_read(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == EINVAL,
+ "MADV_POPULATE_WRITE with PROT_READ\n");
+
+ munmap(addr, SIZE);
+}
+
+static void test_prot_write(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == EINVAL,
+ "MADV_POPULATE_READ with PROT_WRITE\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
+
+ munmap(addr, SIZE);
+}
+
+static void test_holes(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+ ret = munmap(addr + pagesize, pagesize);
+ if (ret)
+ ksft_exit_fail_msg("munmap failed\n");
+
+ /* Hole in the middle */
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_READ with holes in the middle\n");
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_WRITE with holes in the middle\n");
+
+ /* Hole at end */
+ ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_READ with holes at the end\n");
+ ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_WRITE with holes at the end\n");
+
+ /* Hole at beginning */
+ ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_READ with holes at the beginning\n");
+ ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
+ ksft_test_result(ret == -1 && errno == ENOMEM,
+ "MADV_POPULATE_WRITE with holes at the beginning\n");
+
+ munmap(addr, SIZE);
+}
+
+static bool range_is_populated(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (!pagemap_is_populated(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static bool range_is_not_populated(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (pagemap_is_populated(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static void test_populate_read(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+ ksft_test_result(range_is_not_populated(addr, SIZE),
+ "read range initially not populated\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+ ksft_test_result(range_is_populated(addr, SIZE),
+ "read range is populated\n");
+
+ munmap(addr, SIZE);
+}
+
+static void test_populate_write(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+ ksft_test_result(range_is_not_populated(addr, SIZE),
+ "write range initially not populated\n");
+
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+ ksft_test_result(range_is_populated(addr, SIZE),
+ "write range is populated\n");
+
+ munmap(addr, SIZE);
+}
+
+static bool range_is_softdirty(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (!pagemap_is_softdirty(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static bool range_is_not_softdirty(char *start, ssize_t size)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+ bool ret = true;
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+ for (; size > 0 && ret; size -= pagesize, start += pagesize)
+ if (pagemap_is_softdirty(fd, start))
+ ret = false;
+ close(fd);
+ return ret;
+}
+
+static void test_softdirty(void)
+{
+ char *addr;
+ int ret;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ /* Clear any softdirty bits. */
+ clear_softdirty();
+ ksft_test_result(range_is_not_softdirty(addr, SIZE),
+ "cleared range is not softdirty\n");
+
+ /* Populating READ should set softdirty. */
+ ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+ ksft_test_result(!ret, "softdirty MADV_POPULATE_READ\n");
+ ksft_test_result(range_is_not_softdirty(addr, SIZE),
+ "range is not softdirty after MADV_POPULATE_READ\n");
+
+ /* Populating WRITE should set softdirty. */
+ ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+ ksft_test_result(!ret, "softdirty MADV_POPULATE_WRITE\n");
+ ksft_test_result(range_is_softdirty(addr, SIZE),
+ "range is softdirty after MADV_POPULATE_WRITE \n");
+
+ munmap(addr, SIZE);
+}
+
+static int system_has_softdirty(void)
+{
+ /*
+ * There is no way to check if the kernel supports soft-dirty, other
+ * than by writing to a page and seeing if the bit was set. But the
+ * tests are intended to check that the bit gets set when it should, so
+ * doing that check would turn a potentially legitimate fail into a
+ * skip. Fortunately, we know for sure that arm64 does not support
+ * soft-dirty. So for now, let's just use the arch as a corse guide.
+ */
+#if defined(__aarch64__)
+ return 0;
+#else
+ return 1;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+ int nr_tests = 16;
+ int err;
+
+ pagesize = getpagesize();
+
+ if (system_has_softdirty())
+ nr_tests += 5;
+
+ ksft_print_header();
+ ksft_set_plan(nr_tests);
+
+ sense_support();
+ test_prot_read();
+ test_prot_write();
+ test_holes();
+ test_populate_read();
+ test_populate_write();
+ if (system_has_softdirty())
+ test_softdirty();
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c
new file mode 100644
index 000000000000..1e9980b8993c
--- /dev/null
+++ b/tools/testing/selftests/mm/map_fixed_noreplace.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test that MAP_FIXED_NOREPLACE works.
+ *
+ * Copyright 2018, Jann Horn <jannh@google.com>
+ * Copyright 2018, Michael Ellerman, IBM Corporation.
+ */
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "../kselftest.h"
+
+static void dump_maps(void)
+{
+ char cmd[32];
+
+ snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+ system(cmd);
+}
+
+static unsigned long find_base_addr(unsigned long size)
+{
+ void *addr;
+ unsigned long flags;
+
+ flags = MAP_PRIVATE | MAP_ANONYMOUS;
+ addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n");
+
+ if (munmap(addr, size) != 0)
+ ksft_exit_fail_msg("Error: munmap failed\n");
+
+ return (unsigned long)addr;
+}
+
+int main(void)
+{
+ unsigned long base_addr;
+ unsigned long flags, addr, size, page_size;
+ char *p;
+
+ ksft_print_header();
+ ksft_set_plan(9);
+
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ /* let's find a base addr that is free before we start the tests */
+ size = 5 * page_size;
+ base_addr = find_base_addr(size);
+
+ flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+
+ /* Check we can map all the areas we need below */
+ addr = base_addr;
+ size = 5 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p == MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n");
+ }
+ if (munmap((void *)addr, 5 * page_size) != 0) {
+ dump_maps();
+ ksft_exit_fail_msg("Error: munmap failed!?\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("mmap() 5*PAGE_SIZE at base\n");
+
+ addr = base_addr + page_size;
+ size = 3 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p == MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("mmap() 3*PAGE_SIZE at base+PAGE_SIZE\n");
+
+ /*
+ * Exact same mapping again:
+ * base | free | new
+ * +1 | mapped | new
+ * +2 | mapped | new
+ * +3 | mapped | new
+ * +4 | free | new
+ */
+ addr = base_addr;
+ size = 5 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p != MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("Second mmap() 5*PAGE_SIZE at base\n");
+
+ /*
+ * Second mapping contained within first:
+ *
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped | new
+ * +3 | mapped |
+ * +4 | free |
+ */
+ addr = base_addr + (2 * page_size);
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p != MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("mmap() 2*PAGE_SIZE at base+PAGE_SIZE\n");
+
+ /*
+ * Overlap end of existing mapping:
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped | new
+ * +4 | free | new
+ */
+ addr = base_addr + (3 * page_size);
+ size = 2 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p != MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("mmap() 2*PAGE_SIZE at base+(3*PAGE_SIZE)\n");
+
+ /*
+ * Overlap start of existing mapping:
+ * base | free | new
+ * +1 | mapped | new
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free |
+ */
+ addr = base_addr;
+ size = 2 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p != MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("mmap() 2*PAGE_SIZE bytes at base\n");
+
+ /*
+ * Adjacent to start of existing mapping:
+ * base | free | new
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free |
+ */
+ addr = base_addr;
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p == MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("mmap() PAGE_SIZE at base\n");
+
+ /*
+ * Adjacent to end of existing mapping:
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free | new
+ */
+ addr = base_addr + (4 * page_size);
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ if (p == MAP_FAILED) {
+ dump_maps();
+ ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n");
+ }
+ ksft_print_msg("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+ ksft_test_result_pass("mmap() PAGE_SIZE at base+(4*PAGE_SIZE)\n");
+
+ addr = base_addr;
+ size = 5 * page_size;
+ if (munmap((void *)addr, size) != 0) {
+ dump_maps();
+ ksft_exit_fail_msg("Error: munmap failed!?\n");
+ }
+ ksft_test_result_pass("Base Address unmap() successful\n");
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c
new file mode 100644
index 000000000000..b47399feab53
--- /dev/null
+++ b/tools/testing/selftests/mm/map_hugetlb.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag. Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+static void check_bytes(char *addr)
+{
+ ksft_print_msg("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr, size_t length)
+{
+ unsigned long i;
+
+ for (i = 0; i < length; i++)
+ *(addr + i) = (char)i;
+}
+
+static void read_bytes(char *addr, size_t length)
+{
+ unsigned long i;
+
+ check_bytes(addr);
+ for (i = 0; i < length; i++)
+ if (*(addr + i) != (char)i)
+ ksft_exit_fail_msg("Mismatch at %lu\n", i);
+
+ ksft_test_result_pass("Read correct data\n");
+}
+
+int main(int argc, char **argv)
+{
+ void *addr;
+ size_t hugepage_size;
+ size_t length = LENGTH;
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+ int shift = 0;
+
+ hugepage_size = default_huge_page_size();
+ /* munmap with fail if the length is not page aligned */
+ if (hugepage_size > length)
+ length = hugepage_size;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ if (argc > 1)
+ length = atol(argv[1]) << 20;
+ if (argc > 2) {
+ shift = atoi(argv[2]);
+ if (shift)
+ flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+ }
+
+ if (shift)
+ ksft_print_msg("%u kB hugepages\n", 1 << (shift - 10));
+ else
+ ksft_print_msg("Default size hugepages\n");
+ ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
+
+ addr = mmap(NULL, length, PROTECTION, flags, -1, 0);
+ if (addr == MAP_FAILED)
+ ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
+
+ ksft_print_msg("Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr, length);
+ read_bytes(addr, length);
+
+ /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+ if (munmap(addr, length))
+ ksft_exit_fail_msg("munmap: %s\n", strerror(errno));
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c
new file mode 100644
index 000000000000..9df2636c829b
--- /dev/null
+++ b/tools/testing/selftests/mm/map_populate.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "../kselftest.h"
+
+#include "vm_util.h"
+
+#define MMAP_SZ 4096
+
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) \
+ ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \
+ __func__, __LINE__, (description), \
+ strerror(errno)); \
+ } while (0)
+
+#define TESTS_IN_CHILD 2
+
+static void parent_f(int sock, unsigned long *smap, int child)
+{
+ int status, ret;
+
+ ret = read(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ *smap = 0x22222BAD;
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ ret = write(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ waitpid(child, &status, 0);
+
+ /* The ksft macros don't keep counters between processes */
+ ksft_cnt.ksft_pass = WEXITSTATUS(status);
+ ksft_cnt.ksft_fail = TESTS_IN_CHILD - WEXITSTATUS(status);
+}
+
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+ int ret, buf = 0;
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_POPULATE, fd, 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+ ret = write(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ ret = read(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ ksft_test_result(*smap != 0x22222BAD, "MAP_POPULATE COW private page\n");
+ ksft_test_result(*smap == 0xdeadbabe, "The mapping state\n");
+
+ /* The ksft macros don't keep counters between processes */
+ return ksft_cnt.ksft_pass;
+}
+
+int main(int argc, char **argv)
+{
+ int sock[2], child, ret;
+ FILE *ftmp;
+ unsigned long *smap;
+
+ ksft_print_header();
+ ksft_set_plan(TESTS_IN_CHILD);
+
+ ftmp = tmpfile();
+ BUG_ON(!ftmp, "tmpfile()");
+
+ ret = ftruncate(fileno(ftmp), MMAP_SZ);
+ if (ret < 0 && errno == ENOENT) {
+ skip_test_dodgy_fs("ftruncate()");
+ }
+ BUG_ON(ret, "ftruncate()");
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fileno(ftmp), 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ *smap = 0xdeadbabe;
+ /* Probably unnecessary, but let it be. */
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+ BUG_ON(ret, "socketpair()");
+
+ child = fork();
+ BUG_ON(child == -1, "fork()");
+
+ if (child) {
+ ret = close(sock[0]);
+ BUG_ON(ret, "close()");
+
+ parent_f(sock[1], smap, child);
+
+ ksft_finished();
+ }
+
+ ret = close(sock[1]);
+ BUG_ON(ret, "close()");
+
+ return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c
new file mode 100644
index 000000000000..200bedcdc32e
--- /dev/null
+++ b/tools/testing/selftests/mm/mdwe_test.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifdef __aarch64__
+#include <asm/hwcap.h>
+#endif
+
+#include <linux/mman.h>
+#include <linux/prctl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef __aarch64__
+# define PROT_BTI 0
+#endif
+
+TEST(prctl_flags)
+{
+ EXPECT_LT(prctl(PR_SET_MDWE, PR_MDWE_NO_INHERIT, 0L, 0L, 7L), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0);
+ EXPECT_EQ(errno, EINVAL);
+ EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0);
+ EXPECT_EQ(errno, EINVAL);
+ EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0);
+ EXPECT_EQ(errno, EINVAL);
+ EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0);
+ EXPECT_EQ(errno, EINVAL);
+ EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0);
+ EXPECT_EQ(errno, EINVAL);
+ EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0);
+ EXPECT_EQ(errno, EINVAL);
+ EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0);
+ EXPECT_EQ(errno, EINVAL);
+}
+
+FIXTURE(consecutive_prctl_flags) {};
+FIXTURE_SETUP(consecutive_prctl_flags) {}
+FIXTURE_TEARDOWN(consecutive_prctl_flags) {}
+
+FIXTURE_VARIANT(consecutive_prctl_flags)
+{
+ unsigned long first_flags;
+ unsigned long second_flags;
+ bool should_work;
+};
+
+FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_no_flags)
+{
+ .first_flags = 0,
+ .second_flags = 0,
+ .should_work = true,
+};
+
+FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_exec_gain)
+{
+ .first_flags = PR_MDWE_REFUSE_EXEC_GAIN,
+ .second_flags = PR_MDWE_REFUSE_EXEC_GAIN,
+ .should_work = true,
+};
+
+FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_both_flags)
+{
+ .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT,
+ .second_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT,
+ .should_work = true,
+};
+
+FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_mdwe)
+{
+ .first_flags = PR_MDWE_REFUSE_EXEC_GAIN,
+ .second_flags = 0,
+ .should_work = false,
+};
+
+FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_mdwe_no_inherit)
+{
+ .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT,
+ .second_flags = 0,
+ .should_work = false,
+};
+
+FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_no_inherit)
+{
+ .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT,
+ .second_flags = PR_MDWE_REFUSE_EXEC_GAIN,
+ .should_work = false,
+};
+
+FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_enable_no_inherit)
+{
+ .first_flags = PR_MDWE_REFUSE_EXEC_GAIN,
+ .second_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT,
+ .should_work = false,
+};
+
+TEST_F(consecutive_prctl_flags, two_prctls)
+{
+ int ret;
+
+ EXPECT_EQ(prctl(PR_SET_MDWE, variant->first_flags, 0L, 0L, 0L), 0);
+
+ ret = prctl(PR_SET_MDWE, variant->second_flags, 0L, 0L, 0L);
+ if (variant->should_work) {
+ EXPECT_EQ(ret, 0);
+
+ ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L);
+ ASSERT_EQ(ret, variant->second_flags);
+ } else {
+ EXPECT_NE(ret, 0);
+ ASSERT_EQ(errno, EPERM);
+ }
+}
+
+FIXTURE(mdwe)
+{
+ void *p;
+ int flags;
+ size_t size;
+ pid_t pid;
+};
+
+FIXTURE_VARIANT(mdwe)
+{
+ bool enabled;
+ bool forked;
+ bool inherit;
+};
+
+FIXTURE_VARIANT_ADD(mdwe, stock)
+{
+ .enabled = false,
+ .forked = false,
+ .inherit = false,
+};
+
+FIXTURE_VARIANT_ADD(mdwe, enabled)
+{
+ .enabled = true,
+ .forked = false,
+ .inherit = true,
+};
+
+FIXTURE_VARIANT_ADD(mdwe, inherited)
+{
+ .enabled = true,
+ .forked = true,
+ .inherit = true,
+};
+
+FIXTURE_VARIANT_ADD(mdwe, not_inherited)
+{
+ .enabled = true,
+ .forked = true,
+ .inherit = false,
+};
+
+static bool executable_map_should_fail(const FIXTURE_VARIANT(mdwe) *variant)
+{
+ return variant->enabled && (!variant->forked || variant->inherit);
+}
+
+FIXTURE_SETUP(mdwe)
+{
+ unsigned long mdwe_flags;
+ int ret, status;
+
+ self->p = NULL;
+ self->flags = MAP_SHARED | MAP_ANONYMOUS;
+ self->size = getpagesize();
+
+ if (!variant->enabled)
+ return;
+
+ mdwe_flags = PR_MDWE_REFUSE_EXEC_GAIN;
+ if (!variant->inherit)
+ mdwe_flags |= PR_MDWE_NO_INHERIT;
+
+ ret = prctl(PR_SET_MDWE, mdwe_flags, 0L, 0L, 0L);
+ ASSERT_EQ(ret, 0) {
+ TH_LOG("PR_SET_MDWE failed or unsupported");
+ }
+
+ ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L);
+ ASSERT_EQ(ret, mdwe_flags);
+
+ if (variant->forked) {
+ self->pid = fork();
+ ASSERT_GE(self->pid, 0) {
+ TH_LOG("fork failed\n");
+ }
+
+ if (self->pid > 0) {
+ ret = waitpid(self->pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ exit(WEXITSTATUS(status));
+ }
+ }
+}
+
+FIXTURE_TEARDOWN(mdwe)
+{
+ if (self->p && self->p != MAP_FAILED)
+ munmap(self->p, self->size);
+}
+
+TEST_F(mdwe, mmap_READ_EXEC)
+{
+ self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0);
+ EXPECT_NE(self->p, MAP_FAILED);
+}
+
+TEST_F(mdwe, mmap_WRITE_EXEC)
+{
+ self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0);
+ if (executable_map_should_fail(variant)) {
+ EXPECT_EQ(self->p, MAP_FAILED);
+ } else {
+ EXPECT_NE(self->p, MAP_FAILED);
+ }
+}
+
+TEST_F(mdwe, mprotect_stay_EXEC)
+{
+ int ret;
+
+ self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0);
+ ASSERT_NE(self->p, MAP_FAILED);
+
+ ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC);
+ EXPECT_EQ(ret, 0);
+}
+
+TEST_F(mdwe, mprotect_add_EXEC)
+{
+ int ret;
+
+ self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0);
+ ASSERT_NE(self->p, MAP_FAILED);
+
+ ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC);
+ if (executable_map_should_fail(variant)) {
+ EXPECT_LT(ret, 0);
+ } else {
+ EXPECT_EQ(ret, 0);
+ }
+}
+
+TEST_F(mdwe, mprotect_WRITE_EXEC)
+{
+ int ret;
+
+ self->p = mmap(NULL, self->size, PROT_WRITE, self->flags, 0, 0);
+ ASSERT_NE(self->p, MAP_FAILED);
+
+ ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC);
+ if (executable_map_should_fail(variant)) {
+ EXPECT_LT(ret, 0);
+ } else {
+ EXPECT_EQ(ret, 0);
+ }
+}
+
+TEST_F(mdwe, mmap_FIXED)
+{
+ void *p;
+
+ self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0);
+ ASSERT_NE(self->p, MAP_FAILED);
+
+ /* MAP_FIXED unmaps the existing page before mapping which is allowed */
+ p = mmap(self->p, self->size, PROT_READ | PROT_EXEC,
+ self->flags | MAP_FIXED, 0, 0);
+ EXPECT_EQ(p, self->p);
+}
+
+TEST_F(mdwe, arm64_BTI)
+{
+ int ret;
+
+#ifdef __aarch64__
+ if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI))
+#endif
+ SKIP(return, "HWCAP2_BTI not supported");
+
+ self->p = mmap(NULL, self->size, PROT_EXEC, self->flags, 0, 0);
+ ASSERT_NE(self->p, MAP_FAILED);
+
+ ret = mprotect(self->p, self->size, PROT_EXEC | PROT_BTI);
+ EXPECT_EQ(ret, 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c
new file mode 100644
index 000000000000..9a0597310a76
--- /dev/null
+++ b/tools/testing/selftests/mm/memfd_secret.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2021
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+#include "../kselftest.h"
+
+#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__)
+#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
+#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
+
+#ifdef __NR_memfd_secret
+
+#define PATTERN 0x55
+
+static const int prot = PROT_READ | PROT_WRITE;
+static const int mode = MAP_SHARED;
+
+static unsigned long page_size;
+static unsigned long mlock_limit_cur;
+static unsigned long mlock_limit_max;
+
+static int memfd_secret(unsigned int flags)
+{
+ return syscall(__NR_memfd_secret, flags);
+}
+
+static void test_file_apis(int fd)
+{
+ char buf[64];
+
+ if ((read(fd, buf, sizeof(buf)) >= 0) ||
+ (write(fd, buf, sizeof(buf)) >= 0) ||
+ (pread(fd, buf, sizeof(buf), 0) >= 0) ||
+ (pwrite(fd, buf, sizeof(buf), 0) >= 0))
+ fail("unexpected file IO\n");
+ else
+ pass("file IO is blocked as expected\n");
+}
+
+static void test_mlock_limit(int fd)
+{
+ size_t len;
+ char *mem;
+
+ len = mlock_limit_cur;
+ if (len % page_size != 0)
+ len = (len/page_size) * page_size;
+
+ mem = mmap(NULL, len, prot, mode, fd, 0);
+ if (mem == MAP_FAILED) {
+ fail("unable to mmap secret memory\n");
+ return;
+ }
+ munmap(mem, len);
+
+ len = mlock_limit_max * 2;
+ mem = mmap(NULL, len, prot, mode, fd, 0);
+ if (mem != MAP_FAILED) {
+ fail("unexpected mlock limit violation\n");
+ munmap(mem, len);
+ return;
+ }
+
+ pass("mlock limit is respected\n");
+}
+
+static void test_vmsplice(int fd, const char *desc)
+{
+ ssize_t transferred;
+ struct iovec iov;
+ int pipefd[2];
+ char *mem;
+
+ if (pipe(pipefd)) {
+ fail("pipe failed: %s\n", strerror(errno));
+ return;
+ }
+
+ mem = mmap(NULL, page_size, prot, mode, fd, 0);
+ if (mem == MAP_FAILED) {
+ fail("Unable to mmap secret memory\n");
+ goto close_pipe;
+ }
+
+ /*
+ * vmsplice() may use GUP-fast, which must also fail. Prefault the
+ * page table, so GUP-fast could find it.
+ */
+ memset(mem, PATTERN, page_size);
+
+ iov.iov_base = mem;
+ iov.iov_len = page_size;
+ transferred = vmsplice(pipefd[1], &iov, 1, 0);
+
+ if (transferred < 0 && errno == EFAULT)
+ pass("vmsplice is blocked as expected with %s\n", desc);
+ else
+ fail("vmsplice: unexpected memory access with %s\n", desc);
+
+ munmap(mem, page_size);
+close_pipe:
+ close(pipefd[0]);
+ close(pipefd[1]);
+}
+
+static void try_process_vm_read(int fd, int pipefd[2])
+{
+ struct iovec liov, riov;
+ char buf[64];
+ char *mem;
+
+ if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+ fail("pipe write: %s\n", strerror(errno));
+ exit(KSFT_FAIL);
+ }
+
+ liov.iov_len = riov.iov_len = sizeof(buf);
+ liov.iov_base = buf;
+ riov.iov_base = mem;
+
+ if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) {
+ if (errno == ENOSYS)
+ exit(KSFT_SKIP);
+ exit(KSFT_PASS);
+ }
+
+ exit(KSFT_FAIL);
+}
+
+static void try_ptrace(int fd, int pipefd[2])
+{
+ pid_t ppid = getppid();
+ int status;
+ char *mem;
+ long ret;
+
+ if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+ perror("pipe write");
+ exit(KSFT_FAIL);
+ }
+
+ ret = ptrace(PTRACE_ATTACH, ppid, 0, 0);
+ if (ret) {
+ perror("ptrace_attach");
+ exit(KSFT_FAIL);
+ }
+
+ ret = waitpid(ppid, &status, WUNTRACED);
+ if ((ret != ppid) || !(WIFSTOPPED(status))) {
+ fprintf(stderr, "weird waitppid result %ld stat %x\n",
+ ret, status);
+ exit(KSFT_FAIL);
+ }
+
+ if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0))
+ exit(KSFT_PASS);
+
+ exit(KSFT_FAIL);
+}
+
+static void check_child_status(pid_t pid, const char *name)
+{
+ int status;
+
+ waitpid(pid, &status, 0);
+
+ if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) {
+ skip("%s is not supported\n", name);
+ return;
+ }
+
+ if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) ||
+ WIFSIGNALED(status)) {
+ pass("%s is blocked as expected\n", name);
+ return;
+ }
+
+ fail("%s: unexpected memory access\n", name);
+}
+
+static void test_remote_access(int fd, const char *name,
+ void (*func)(int fd, int pipefd[2]))
+{
+ int pipefd[2];
+ pid_t pid;
+ char *mem;
+
+ if (pipe(pipefd)) {
+ fail("pipe failed: %s\n", strerror(errno));
+ return;
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ fail("fork failed: %s\n", strerror(errno));
+ return;
+ }
+
+ if (pid == 0) {
+ func(fd, pipefd);
+ return;
+ }
+
+ mem = mmap(NULL, page_size, prot, mode, fd, 0);
+ if (mem == MAP_FAILED) {
+ fail("Unable to mmap secret memory\n");
+ return;
+ }
+
+ memset(mem, PATTERN, page_size);
+
+ if (write(pipefd[1], &mem, sizeof(mem)) < 0) {
+ fail("pipe write: %s\n", strerror(errno));
+ return;
+ }
+
+ check_child_status(pid, name);
+}
+
+static void test_process_vm_read(int fd)
+{
+ test_remote_access(fd, "process_vm_read", try_process_vm_read);
+}
+
+static void test_ptrace(int fd)
+{
+ test_remote_access(fd, "ptrace", try_ptrace);
+}
+
+static int set_cap_limits(rlim_t max)
+{
+ struct rlimit new;
+ cap_t cap = cap_init();
+
+ new.rlim_cur = max;
+ new.rlim_max = max;
+ if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+ perror("setrlimit() returns error");
+ return -1;
+ }
+
+ /* drop capabilities including CAP_IPC_LOCK */
+ if (cap_set_proc(cap)) {
+ perror("cap_set_proc() returns error");
+ return -2;
+ }
+
+ return 0;
+}
+
+static void prepare(void)
+{
+ struct rlimit rlim;
+
+ page_size = sysconf(_SC_PAGE_SIZE);
+ if (!page_size)
+ ksft_exit_fail_msg("Failed to get page size %s\n",
+ strerror(errno));
+
+ if (getrlimit(RLIMIT_MEMLOCK, &rlim))
+ ksft_exit_fail_msg("Unable to detect mlock limit: %s\n",
+ strerror(errno));
+
+ mlock_limit_cur = rlim.rlim_cur;
+ mlock_limit_max = rlim.rlim_max;
+
+ printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n",
+ page_size, mlock_limit_cur, mlock_limit_max);
+
+ if (page_size > mlock_limit_cur)
+ mlock_limit_cur = page_size;
+ if (page_size > mlock_limit_max)
+ mlock_limit_max = page_size;
+
+ if (set_cap_limits(mlock_limit_max))
+ ksft_exit_fail_msg("Unable to set mlock limit: %s\n",
+ strerror(errno));
+}
+
+#define NUM_TESTS 6
+
+int main(int argc, char *argv[])
+{
+ int fd;
+
+ prepare();
+
+ ksft_print_header();
+ ksft_set_plan(NUM_TESTS);
+
+ fd = memfd_secret(0);
+ if (fd < 0) {
+ if (errno == ENOSYS)
+ ksft_exit_skip("memfd_secret is not supported\n");
+ else
+ ksft_exit_fail_msg("memfd_secret failed: %s\n",
+ strerror(errno));
+ }
+ if (ftruncate(fd, page_size))
+ ksft_exit_fail_msg("ftruncate failed: %s\n", strerror(errno));
+
+ test_mlock_limit(fd);
+ test_file_apis(fd);
+ /*
+ * We have to run the first vmsplice test before any secretmem page was
+ * allocated for this fd.
+ */
+ test_vmsplice(fd, "fresh page");
+ test_vmsplice(fd, "existing page");
+ test_process_vm_read(fd);
+ test_ptrace(fd);
+
+ close(fd);
+
+ ksft_finished();
+}
+
+#else /* __NR_memfd_secret */
+
+int main(int argc, char *argv[])
+{
+ printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_memfd_secret */
diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
new file mode 100644
index 000000000000..cc26480098ae
--- /dev/null
+++ b/tools/testing/selftests/mm/merge.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#define _GNU_SOURCE
+#include "../kselftest_harness.h"
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <linux/perf_event.h>
+#include "vm_util.h"
+
+FIXTURE(merge)
+{
+ unsigned int page_size;
+ char *carveout;
+ struct procmap_fd procmap;
+};
+
+FIXTURE_SETUP(merge)
+{
+ self->page_size = psize();
+ /* Carve out PROT_NONE region to map over. */
+ self->carveout = mmap(NULL, 12 * self->page_size, PROT_NONE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(self->carveout, MAP_FAILED);
+ /* Setup PROCMAP_QUERY interface. */
+ ASSERT_EQ(open_self_procmap(&self->procmap), 0);
+}
+
+FIXTURE_TEARDOWN(merge)
+{
+ ASSERT_EQ(munmap(self->carveout, 12 * self->page_size), 0);
+ ASSERT_EQ(close_procmap(&self->procmap), 0);
+}
+
+TEST_F(merge, mprotect_unfaulted_left)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * Map 10 pages of R/W memory within. MAP_NORESERVE so we don't hit
+ * merge failure due to lack of VM_ACCOUNT flag by mistake.
+ *
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the first 5 pages read-only, splitting the VMA:
+ *
+ * RO RW
+ * |-----------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the last 5 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RO RW
+ * |-----------|-----------|
+ * | unfaulted | faulted |
+ * |-----------|-----------|
+ */
+ ptr[5 * page_size] = 'x';
+ /*
+ * Now mprotect() the RW region read-only, we should merge (though for
+ * ~15 years we did not! :):
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size);
+}
+
+TEST_F(merge, mprotect_unfaulted_right)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[page_size], 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the last 5 pages read-only, splitting the VMA:
+ *
+ * RW RO
+ * |-----------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(&ptr[5 * page_size], 5 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the first 5 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RW RO
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ */
+ ptr[0] = 'x';
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 10 * page_size);
+}
+
+TEST_F(merge, mprotect_unfaulted_both)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the first and last 3 pages read-only, splitting the VMA:
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | unfaulted | unfaulted | unfaulted |
+ * |-----------|-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the middle 3 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | unfaulted | faulted | unfaulted |
+ * |-----------|-----------|-----------|
+ */
+ ptr[3 * page_size] = 'x';
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size);
+}
+
+TEST_F(merge, mprotect_faulted_left_unfaulted_right)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the last 3 pages read-only, splitting the VMA:
+ *
+ * RW RO
+ * |-----------------------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------------------|-----------|
+ */
+ ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the first 6 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RW RO
+ * |-----------------------|-----------|
+ * | unfaulted | unfaulted |
+ * |-----------------------|-----------|
+ */
+ ptr[0] = 'x';
+ /*
+ * Now make the first 3 pages read-only, splitting the VMA:
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | faulted | faulted | unfaulted |
+ * |-----------|-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size);
+}
+
+TEST_F(merge, mprotect_unfaulted_left_faulted_right)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ char *ptr;
+
+ /*
+ * |-----------------------|
+ * | unfaulted |
+ * |-----------------------|
+ */
+ ptr = mmap(&carveout[2 * page_size], 9 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+ /*
+ * Now make the first 3 pages read-only, splitting the VMA:
+ *
+ * RO RW
+ * |-----------|-----------------------|
+ * | unfaulted | unfaulted |
+ * |-----------|-----------------------|
+ */
+ ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0);
+ /*
+ * Fault in the first of the last 6 pages so it gets an anon_vma and
+ * thus the whole VMA becomes 'faulted':
+ *
+ * RO RW
+ * |-----------|-----------------------|
+ * | unfaulted | faulted |
+ * |-----------|-----------------------|
+ */
+ ptr[3 * page_size] = 'x';
+ /*
+ * Now make the last 3 pages read-only, splitting the VMA:
+ *
+ * RO RW RO
+ * |-----------|-----------|-----------|
+ * | unfaulted | faulted | faulted |
+ * |-----------|-----------|-----------|
+ */
+ ASSERT_EQ(mprotect(&ptr[6 * page_size], 3 * page_size, PROT_READ), 0);
+ /*
+ * Now mprotect() the RW region read-only, we should merge:
+ *
+ * RO
+ * |-----------------------|
+ * | faulted |
+ * |-----------------------|
+ */
+ ASSERT_EQ(mprotect(&ptr[3 * page_size], 3 * page_size, PROT_READ), 0);
+
+ /* Assert that the merge succeeded using PROCMAP_QUERY. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 9 * page_size);
+}
+
+TEST_F(merge, forked_target_vma)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ pid_t pid;
+ char *ptr, *ptr2;
+ int i;
+
+ /*
+ * |-----------|
+ * | unfaulted |
+ * |-----------|
+ */
+ ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * Fault in process.
+ *
+ * |-----------|
+ * | faulted |
+ * |-----------|
+ */
+ ptr[0] = 'x';
+
+ pid = fork();
+ ASSERT_NE(pid, -1);
+
+ if (pid != 0) {
+ wait(NULL);
+ return;
+ }
+
+ /* Child process below: */
+
+ /* Reopen for child. */
+ ASSERT_EQ(close_procmap(&self->procmap), 0);
+ ASSERT_EQ(open_self_procmap(&self->procmap), 0);
+
+ /* unCOWing everything does not cause the AVC to go away. */
+ for (i = 0; i < 5 * page_size; i += page_size)
+ ptr[i] = 'x';
+
+ /*
+ * Map in adjacent VMA in child.
+ *
+ * forked
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ * ptr ptr2
+ */
+ ptr2 = mmap(&ptr[5 * page_size], 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ /* Make sure not merged. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 5 * page_size);
+}
+
+TEST_F(merge, forked_source_vma)
+{
+ unsigned int page_size = self->page_size;
+ char *carveout = self->carveout;
+ struct procmap_fd *procmap = &self->procmap;
+ pid_t pid;
+ char *ptr, *ptr2;
+ int i;
+
+ /*
+ * |-----------|------------|
+ * | unfaulted | <unmapped> |
+ * |-----------|------------|
+ */
+ ptr = mmap(&carveout[page_size], 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ /*
+ * Fault in process.
+ *
+ * |-----------|------------|
+ * | faulted | <unmapped> |
+ * |-----------|------------|
+ */
+ ptr[0] = 'x';
+
+ pid = fork();
+ ASSERT_NE(pid, -1);
+
+ if (pid != 0) {
+ wait(NULL);
+ return;
+ }
+
+ /* Child process below: */
+
+ /* Reopen for child. */
+ ASSERT_EQ(close_procmap(&self->procmap), 0);
+ ASSERT_EQ(open_self_procmap(&self->procmap), 0);
+
+ /* unCOWing everything does not cause the AVC to go away. */
+ for (i = 0; i < 5 * page_size; i += page_size)
+ ptr[i] = 'x';
+
+ /*
+ * Map in adjacent VMA in child, ptr2 after ptr, but incompatible.
+ *
+ * forked RW RWX
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ * ptr ptr2
+ */
+ ptr2 = mmap(&carveout[6 * page_size], 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE, -1, 0);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ /* Make sure not merged. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr2));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size);
+
+ /*
+ * Now mprotect forked region to RWX so it becomes the source for the
+ * merge to unfaulted region:
+ *
+ * forked RWX RWX
+ * |-----------|-----------|
+ * | faulted | unfaulted |
+ * |-----------|-----------|
+ * ptr ptr2
+ *
+ * This should NOT result in a merge, as ptr was forked.
+ */
+ ASSERT_EQ(mprotect(ptr, 5 * page_size, PROT_READ | PROT_WRITE | PROT_EXEC), 0);
+ /* Again, make sure not merged. */
+ ASSERT_TRUE(find_vma_procmap(procmap, ptr2));
+ ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr2);
+ ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr2 + 5 * page_size);
+}
+
+TEST_F(merge, handle_uprobe_upon_merged_vma)
+{
+ const size_t attr_sz = sizeof(struct perf_event_attr);
+ unsigned int page_size = self->page_size;
+ const char *probe_file = "./foo";
+ char *carveout = self->carveout;
+ struct perf_event_attr attr;
+ unsigned long type;
+ void *ptr1, *ptr2;
+ int fd;
+
+ fd = open(probe_file, O_RDWR|O_CREAT, 0600);
+ ASSERT_GE(fd, 0);
+
+ ASSERT_EQ(ftruncate(fd, page_size), 0);
+ if (read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) != 0) {
+ SKIP(goto out, "Failed to read uprobe sysfs file, skipping");
+ }
+
+ memset(&attr, 0, attr_sz);
+ attr.size = attr_sz;
+ attr.type = type;
+ attr.config1 = (__u64)(long)probe_file;
+ attr.config2 = 0x0;
+
+ ASSERT_GE(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0), 0);
+
+ ptr1 = mmap(&carveout[page_size], 10 * page_size, PROT_EXEC,
+ MAP_PRIVATE | MAP_FIXED, fd, 0);
+ ASSERT_NE(ptr1, MAP_FAILED);
+
+ ptr2 = mremap(ptr1, page_size, 2 * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr1 + 5 * page_size);
+ ASSERT_NE(ptr2, MAP_FAILED);
+
+ ASSERT_NE(mremap(ptr2, page_size, page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED);
+
+out:
+ close(fd);
+ remove(probe_file);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c
new file mode 100644
index 000000000000..1e3a595fbf01
--- /dev/null
+++ b/tools/testing/selftests/mm/migration.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The main purpose of the tests here is to exercise the migration entry code
+ * paths in the kernel.
+ */
+
+#include "../kselftest_harness.h"
+#include <strings.h>
+#include <pthread.h>
+#include <numa.h>
+#include <numaif.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <time.h>
+
+#define TWOMEG (2<<20)
+#define RUNTIME (20)
+#define MAX_RETRIES 100
+#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+
+FIXTURE(migration)
+{
+ pthread_t *threads;
+ pid_t *pids;
+ int nthreads;
+ int n1;
+ int n2;
+};
+
+FIXTURE_SETUP(migration)
+{
+ int n;
+
+ ASSERT_EQ(numa_available(), 0);
+ self->nthreads = numa_num_task_cpus() - 1;
+ self->n1 = -1;
+ self->n2 = -1;
+
+ for (n = 0; n < numa_max_possible_node(); n++)
+ if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) {
+ if (self->n1 == -1) {
+ self->n1 = n;
+ } else {
+ self->n2 = n;
+ break;
+ }
+ }
+
+ self->threads = malloc(self->nthreads * sizeof(*self->threads));
+ ASSERT_NE(self->threads, NULL);
+ self->pids = malloc(self->nthreads * sizeof(*self->pids));
+ ASSERT_NE(self->pids, NULL);
+};
+
+FIXTURE_TEARDOWN(migration)
+{
+ free(self->threads);
+ free(self->pids);
+}
+
+int migrate(uint64_t *ptr, int n1, int n2)
+{
+ int ret, tmp;
+ int status = 0;
+ struct timespec ts1, ts2;
+ int failures = 0;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &ts1))
+ return -1;
+
+ while (1) {
+ if (clock_gettime(CLOCK_MONOTONIC, &ts2))
+ return -1;
+
+ if (ts2.tv_sec - ts1.tv_sec >= RUNTIME)
+ return 0;
+
+ ret = move_pages(0, 1, (void **) &ptr, &n2, &status,
+ MPOL_MF_MOVE_ALL);
+ if (ret) {
+ if (ret > 0) {
+ /* Migration is best effort; try again */
+ if (++failures < MAX_RETRIES)
+ continue;
+ printf("Didn't migrate %d pages\n", ret);
+ }
+ else
+ perror("Couldn't migrate pages");
+ return -2;
+ }
+ failures = 0;
+ tmp = n2;
+ n2 = n1;
+ n1 = tmp;
+ }
+
+ return 0;
+}
+
+void *access_mem(void *ptr)
+{
+ volatile uint64_t y = 0;
+ volatile uint64_t *x = ptr;
+
+ while (1) {
+ pthread_testcancel();
+ y += *x;
+
+ /* Prevent the compiler from optimizing out the writes to y: */
+ asm volatile("" : "+r" (y));
+ }
+
+ return NULL;
+}
+
+/*
+ * Basic migration entry testing. One thread will move pages back and forth
+ * between nodes whilst other threads try and access them triggering the
+ * migration entry wait paths in the kernel.
+ */
+TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME)
+{
+ uint64_t *ptr;
+ int i;
+
+ if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+ SKIP(return, "Not enough threads or NUMA nodes available");
+
+ ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ memset(ptr, 0xde, TWOMEG);
+ for (i = 0; i < self->nthreads - 1; i++)
+ if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+ perror("Couldn't create thread");
+
+ ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+ for (i = 0; i < self->nthreads - 1; i++)
+ ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+/*
+ * Same as the previous test but with shared memory.
+ */
+TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME)
+{
+ pid_t pid;
+ uint64_t *ptr;
+ int i;
+
+ if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+ SKIP(return, "Not enough threads or NUMA nodes available");
+
+ ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ memset(ptr, 0xde, TWOMEG);
+ for (i = 0; i < self->nthreads - 1; i++) {
+ pid = fork();
+ if (!pid) {
+ prctl(PR_SET_PDEATHSIG, SIGHUP);
+ /* Parent may have died before prctl so check now. */
+ if (getppid() == 1)
+ kill(getpid(), SIGHUP);
+ access_mem(ptr);
+ } else {
+ self->pids[i] = pid;
+ }
+ }
+
+ ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+ for (i = 0; i < self->nthreads - 1; i++)
+ ASSERT_EQ(kill(self->pids[i], SIGTERM), 0);
+}
+
+/*
+ * Tests the pmd migration entry paths.
+ */
+TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME)
+{
+ uint64_t *ptr;
+ int i;
+
+ if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+ SKIP(return, "Not enough threads or NUMA nodes available");
+
+ ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
+ ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
+ memset(ptr, 0xde, TWOMEG);
+ for (i = 0; i < self->nthreads - 1; i++)
+ if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+ perror("Couldn't create thread");
+
+ ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+ for (i = 0; i < self->nthreads - 1; i++)
+ ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+/*
+ * migration test with shared anon THP page
+ */
+
+TEST_F_TIMEOUT(migration, shared_anon_thp, 2*RUNTIME)
+{
+ pid_t pid;
+ uint64_t *ptr;
+ int i;
+
+ if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+ SKIP(return, "Not enough threads or NUMA nodes available");
+
+ ptr = mmap(NULL, 2 * TWOMEG, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
+ ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
+
+ memset(ptr, 0xde, TWOMEG);
+ for (i = 0; i < self->nthreads - 1; i++) {
+ pid = fork();
+ if (!pid) {
+ prctl(PR_SET_PDEATHSIG, SIGHUP);
+ /* Parent may have died before prctl so check now. */
+ if (getppid() == 1)
+ kill(getpid(), SIGHUP);
+ access_mem(ptr);
+ } else {
+ self->pids[i] = pid;
+ }
+ }
+
+ ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+ for (i = 0; i < self->nthreads - 1; i++)
+ ASSERT_EQ(kill(self->pids[i], SIGTERM), 0);
+}
+
+/*
+ * migration test with private anon hugetlb page
+ */
+TEST_F_TIMEOUT(migration, private_anon_htlb, 2*RUNTIME)
+{
+ uint64_t *ptr;
+ int i;
+
+ if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+ SKIP(return, "Not enough threads or NUMA nodes available");
+
+ ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ memset(ptr, 0xde, TWOMEG);
+ for (i = 0; i < self->nthreads - 1; i++)
+ if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+ perror("Couldn't create thread");
+
+ ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+ for (i = 0; i < self->nthreads - 1; i++)
+ ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+/*
+ * migration test with shared anon hugetlb page
+ */
+TEST_F_TIMEOUT(migration, shared_anon_htlb, 2*RUNTIME)
+{
+ pid_t pid;
+ uint64_t *ptr;
+ int i;
+
+ if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+ SKIP(return, "Not enough threads or NUMA nodes available");
+
+ ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+ ASSERT_NE(ptr, MAP_FAILED);
+
+ memset(ptr, 0xde, TWOMEG);
+ for (i = 0; i < self->nthreads - 1; i++) {
+ pid = fork();
+ if (!pid) {
+ prctl(PR_SET_PDEATHSIG, SIGHUP);
+ /* Parent may have died before prctl so check now. */
+ if (getppid() == 1)
+ kill(getpid(), SIGHUP);
+ access_mem(ptr);
+ } else {
+ self->pids[i] = pid;
+ }
+ }
+
+ ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+ for (i = 0; i < self->nthreads - 1; i++)
+ ASSERT_EQ(kill(self->pids[i], SIGTERM), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c
new file mode 100644
index 000000000000..09feeb453646
--- /dev/null
+++ b/tools/testing/selftests/mm/mkdirty.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test handling of code that might set PTE/PMD dirty in read-only VMAs.
+ * Setting a PTE/PMD dirty must not accidentally set the PTE/PMD writable.
+ *
+ * Copyright 2023, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <setjmp.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+#include <linux/mempolicy.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+static size_t pagesize;
+static size_t thpsize;
+static int mem_fd;
+static int pagemap_fd;
+static sigjmp_buf env;
+
+static void signal_handler(int sig)
+{
+ if (sig == SIGSEGV)
+ siglongjmp(env, 1);
+ siglongjmp(env, 2);
+}
+
+static void do_test_write_sigsegv(char *mem)
+{
+ char orig = *mem;
+ int ret;
+
+ if (signal(SIGSEGV, signal_handler) == SIG_ERR) {
+ ksft_test_result_fail("signal() failed\n");
+ return;
+ }
+
+ ret = sigsetjmp(env, 1);
+ if (!ret)
+ *mem = orig + 1;
+
+ if (signal(SIGSEGV, SIG_DFL) == SIG_ERR)
+ ksft_test_result_fail("signal() failed\n");
+
+ ksft_test_result(ret == 1 && *mem == orig,
+ "SIGSEGV generated, page not modified\n");
+}
+
+static char *mmap_thp_range(int prot, char **_mmap_mem, size_t *_mmap_size)
+{
+ const size_t mmap_size = 2 * thpsize;
+ char *mem, *mmap_mem;
+
+ mmap_mem = mmap(NULL, mmap_size, prot, MAP_PRIVATE|MAP_ANON,
+ -1, 0);
+ if (mmap_mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return MAP_FAILED;
+ }
+ mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+ if (madvise(mem, thpsize, MADV_HUGEPAGE)) {
+ ksft_test_result_skip("MADV_HUGEPAGE failed\n");
+ munmap(mmap_mem, mmap_size);
+ return MAP_FAILED;
+ }
+
+ *_mmap_mem = mmap_mem;
+ *_mmap_size = mmap_size;
+ return mem;
+}
+
+static void test_ptrace_write(void)
+{
+ char data = 1;
+ char *mem;
+ int ret;
+
+ ksft_print_msg("[INFO] PTRACE write access\n");
+
+ mem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ /* Fault in the shared zeropage. */
+ if (*mem != 0) {
+ ksft_test_result_fail("Memory not zero\n");
+ goto munmap;
+ }
+
+ /*
+ * Unshare the page (populating a fresh anon page that might be set
+ * dirty in the PTE) in the read-only VMA using ptrace (FOLL_FORCE).
+ */
+ lseek(mem_fd, (uintptr_t) mem, SEEK_SET);
+ ret = write(mem_fd, &data, 1);
+ if (ret != 1 || *mem != data) {
+ ksft_test_result_fail("write() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mem, pagesize);
+}
+
+static void test_ptrace_write_thp(void)
+{
+ char *mem, *mmap_mem;
+ size_t mmap_size;
+ char data = 1;
+ int ret;
+
+ ksft_print_msg("[INFO] PTRACE write access to THP\n");
+
+ mem = mmap_thp_range(PROT_READ, &mmap_mem, &mmap_size);
+ if (mem == MAP_FAILED)
+ return;
+
+ /*
+ * Write to the first subpage in the read-only VMA using
+ * ptrace(FOLL_FORCE), eventually placing a fresh THP that is marked
+ * dirty in the PMD.
+ */
+ lseek(mem_fd, (uintptr_t) mem, SEEK_SET);
+ ret = write(mem_fd, &data, 1);
+ if (ret != 1 || *mem != data) {
+ ksft_test_result_fail("write() failed\n");
+ goto munmap;
+ }
+
+ /* MM populated a THP if we got the last subpage populated as well. */
+ if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mmap_mem, mmap_size);
+}
+
+static void test_page_migration(void)
+{
+ char *mem;
+
+ ksft_print_msg("[INFO] Page migration\n");
+
+ mem = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON,
+ -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ /* Populate a fresh page and dirty it. */
+ memset(mem, 1, pagesize);
+ if (mprotect(mem, pagesize, PROT_READ)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ /* Trigger page migration. Might not be available or fail. */
+ if (syscall(__NR_mbind, mem, pagesize, MPOL_LOCAL, NULL, 0x7fful,
+ MPOL_MF_MOVE)) {
+ ksft_test_result_skip("mbind() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mem, pagesize);
+}
+
+static void test_page_migration_thp(void)
+{
+ char *mem, *mmap_mem;
+ size_t mmap_size;
+
+ ksft_print_msg("[INFO] Page migration of THP\n");
+
+ mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size);
+ if (mem == MAP_FAILED)
+ return;
+
+ /*
+ * Write to the first page, which might populate a fresh anon THP
+ * and dirty it.
+ */
+ memset(mem, 1, pagesize);
+ if (mprotect(mem, thpsize, PROT_READ)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ /* MM populated a THP if we got the last subpage populated as well. */
+ if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+
+ /* Trigger page migration. Might not be available or fail. */
+ if (syscall(__NR_mbind, mem, thpsize, MPOL_LOCAL, NULL, 0x7fful,
+ MPOL_MF_MOVE)) {
+ ksft_test_result_skip("mbind() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mmap_mem, mmap_size);
+}
+
+static void test_pte_mapped_thp(void)
+{
+ char *mem, *mmap_mem;
+ size_t mmap_size;
+
+ ksft_print_msg("[INFO] PTE-mapping a THP\n");
+
+ mem = mmap_thp_range(PROT_READ|PROT_WRITE, &mmap_mem, &mmap_size);
+ if (mem == MAP_FAILED)
+ return;
+
+ /*
+ * Write to the first page, which might populate a fresh anon THP
+ * and dirty it.
+ */
+ memset(mem, 1, pagesize);
+ if (mprotect(mem, thpsize, PROT_READ)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ /* MM populated a THP if we got the last subpage populated as well. */
+ if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+
+ /* Trigger PTE-mapping the THP by mprotect'ing the last subpage. */
+ if (mprotect(mem + thpsize - pagesize, pagesize,
+ PROT_READ|PROT_WRITE)) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+
+ do_test_write_sigsegv(mem);
+munmap:
+ munmap(mmap_mem, mmap_size);
+}
+
+#ifdef __NR_userfaultfd
+static void test_uffdio_copy(void)
+{
+ struct uffdio_register uffdio_register;
+ struct uffdio_copy uffdio_copy;
+ struct uffdio_api uffdio_api;
+ char *dst, *src;
+ int uffd;
+
+ ksft_print_msg("[INFO] UFFDIO_COPY\n");
+
+ src = malloc(pagesize);
+ memset(src, 1, pagesize);
+ dst = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (dst == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ free(src);
+ return;
+ }
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ ksft_test_result_skip("__NR_userfaultfd failed\n");
+ goto munmap;
+ }
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
+ ksft_test_result_fail("UFFDIO_API failed\n");
+ goto close_uffd;
+ }
+
+ uffdio_register.range.start = (unsigned long) dst;
+ uffdio_register.range.len = pagesize;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+ ksft_test_result_fail("UFFDIO_REGISTER failed\n");
+ goto close_uffd;
+ }
+
+ /* Place a page in a read-only VMA, which might set the PTE dirty. */
+ uffdio_copy.dst = (unsigned long) dst;
+ uffdio_copy.src = (unsigned long) src;
+ uffdio_copy.len = pagesize;
+ uffdio_copy.mode = 0;
+ if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) {
+ ksft_test_result_fail("UFFDIO_COPY failed\n");
+ goto close_uffd;
+ }
+
+ do_test_write_sigsegv(dst);
+close_uffd:
+ close(uffd);
+munmap:
+ munmap(dst, pagesize);
+ free(src);
+}
+#endif /* __NR_userfaultfd */
+
+int main(void)
+{
+ int err, tests = 2;
+
+ pagesize = getpagesize();
+ thpsize = read_pmd_pagesize();
+ if (thpsize) {
+ ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
+ thpsize / 1024);
+ tests += 3;
+ }
+#ifdef __NR_userfaultfd
+ tests += 1;
+#endif /* __NR_userfaultfd */
+
+ ksft_print_header();
+ ksft_set_plan(tests);
+
+ mem_fd = open("/proc/self/mem", O_RDWR);
+ if (mem_fd < 0)
+ ksft_exit_fail_msg("opening /proc/self/mem failed\n");
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("opening /proc/self/pagemap failed\n");
+
+ /*
+ * On some ptrace(FOLL_FORCE) write access via /proc/self/mem in
+ * read-only VMAs, the kernel may set the PTE/PMD dirty.
+ */
+ test_ptrace_write();
+ if (thpsize)
+ test_ptrace_write_thp();
+ /*
+ * On page migration, the kernel may set the PTE/PMD dirty when
+ * remapping the page.
+ */
+ test_page_migration();
+ if (thpsize)
+ test_page_migration_thp();
+ /* PTE-mapping a THP might propagate the dirty PMD bit to the PTEs. */
+ if (thpsize)
+ test_pte_mapped_thp();
+ /* Placing a fresh page via userfaultfd may set the PTE dirty. */
+#ifdef __NR_userfaultfd
+ test_uffdio_copy();
+#endif /* __NR_userfaultfd */
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c
new file mode 100644
index 000000000000..b8d7e966f44c
--- /dev/null
+++ b/tools/testing/selftests/mm/mlock-random-test.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * It tests the mlock/mlock2() when they are invoked
+ * on randomly memory region.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "../kselftest.h"
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
+int set_cap_limits(rlim_t max)
+{
+ struct rlimit new;
+ cap_t cap = cap_init();
+
+ new.rlim_cur = max;
+ new.rlim_max = max;
+ if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+ ksft_perror("setrlimit() returns error\n");
+ return -1;
+ }
+
+ /* drop capabilities including CAP_IPC_LOCK */
+ if (cap_set_proc(cap)) {
+ ksft_perror("cap_set_proc() returns error\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int get_proc_locked_vm_size(void)
+{
+ FILE *f;
+ int ret = -1;
+ char line[1024] = {0};
+ unsigned long lock_size = 0;
+
+ f = fopen("/proc/self/status", "r");
+ if (!f)
+ ksft_exit_fail_msg("fopen: %s\n", strerror(errno));
+
+ while (fgets(line, 1024, f)) {
+ if (strstr(line, "VmLck")) {
+ ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+ if (ret <= 0) {
+ fclose(f);
+ ksft_exit_fail_msg("sscanf() on VmLck error: %s: %d\n",
+ line, ret);
+ }
+ fclose(f);
+ return (int)(lock_size << 10);
+ }
+ }
+
+ fclose(f);
+ ksft_exit_fail_msg("cannot parse VmLck in /proc/self/status: %s\n", strerror(errno));
+ return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including input
+ * address from proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size(in bytes) is returned.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+ FILE *smaps;
+ char *line;
+ unsigned long mmupage_size = 0;
+ size_t size;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps)
+ ksft_exit_fail_msg("Unable to parse /proc/self/smaps\n");
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, "MMUPageSize")) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ /* found the MMUPageSize of this section */
+ if (sscanf(line, "MMUPageSize: %8lu kB", &mmupage_size) < 1)
+ ksft_exit_fail_msg("Unable to parse smaps entry for Size:%s\n",
+ line);
+
+ }
+ free(line);
+ if (smaps)
+ fclose(smaps);
+ return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on provided memory chunk.
+ * It expects the mlock/mlock2() to be successful (within rlimit)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will choose start/len randomly to perform mlock/mlock2
+ * [start, start + len] memory range. The range is within range
+ * of the allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+static void test_mlock_within_limit(char *p, int alloc_size)
+{
+ int i;
+ int ret = 0;
+ int locked_vm_size = 0;
+ struct rlimit cur;
+ int page_size = 0;
+
+ getrlimit(RLIMIT_MEMLOCK, &cur);
+ if (cur.rlim_cur < alloc_size)
+ ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
+
+ srand(time(NULL));
+ for (i = 0; i < TEST_LOOP; i++) {
+ /*
+ * - choose mlock/mlock2 randomly
+ * - choose lock_size randomly but lock_size < alloc_size
+ * - choose start_offset randomly but p+start_offset+lock_size
+ * < p+alloc_size
+ */
+ int is_mlock = !!(rand() % 2);
+ int lock_size = rand() % alloc_size;
+ int start_offset = rand() % (alloc_size - lock_size);
+
+ if (is_mlock)
+ ret = mlock(p + start_offset, lock_size);
+ else
+ ret = mlock2_(p + start_offset, lock_size,
+ MLOCK_ONFAULT);
+
+ if (ret)
+ ksft_exit_fail_msg("%s() failure (%s) at |%p(%d)| mlock:|%p(%d)|\n",
+ is_mlock ? "mlock" : "mlock2",
+ strerror(errno), p, alloc_size,
+ p + start_offset, lock_size);
+ }
+
+ /*
+ * Check VmLck left by the tests.
+ */
+ locked_vm_size = get_proc_locked_vm_size();
+ page_size = get_proc_page_size((unsigned long)p);
+
+ if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size)
+ ksft_exit_fail_msg("%s left VmLck:%d on %d chunk\n",
+ __func__, locked_vm_size, alloc_size);
+
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+
+/*
+ * We expect the mlock/mlock2() to be fail (outof limitation)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will randomly choose start/len and perform mlock/mlock2
+ * on [start, start+len] range.
+ *
+ * The memory region size alloc_size is above the rlimit.
+ * And the len to be locked is higher than rlimit.
+ * So we always expect a failure of mlock/mlock2.
+ * No locked page number should be increased as a side effect.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+static void test_mlock_outof_limit(char *p, int alloc_size)
+{
+ int i;
+ int ret = 0;
+ int locked_vm_size = 0, old_locked_vm_size = 0;
+ struct rlimit cur;
+
+ getrlimit(RLIMIT_MEMLOCK, &cur);
+ if (cur.rlim_cur >= alloc_size)
+ ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n",
+ alloc_size, (unsigned int)cur.rlim_cur);
+
+ old_locked_vm_size = get_proc_locked_vm_size();
+ srand(time(NULL));
+ for (i = 0; i < TEST_LOOP; i++) {
+ int is_mlock = !!(rand() % 2);
+ int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+ + cur.rlim_cur;
+ int start_offset = rand() % (alloc_size - lock_size);
+
+ if (is_mlock)
+ ret = mlock(p + start_offset, lock_size);
+ else
+ ret = mlock2_(p + start_offset, lock_size,
+ MLOCK_ONFAULT);
+ if (ret == 0)
+ ksft_exit_fail_msg("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+ is_mlock ? "mlock" : "mlock2",
+ p, alloc_size, p + start_offset, lock_size);
+ }
+
+ locked_vm_size = get_proc_locked_vm_size();
+ if (locked_vm_size != old_locked_vm_size)
+ ksft_exit_fail_msg("tests leads to new mlocked page: old[%d], new[%d]\n",
+ old_locked_vm_size,
+ locked_vm_size);
+
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+int main(int argc, char **argv)
+{
+ char *p = NULL;
+
+ ksft_print_header();
+
+ if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+ ksft_finished();
+
+ ksft_set_plan(2);
+
+ p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+ if (p == NULL)
+ ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno));
+
+ test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+ munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+ free(p);
+
+ p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+ if (p == NULL)
+ ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno));
+
+ test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+ munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+ free(p);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
new file mode 100644
index 000000000000..3e90ff37e336
--- /dev/null
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "../kselftest.h"
+#include "mlock2.h"
+
+struct vm_boundaries {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+ FILE *file;
+ int ret = 1;
+ char line[1024] = {0};
+ unsigned long start;
+ unsigned long end;
+
+ if (!area)
+ return ret;
+
+ file = fopen("/proc/self/maps", "r");
+ if (!file) {
+ perror("fopen");
+ return ret;
+ }
+
+ memset(area, 0, sizeof(struct vm_boundaries));
+
+ while(fgets(line, 1024, file)) {
+ if (sscanf(line, "%lx-%lx", &start, &end) != 2) {
+ ksft_print_msg("cannot parse /proc/self/maps\n");
+ goto out;
+ }
+
+ if (start <= addr && end > addr) {
+ area->start = start;
+ area->end = end;
+ ret = 0;
+ goto out;
+ }
+ }
+out:
+ fclose(file);
+ return ret;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+ char *line = NULL;
+ char *flags;
+ size_t size = 0;
+ bool ret = false;
+ FILE *smaps;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ ksft_print_msg("Unable to parse /proc/self/smaps\n");
+ goto out;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, VMFLAGS)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ flags = line + strlen(VMFLAGS);
+ ret = (strstr(flags, vmflag) != NULL);
+ goto out;
+ }
+
+out:
+ free(line);
+ fclose(smaps);
+ return ret;
+}
+
+#define SIZE "Size:"
+#define RSS "Rss:"
+#define LOCKED "lo"
+
+static unsigned long get_value_for_name(unsigned long addr, const char *name)
+{
+ char *line = NULL;
+ size_t size = 0;
+ char *value_ptr;
+ FILE *smaps = NULL;
+ unsigned long value = -1UL;
+
+ smaps = seek_to_smaps_entry(addr);
+ if (!smaps) {
+ ksft_print_msg("Unable to parse /proc/self/smaps\n");
+ goto out;
+ }
+
+ while (getline(&line, &size, smaps) > 0) {
+ if (!strstr(line, name)) {
+ free(line);
+ line = NULL;
+ size = 0;
+ continue;
+ }
+
+ value_ptr = line + strlen(name);
+ if (sscanf(value_ptr, "%lu kB", &value) < 1) {
+ ksft_print_msg("Unable to parse smaps entry for Size\n");
+ goto out;
+ }
+ break;
+ }
+
+out:
+ if (smaps)
+ fclose(smaps);
+ free(line);
+ return value;
+}
+
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+ bool locked;
+ unsigned long vma_size, vma_rss;
+
+ locked = is_vmflag_set(addr, LOCKED);
+ if (!locked)
+ return false;
+
+ vma_size = get_value_for_name(addr, SIZE);
+ vma_rss = get_value_for_name(addr, RSS);
+
+ /* only one page is faulted in */
+ return (vma_rss < vma_size);
+}
+
+#define PRESENT_BIT 0x8000000000000000ULL
+#define PFN_MASK 0x007FFFFFFFFFFFFFULL
+#define UNEVICTABLE_BIT (1UL << 18)
+
+static int lock_check(unsigned long addr)
+{
+ bool locked;
+ unsigned long vma_size, vma_rss;
+
+ locked = is_vmflag_set(addr, LOCKED);
+ if (!locked)
+ return false;
+
+ vma_size = get_value_for_name(addr, SIZE);
+ vma_rss = get_value_for_name(addr, RSS);
+
+ return (vma_rss == vma_size);
+}
+
+static int unlock_lock_check(char *map)
+{
+ if (is_vmflag_set((unsigned long)map, LOCKED)) {
+ ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_mlock_lock(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
+ if (mlock2_(map, 2 * page_size, 0)) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlock2(0): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__);
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__);
+ munmap(map, 2 * page_size);
+}
+
+static int onfault_check(char *map)
+{
+ *map = 'a';
+ if (!is_vma_lock_on_fault((unsigned long)map)) {
+ ksft_print_msg("VMA is not marked for lock on fault\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static int unlock_onfault_check(char *map)
+{
+ unsigned long page_size = getpagesize();
+
+ if (is_vma_lock_on_fault((unsigned long)map) ||
+ is_vma_lock_on_fault((unsigned long)map + page_size)) {
+ ksft_print_msg("VMA is still lock on fault after unlock\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_mlock_onfault(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlock2(MLOCK_ONFAULT): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__);
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(!unlock_onfault_check(map), "VMA open lock after fault\n");
+ munmap(map, 2 * page_size);
+}
+
+static void test_lock_onfault_of_present(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
+ *map = 'a';
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ munmap(map, 2 * page_size);
+ ksft_test_result_fail("mlock2(MLOCK_ONFAULT) error: %s", strerror(errno));
+ }
+
+ ksft_test_result(is_vma_lock_on_fault((unsigned long)map) ||
+ is_vma_lock_on_fault((unsigned long)map + page_size),
+ "VMA with present pages is not marked lock on fault\n");
+ munmap(map, 2 * page_size);
+}
+
+static void test_munlockall0(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s\n", strerror(errno));
+
+ if (mlockall(MCL_CURRENT)) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlockall(MCL_CURRENT): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(lock_check((unsigned long)map), "%s: Locked memory area\n", __func__);
+
+ if (munlockall()) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
+ munmap(map, 2 * page_size);
+}
+
+static void test_munlockall1(void)
+{
+ char *map;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_ONFAULT): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__);
+
+ if (munlockall()) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(!unlock_onfault_check(map), "%s: Unlocked\n", __func__);
+
+ if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno));
+ }
+
+ ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__);
+
+ if (munlockall()) {
+ munmap(map, 2 * page_size);
+ ksft_exit_fail_msg("munlockall() %s\n", strerror(errno));
+ }
+
+ ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__);
+ munmap(map, 2 * page_size);
+}
+
+static void test_vma_management(bool call_mlock)
+{
+ void *map;
+ unsigned long page_size = getpagesize();
+ struct vm_boundaries page1;
+ struct vm_boundaries page2;
+ struct vm_boundaries page3;
+
+ map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
+ if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("mlock error: %s", strerror(errno));
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("couldn't find mapping in /proc/self/maps");
+ }
+
+ /*
+ * Before we unlock a portion, we need to that all three pages are in
+ * the same VMA. If they are not we abort this test (Note that this is
+ * not a failure)
+ */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("VMAs are not merged to start, aborting test");
+ }
+
+ if (munlock(map + page_size, page_size)) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("munlock(): %s", strerror(errno));
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("couldn't find mapping in /proc/self/maps");
+ }
+
+ /* All three VMAs should be different */
+ if (page1.start == page2.start || page2.start == page3.start) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("failed to split VMA for munlock");
+ }
+
+ /* Now unlock the first and third page and check the VMAs again */
+ if (munlock(map, page_size * 3)) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("munlock(): %s", strerror(errno));
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("couldn't find mapping in /proc/self/maps");
+ }
+
+ /* Now all three VMAs should be the same */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ munmap(map, 3 * page_size);
+ ksft_test_result_fail("failed to merge VMAs after munlock");
+ }
+
+ ksft_test_result_pass("%s call_mlock %d\n", __func__, call_mlock);
+ munmap(map, 3 * page_size);
+}
+
+static void test_mlockall(void)
+{
+ if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE))
+ ksft_exit_fail_msg("mlockall failed: %s\n", strerror(errno));
+
+ test_vma_management(false);
+ munlockall();
+}
+
+int main(int argc, char **argv)
+{
+ int ret, size = 3 * getpagesize();
+ void *map;
+
+ ksft_print_header();
+
+ map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap error: %s", strerror(errno));
+
+ ret = mlock2_(map, size, MLOCK_ONFAULT);
+ if (ret && errno == ENOSYS)
+ ksft_finished();
+
+ munmap(map, size);
+
+ ksft_set_plan(13);
+
+ test_mlock_lock();
+ test_mlock_onfault();
+ test_munlockall0();
+ test_munlockall1();
+ test_lock_onfault_of_present();
+ test_vma_management(true);
+ test_mlockall();
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h
new file mode 100644
index 000000000000..81e77fa41901
--- /dev/null
+++ b/tools/testing/selftests/mm/mlock2.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+ int ret = syscall(__NR_mlock2, start, len, flags);
+
+ if (ret) {
+ errno = ret;
+ return -1;
+ }
+ return 0;
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+ FILE *file;
+ char *line = NULL;
+ size_t size = 0;
+ unsigned long start, end;
+ char perms[5];
+ unsigned long offset;
+ char dev[32];
+ unsigned long inode;
+ char path[BUFSIZ];
+
+ file = fopen("/proc/self/smaps", "r");
+ if (!file)
+ ksft_exit_fail_msg("fopen smaps: %s\n", strerror(errno));
+
+ while (getline(&line, &size, file) > 0) {
+ if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+ &start, &end, perms, &offset, dev, &inode, path) < 6)
+ goto next;
+
+ if (start <= addr && addr < end)
+ goto out;
+
+next:
+ free(line);
+ line = NULL;
+ size = 0;
+ }
+
+ fclose(file);
+ file = NULL;
+
+out:
+ free(line);
+ return file;
+}
diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
new file mode 100644
index 000000000000..100370a7111d
--- /dev/null
+++ b/tools/testing/selftests/mm/mrelease_test.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2022 Google LLC
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <asm-generic/unistd.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#define MB(x) (x << 20)
+#define MAX_SIZE_MB 1024
+
+static int alloc_noexit(unsigned long nr_pages, int pipefd)
+{
+ int ppid = getppid();
+ int timeout = 10; /* 10sec timeout to get killed */
+ unsigned long i;
+ char *buf;
+
+ buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, 0, 0);
+ if (buf == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed, halting the test: %s\n", strerror(errno));
+
+ for (i = 0; i < nr_pages; i++)
+ *((unsigned long *)(buf + (i * psize()))) = i;
+
+ /* Signal the parent that the child is ready */
+ if (write(pipefd, "", 1) < 0)
+ ksft_exit_fail_msg("write: %s\n", strerror(errno));
+
+ /* Wait to be killed (when reparenting happens) */
+ while (getppid() == ppid && timeout > 0) {
+ sleep(1);
+ timeout--;
+ }
+
+ munmap(buf, nr_pages * psize());
+
+ return (timeout > 0) ? KSFT_PASS : KSFT_FAIL;
+}
+
+/* The process_mrelease calls in this test are expected to fail */
+static void run_negative_tests(int pidfd)
+{
+ /* Test invalid flags. Expect to fail with EINVAL error code. */
+ if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) ||
+ errno != EINVAL) {
+ ksft_exit_fail_msg("process_mrelease with wrong flags: %s\n", strerror(errno));
+ }
+ /*
+ * Test reaping while process is alive with no pending SIGKILL.
+ * Expect to fail with EINVAL error code.
+ */
+ if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL)
+ ksft_exit_fail_msg("process_mrelease on a live process: %s\n", strerror(errno));
+}
+
+static int child_main(int pipefd[], size_t size)
+{
+ int res;
+
+ /* Allocate and fault-in memory and wait to be killed */
+ close(pipefd[0]);
+ res = alloc_noexit(MB(size) / psize(), pipefd[1]);
+ close(pipefd[1]);
+ return res;
+}
+
+int main(void)
+{
+ int pipefd[2], pidfd;
+ bool success, retry;
+ size_t size;
+ pid_t pid;
+ char byte;
+ int res;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ /* Test a wrong pidfd */
+ if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) {
+ if (errno == ENOSYS) {
+ ksft_test_result_skip("process_mrelease not implemented\n");
+ ksft_finished();
+ } else {
+ ksft_exit_fail_msg("process_mrelease with wrong pidfd: %s",
+ strerror(errno));
+ }
+ }
+
+ /* Start the test with 1MB child memory allocation */
+ size = 1;
+retry:
+ /*
+ * Pipe for the child to signal when it's done allocating
+ * memory
+ */
+ if (pipe(pipefd))
+ ksft_exit_fail_msg("pipe: %s\n", strerror(errno));
+
+ pid = fork();
+ if (pid < 0) {
+ close(pipefd[0]);
+ close(pipefd[1]);
+ ksft_exit_fail_msg("fork: %s\n", strerror(errno));
+ }
+
+ if (pid == 0) {
+ /* Child main routine */
+ res = child_main(pipefd, size);
+ exit(res);
+ }
+
+ /*
+ * Parent main routine:
+ * Wait for the child to finish allocations, then kill and reap
+ */
+ close(pipefd[1]);
+ /* Block until the child is ready */
+ res = read(pipefd[0], &byte, 1);
+ close(pipefd[0]);
+ if (res < 0) {
+ if (!kill(pid, SIGKILL))
+ waitpid(pid, NULL, 0);
+ ksft_exit_fail_msg("read: %s\n", strerror(errno));
+ }
+
+ pidfd = syscall(__NR_pidfd_open, pid, 0);
+ if (pidfd < 0) {
+ if (!kill(pid, SIGKILL))
+ waitpid(pid, NULL, 0);
+ ksft_exit_fail_msg("pidfd_open: %s\n", strerror(errno));
+ }
+
+ /* Run negative tests which require a live child */
+ run_negative_tests(pidfd);
+
+ if (kill(pid, SIGKILL))
+ ksft_exit_fail_msg("kill: %s\n", strerror(errno));
+
+ success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
+ if (!success) {
+ /*
+ * If we failed to reap because the child exited too soon,
+ * before we could call process_mrelease. Double child's memory
+ * which causes it to spend more time on cleanup and increases
+ * our chances of reaping its memory before it exits.
+ * Retry until we succeed or reach MAX_SIZE_MB.
+ */
+ if (errno == ESRCH) {
+ retry = (size <= MAX_SIZE_MB);
+ } else {
+ waitpid(pid, NULL, 0);
+ ksft_exit_fail_msg("process_mrelease: %s\n", strerror(errno));
+ }
+ }
+
+ /* Cleanup to prevent zombies */
+ if (waitpid(pid, NULL, 0) < 0)
+ ksft_exit_fail_msg("waitpid: %s\n", strerror(errno));
+
+ close(pidfd);
+
+ if (!success) {
+ if (retry) {
+ size *= 2;
+ goto retry;
+ }
+ ksft_exit_fail_msg("All process_mrelease attempts failed!\n");
+ }
+
+ ksft_test_result_pass("Success reaping a child with %zuMB of memory allocations\n",
+ size);
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c
new file mode 100644
index 000000000000..1d75084b9ca5
--- /dev/null
+++ b/tools/testing/selftests/mm/mremap_dontunmap.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Tests for mremap w/ MREMAP_DONTUNMAP.
+ *
+ * Copyright 2020, Brian Geffon <bgeffon@google.com>
+ */
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+unsigned long page_size;
+char *page_buffer;
+
+static void dump_maps(void)
+{
+ char cmd[32];
+
+ snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+ system(cmd);
+}
+
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ dump_maps(); \
+ ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \
+ __func__, __LINE__, (description), \
+ strerror(errno)); \
+ } \
+ } while (0)
+
+// Try a simple operation for to "test" for kernel support this prevents
+// reporting tests as failed when it's run on an older kernel.
+static int kernel_support_for_mremap_dontunmap()
+{
+ int ret = 0;
+ unsigned long num_pages = 1;
+ void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+ // This simple remap should only fail if MREMAP_DONTUNMAP isn't
+ // supported.
+ void *dest_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
+ if (dest_mapping == MAP_FAILED) {
+ ret = errno;
+ } else {
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ }
+
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+ return ret;
+}
+
+// This helper will just validate that an entire mapping contains the expected
+// byte.
+static int check_region_contains_byte(void *addr, unsigned long size, char byte)
+{
+ BUG_ON(size & (page_size - 1),
+ "check_region_contains_byte expects page multiples");
+ BUG_ON((unsigned long)addr & (page_size - 1),
+ "check_region_contains_byte expects page alignment");
+
+ memset(page_buffer, byte, page_size);
+
+ unsigned long num_pages = size / page_size;
+ unsigned long i;
+
+ // Compare each page checking that it contains our expected byte.
+ for (i = 0; i < num_pages; ++i) {
+ int ret =
+ memcmp(addr + (i * page_size), page_buffer, page_size);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving
+// the source mapping mapped.
+static void mremap_dontunmap_simple()
+{
+ unsigned long num_pages = 5;
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // Try to just move the whole mapping anywhere (not fixed).
+ void *dest_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+ BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+ // Validate that the pages have been moved, we know they were moved if
+ // the dest_mapping contains a's.
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 0) != 0,
+ "source should have no ptes");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected.
+static void mremap_dontunmap_simple_shmem()
+{
+ unsigned long num_pages = 5;
+
+ int mem_fd = memfd_create("memfd", MFD_CLOEXEC);
+ BUG_ON(mem_fd < 0, "memfd_create");
+
+ BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0,
+ "ftruncate");
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_FILE | MAP_SHARED, mem_fd, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+ BUG_ON(close(mem_fd) < 0, "close");
+
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // Try to just move the whole mapping anywhere (not fixed).
+ void *dest_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+ if (dest_mapping == MAP_FAILED && errno == EINVAL) {
+ // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem.
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+ return;
+ }
+
+ BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+ // Validate that the pages have been moved, we know they were moved if
+ // the dest_mapping contains a's.
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+
+ // Because the region is backed by shmem, we will actually see the same
+ // memory at the source location still.
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 'a') != 0,
+ "source should have no ptes");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+// This test validates MREMAP_DONTUNMAP will move page tables to a specific
+// destination using MREMAP_FIXED, also while validating that the source
+// remains intact.
+static void mremap_dontunmap_simple_fixed()
+{
+ unsigned long num_pages = 5;
+
+ // Since we want to guarantee that we can remap to a point, we will
+ // create a mapping up front.
+ void *dest_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+ memset(dest_mapping, 'X', num_pages * page_size);
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ void *remapped_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+ dest_mapping);
+ BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+ BUG_ON(remapped_mapping != dest_mapping,
+ "mremap should have placed the remapped mapping at dest_mapping");
+
+ // The dest mapping will have been unmap by mremap so we expect the Xs
+ // to be gone and replaced with a's.
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+
+ // And the source mapping will have had its ptes dropped.
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 0) != 0,
+ "source should have no ptes");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+// This test validates that we can MREMAP_DONTUNMAP for a portion of an
+// existing mapping.
+static void mremap_dontunmap_partial_mapping()
+{
+ /*
+ * source mapping:
+ * --------------
+ * | aaaaaaaaaa |
+ * --------------
+ * to become:
+ * --------------
+ * | aaaaa00000 |
+ * --------------
+ * With the destination mapping containing 5 pages of As.
+ * ---------
+ * | aaaaa |
+ * ---------
+ */
+ unsigned long num_pages = 10;
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // We will grab the last 5 pages of the source and move them.
+ void *dest_mapping =
+ mremap(source_mapping + (5 * page_size), 5 * page_size,
+ 5 * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+ BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+ // We expect the first 5 pages of the source to contain a's and the
+ // final 5 pages to contain zeros.
+ BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
+ 0, "first 5 pages of source should have original pages");
+ BUG_ON(check_region_contains_byte
+ (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
+ "final 5 pages of source should have no ptes");
+
+ // Finally we expect the destination to have 5 pages worth of a's.
+ BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
+ 0, "dest mapping should contain ptes from the source");
+
+ BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+// This test validates that we can remap over only a portion of a mapping.
+static void mremap_dontunmap_partial_mapping_overwrite(void)
+{
+ /*
+ * source mapping:
+ * ---------
+ * |aaaaa|
+ * ---------
+ * dest mapping initially:
+ * -----------
+ * |XXXXXXXXXX|
+ * ------------
+ * Source to become:
+ * ---------
+ * |00000|
+ * ---------
+ * With the destination mapping containing 5 pages of As.
+ * ------------
+ * |aaaaaXXXXX|
+ * ------------
+ */
+ void *source_mapping =
+ mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', 5 * page_size);
+
+ void *dest_mapping =
+ mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+ memset(dest_mapping, 'X', 10 * page_size);
+
+ // We will grab the last 5 pages of the source and move them.
+ void *remapped_mapping =
+ mremap(source_mapping, 5 * page_size,
+ 5 * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
+ BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+ BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
+
+ BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
+ 0, "first 5 pages of source should have no ptes");
+
+ // Finally we expect the destination to have 5 pages worth of a's.
+ BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
+ "dest mapping should contain ptes from the source");
+
+ // Finally the last 5 pages shouldn't have been touched.
+ BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
+ 5 * page_size, 'X') != 0,
+ "dest mapping should have retained the last 5 pages");
+
+ BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
+ "unable to unmap source mapping");
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+int main(void)
+{
+ ksft_print_header();
+
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ // test for kernel support for MREMAP_DONTUNMAP skipping the test if
+ // not.
+ if (kernel_support_for_mremap_dontunmap() != 0) {
+ ksft_print_msg("No kernel support for MREMAP_DONTUNMAP\n");
+ ksft_finished();
+ }
+
+ ksft_set_plan(5);
+
+ // Keep a page sized buffer around for when we need it.
+ page_buffer =
+ mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
+
+ mremap_dontunmap_simple();
+ mremap_dontunmap_simple_shmem();
+ mremap_dontunmap_simple_fixed();
+ mremap_dontunmap_partial_mapping();
+ mremap_dontunmap_partial_mapping_overwrite();
+
+ BUG_ON(munmap(page_buffer, page_size) == -1,
+ "unable to unmap page buffer");
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
new file mode 100644
index 000000000000..bb84476a177f
--- /dev/null
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Google LLC
+ */
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#define EXPECT_SUCCESS 0
+#define EXPECT_FAILURE 1
+#define NON_OVERLAPPING 0
+#define OVERLAPPING 1
+#define NS_PER_SEC 1000000000ULL
+#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */
+#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */
+
+#ifndef MIN
+#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
+#endif
+#define SIZE_MB(m) ((size_t)m * (1024 * 1024))
+#define SIZE_KB(k) ((size_t)k * 1024)
+
+struct config {
+ unsigned long long src_alignment;
+ unsigned long long dest_alignment;
+ unsigned long long region_size;
+ int overlapping;
+ unsigned int dest_preamble_size;
+};
+
+struct test {
+ const char *name;
+ struct config config;
+ int expect_failure;
+};
+
+enum {
+ _1KB = 1ULL << 10, /* 1KB -> not page aligned */
+ _4KB = 4ULL << 10,
+ _8KB = 8ULL << 10,
+ _1MB = 1ULL << 20,
+ _2MB = 2ULL << 20,
+ _4MB = 4ULL << 20,
+ _5MB = 5ULL << 20,
+ _1GB = 1ULL << 30,
+ _2GB = 2ULL << 30,
+ PMD = _2MB,
+ PUD = _1GB,
+};
+
+#define PTE page_size
+
+#define MAKE_TEST(source_align, destination_align, size, \
+ overlaps, should_fail, test_name) \
+(struct test){ \
+ .name = test_name, \
+ .config = { \
+ .src_alignment = source_align, \
+ .dest_alignment = destination_align, \
+ .region_size = size, \
+ .overlapping = overlaps, \
+ }, \
+ .expect_failure = should_fail \
+}
+
+/* compute square root using binary search */
+static unsigned long get_sqrt(unsigned long val)
+{
+ unsigned long low = 1;
+
+ /* assuming rand_size is less than 1TB */
+ unsigned long high = (1UL << 20);
+
+ while (low <= high) {
+ unsigned long mid = low + (high - low) / 2;
+ unsigned long temp = mid * mid;
+
+ if (temp == val)
+ return mid;
+ if (temp < val)
+ low = mid + 1;
+ high = mid - 1;
+ }
+ return low;
+}
+
+/*
+ * Returns false if the requested remap region overlaps with an
+ * existing mapping (e.g text, stack) else returns true.
+ */
+static bool is_remap_region_valid(void *addr, unsigned long long size)
+{
+ void *remap_addr = NULL;
+ bool ret = true;
+
+ /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
+ remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
+ MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+ -1, 0);
+
+ if (remap_addr == MAP_FAILED) {
+ if (errno == EEXIST)
+ ret = false;
+ } else {
+ munmap(remap_addr, size);
+ }
+
+ return ret;
+}
+
+/* Returns mmap_min_addr sysctl tunable from procfs */
+static unsigned long long get_mmap_min_addr(void)
+{
+ FILE *fp;
+ int n_matched;
+ static unsigned long long addr;
+
+ if (addr)
+ return addr;
+
+ fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
+ if (fp == NULL) {
+ ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
+ strerror(errno));
+ exit(KSFT_SKIP);
+ }
+
+ n_matched = fscanf(fp, "%llu", &addr);
+ if (n_matched != 1) {
+ ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
+ strerror(errno));
+ fclose(fp);
+ exit(KSFT_SKIP);
+ }
+
+ fclose(fp);
+ return addr;
+}
+
+/*
+ * Using /proc/self/maps, assert that the specified address range is contained
+ * within a single mapping.
+ */
+static bool is_range_mapped(FILE *maps_fp, unsigned long start,
+ unsigned long end)
+{
+ char *line = NULL;
+ size_t len = 0;
+ bool success = false;
+ unsigned long first_val, second_val;
+
+ rewind(maps_fp);
+
+ while (getline(&line, &len, maps_fp) != -1) {
+ if (sscanf(line, "%lx-%lx", &first_val, &second_val) != 2) {
+ ksft_exit_fail_msg("cannot parse /proc/self/maps\n");
+ break;
+ }
+
+ if (first_val <= start && second_val >= end) {
+ success = true;
+ break;
+ }
+ }
+
+ return success;
+}
+
+/*
+ * Returns the start address of the mapping on success, else returns
+ * NULL on failure.
+ */
+static void *get_source_mapping(struct config c)
+{
+ unsigned long long addr = 0ULL;
+ void *src_addr = NULL;
+ unsigned long long mmap_min_addr;
+
+ mmap_min_addr = get_mmap_min_addr();
+ /*
+ * For some tests, we need to not have any mappings below the
+ * source mapping. Add some headroom to mmap_min_addr for this.
+ */
+ mmap_min_addr += 10 * _4MB;
+
+retry:
+ addr += c.src_alignment;
+ if (addr < mmap_min_addr)
+ goto retry;
+
+ src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+ -1, 0);
+ if (src_addr == MAP_FAILED) {
+ if (errno == EPERM || errno == EEXIST)
+ goto retry;
+ goto error;
+ }
+ /*
+ * Check that the address is aligned to the specified alignment.
+ * Addresses which have alignments that are multiples of that
+ * specified are not considered valid. For instance, 1GB address is
+ * 2MB-aligned, however it will not be considered valid for a
+ * requested alignment of 2MB. This is done to reduce coincidental
+ * alignment in the tests.
+ */
+ if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
+ !((unsigned long long) src_addr & c.src_alignment)) {
+ munmap(src_addr, c.region_size);
+ goto retry;
+ }
+
+ if (!src_addr)
+ goto error;
+
+ return src_addr;
+error:
+ ksft_print_msg("Failed to map source region: %s\n",
+ strerror(errno));
+ return NULL;
+}
+
+/*
+ * This test validates that merge is called when expanding a mapping.
+ * Mapping containing three pages is created, middle page is unmapped
+ * and then the mapping containing the first page is expanded so that
+ * it fills the created hole. The two parts should merge creating
+ * single mapping with three pages.
+ */
+static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size)
+{
+ char *test_name = "mremap expand merge";
+ bool success = false;
+ char *remap, *start;
+
+ start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (start == MAP_FAILED) {
+ ksft_print_msg("mmap failed: %s\n", strerror(errno));
+ goto out;
+ }
+
+ munmap(start + page_size, page_size);
+ remap = mremap(start, page_size, 2 * page_size, 0);
+ if (remap == MAP_FAILED) {
+ ksft_print_msg("mremap failed: %s\n", strerror(errno));
+ munmap(start, page_size);
+ munmap(start + 2 * page_size, page_size);
+ goto out;
+ }
+
+ success = is_range_mapped(maps_fp, (unsigned long)start,
+ (unsigned long)(start + 3 * page_size));
+ munmap(start, 3 * page_size);
+
+out:
+ if (success)
+ ksft_test_result_pass("%s\n", test_name);
+ else
+ ksft_test_result_fail("%s\n", test_name);
+}
+
+/*
+ * Similar to mremap_expand_merge() except instead of removing the middle page,
+ * we remove the last then attempt to remap offset from the second page. This
+ * should result in the mapping being restored to its former state.
+ */
+static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size)
+{
+
+ char *test_name = "mremap expand merge offset";
+ bool success = false;
+ char *remap, *start;
+
+ start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (start == MAP_FAILED) {
+ ksft_print_msg("mmap failed: %s\n", strerror(errno));
+ goto out;
+ }
+
+ /* Unmap final page to ensure we have space to expand. */
+ munmap(start + 2 * page_size, page_size);
+ remap = mremap(start + page_size, page_size, 2 * page_size, 0);
+ if (remap == MAP_FAILED) {
+ ksft_print_msg("mremap failed: %s\n", strerror(errno));
+ munmap(start, 2 * page_size);
+ goto out;
+ }
+
+ success = is_range_mapped(maps_fp, (unsigned long)start,
+ (unsigned long)(start + 3 * page_size));
+ munmap(start, 3 * page_size);
+
+out:
+ if (success)
+ ksft_test_result_pass("%s\n", test_name);
+ else
+ ksft_test_result_fail("%s\n", test_name);
+}
+
+/*
+ * Verify that an mremap within a range does not cause corruption
+ * of unrelated part of range.
+ *
+ * Consider the following range which is 2MB aligned and is
+ * a part of a larger 20MB range which is not shown. Each
+ * character is 256KB below making the source and destination
+ * 2MB each. The lower case letters are moved (s to d) and the
+ * upper case letters are not moved. The below test verifies
+ * that the upper case S letters are not corrupted by the
+ * adjacent mremap.
+ *
+ * |DDDDddddSSSSssss|
+ */
+static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr)
+{
+ char *test_name = "mremap mremap move within range";
+ void *src, *dest;
+ unsigned int i, success = 1;
+
+ size_t size = SIZE_MB(20);
+ void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (ptr == MAP_FAILED) {
+ perror("mmap");
+ success = 0;
+ goto out;
+ }
+ memset(ptr, 0, size);
+
+ src = ptr + SIZE_MB(6);
+ src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1));
+
+ /* Set byte pattern for source block. */
+ memcpy(src, rand_addr, SIZE_MB(2));
+
+ dest = src - SIZE_MB(2);
+
+ void *new_ptr = mremap(src + SIZE_MB(1), SIZE_MB(1), SIZE_MB(1),
+ MREMAP_MAYMOVE | MREMAP_FIXED, dest + SIZE_MB(1));
+ if (new_ptr == MAP_FAILED) {
+ perror("mremap");
+ success = 0;
+ goto out;
+ }
+
+ /* Verify byte pattern after remapping */
+ srand(pattern_seed);
+ for (i = 0; i < SIZE_MB(1); i++) {
+ char c = (char) rand();
+
+ if (((char *)src)[i] != c) {
+ ksft_print_msg("Data at src at %d got corrupted due to unrelated mremap\n",
+ i);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
+ ((char *) src)[i] & 0xff);
+ success = 0;
+ }
+ }
+
+out:
+ if (munmap(ptr, size) == -1)
+ perror("munmap");
+
+ if (success)
+ ksft_test_result_pass("%s\n", test_name);
+ else
+ ksft_test_result_fail("%s\n", test_name);
+}
+
+/* Returns the time taken for the remap on success else returns -1. */
+static long long remap_region(struct config c, unsigned int threshold_mb,
+ char *rand_addr)
+{
+ void *addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL;
+ unsigned long long t, d;
+ struct timespec t_start = {0, 0}, t_end = {0, 0};
+ long long start_ns, end_ns, align_mask, ret, offset;
+ unsigned long long threshold;
+ unsigned long num_chunks;
+
+ if (threshold_mb == VALIDATION_NO_THRESHOLD)
+ threshold = c.region_size;
+ else
+ threshold = MIN(threshold_mb * _1MB, c.region_size);
+
+ src_addr = get_source_mapping(c);
+ if (!src_addr) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Set byte pattern for source block. */
+ memcpy(src_addr, rand_addr, threshold);
+
+ /* Mask to zero out lower bits of address for alignment */
+ align_mask = ~(c.dest_alignment - 1);
+ /* Offset of destination address from the end of the source region */
+ offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment;
+ addr = (void *) (((unsigned long long) src_addr + c.region_size
+ + offset) & align_mask);
+
+ /* Remap after the destination block preamble. */
+ addr += c.dest_preamble_size;
+
+ /* See comment in get_source_mapping() */
+ if (!((unsigned long long) addr & c.dest_alignment))
+ addr = (void *) ((unsigned long long) addr | c.dest_alignment);
+
+ /* Don't destroy existing mappings unless expected to overlap */
+ while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
+ /* Check for unsigned overflow */
+ if (addr + c.dest_alignment < addr) {
+ ksft_print_msg("Couldn't find a valid region to remap to\n");
+ ret = -1;
+ goto clean_up_src;
+ }
+ addr += c.dest_alignment;
+ }
+
+ if (c.dest_preamble_size) {
+ dest_preamble_addr = mmap((void *) addr - c.dest_preamble_size, c.dest_preamble_size,
+ PROT_READ | PROT_WRITE,
+ MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+ -1, 0);
+ if (dest_preamble_addr == MAP_FAILED) {
+ ksft_print_msg("Failed to map dest preamble region: %s\n",
+ strerror(errno));
+ ret = -1;
+ goto clean_up_src;
+ }
+
+ /* Set byte pattern for the dest preamble block. */
+ memcpy(dest_preamble_addr, rand_addr, c.dest_preamble_size);
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &t_start);
+ dest_addr = mremap(src_addr, c.region_size, c.region_size,
+ MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
+ clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+ if (dest_addr == MAP_FAILED) {
+ ksft_print_msg("mremap failed: %s\n", strerror(errno));
+ ret = -1;
+ goto clean_up_dest_preamble;
+ }
+
+ /*
+ * Verify byte pattern after remapping. Employ an algorithm with a
+ * square root time complexity in threshold: divide the range into
+ * chunks, if memcmp() returns non-zero, only then perform an
+ * iteration in that chunk to find the mismatch index.
+ */
+ num_chunks = get_sqrt(threshold);
+ for (unsigned long i = 0; i < num_chunks; ++i) {
+ size_t chunk_size = threshold / num_chunks;
+ unsigned long shift = i * chunk_size;
+
+ if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size))
+ continue;
+
+ /* brute force iteration only over mismatch segment */
+ for (t = shift; t < shift + chunk_size; ++t) {
+ if (((char *) dest_addr)[t] != rand_addr[t]) {
+ ksft_print_msg("Data after remap doesn't match at offset %llu\n",
+ t);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff,
+ ((char *) dest_addr)[t] & 0xff);
+ ret = -1;
+ goto clean_up_dest;
+ }
+ }
+ }
+
+ /*
+ * if threshold is not divisible by num_chunks, then check the
+ * last chunk
+ */
+ for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) {
+ if (((char *) dest_addr)[t] != rand_addr[t]) {
+ ksft_print_msg("Data after remap doesn't match at offset %llu\n",
+ t);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff,
+ ((char *) dest_addr)[t] & 0xff);
+ ret = -1;
+ goto clean_up_dest;
+ }
+ }
+
+ /* Verify the dest preamble byte pattern after remapping */
+ if (!c.dest_preamble_size)
+ goto no_preamble;
+
+ num_chunks = get_sqrt(c.dest_preamble_size);
+
+ for (unsigned long i = 0; i < num_chunks; ++i) {
+ size_t chunk_size = c.dest_preamble_size / num_chunks;
+ unsigned long shift = i * chunk_size;
+
+ if (!memcmp(dest_preamble_addr + shift, rand_addr + shift,
+ chunk_size))
+ continue;
+
+ /* brute force iteration only over mismatched segment */
+ for (d = shift; d < shift + chunk_size; ++d) {
+ if (((char *) dest_preamble_addr)[d] != rand_addr[d]) {
+ ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n",
+ d);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff,
+ ((char *) dest_preamble_addr)[d] & 0xff);
+ ret = -1;
+ goto clean_up_dest;
+ }
+ }
+ }
+
+ for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) {
+ if (((char *) dest_preamble_addr)[d] != rand_addr[d]) {
+ ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n",
+ d);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff,
+ ((char *) dest_preamble_addr)[d] & 0xff);
+ ret = -1;
+ goto clean_up_dest;
+ }
+ }
+
+no_preamble:
+ start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
+ end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
+ ret = end_ns - start_ns;
+
+/*
+ * Since the destination address is specified using MREMAP_FIXED, subsequent
+ * mremap will unmap any previous mapping at the address range specified by
+ * dest_addr and region_size. This significantly affects the remap time of
+ * subsequent tests. So we clean up mappings after each test.
+ */
+clean_up_dest:
+ munmap(dest_addr, c.region_size);
+clean_up_dest_preamble:
+ if (c.dest_preamble_size && dest_preamble_addr)
+ munmap(dest_preamble_addr, c.dest_preamble_size);
+clean_up_src:
+ munmap(src_addr, c.region_size);
+out:
+ return ret;
+}
+
+/*
+ * Verify that an mremap aligning down does not destroy
+ * the beginning of the mapping just because the aligned
+ * down address landed on a mapping that maybe does not exist.
+ */
+static void mremap_move_1mb_from_start(unsigned int pattern_seed,
+ char *rand_addr)
+{
+ char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src";
+ void *src = NULL, *dest = NULL;
+ unsigned int i, success = 1;
+
+ /* Config to reuse get_source_mapping() to do an aligned mmap. */
+ struct config c = {
+ .src_alignment = SIZE_MB(1) + SIZE_KB(256),
+ .region_size = SIZE_MB(6)
+ };
+
+ src = get_source_mapping(c);
+ if (!src) {
+ success = 0;
+ goto out;
+ }
+
+ c.src_alignment = SIZE_MB(1) + SIZE_KB(256);
+ dest = get_source_mapping(c);
+ if (!dest) {
+ success = 0;
+ goto out;
+ }
+
+ /* Set byte pattern for source block. */
+ memcpy(src, rand_addr, SIZE_MB(2));
+
+ /*
+ * Unmap the beginning of dest so that the aligned address
+ * falls on no mapping.
+ */
+ munmap(dest, SIZE_MB(1));
+
+ void *new_ptr = mremap(src + SIZE_MB(1), SIZE_MB(1), SIZE_MB(1),
+ MREMAP_MAYMOVE | MREMAP_FIXED, dest + SIZE_MB(1));
+ if (new_ptr == MAP_FAILED) {
+ perror("mremap");
+ success = 0;
+ goto out;
+ }
+
+ /* Verify byte pattern after remapping */
+ srand(pattern_seed);
+ for (i = 0; i < SIZE_MB(1); i++) {
+ char c = (char) rand();
+
+ if (((char *)src)[i] != c) {
+ ksft_print_msg("Data at src at %d got corrupted due to unrelated mremap\n",
+ i);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
+ ((char *) src)[i] & 0xff);
+ success = 0;
+ }
+ }
+
+out:
+ if (src && munmap(src, c.region_size) == -1)
+ perror("munmap src");
+
+ if (dest && munmap(dest, c.region_size) == -1)
+ perror("munmap dest");
+
+ if (success)
+ ksft_test_result_pass("%s\n", test_name);
+ else
+ ksft_test_result_fail("%s\n", test_name);
+}
+
+static void run_mremap_test_case(struct test test_case, int *failures,
+ unsigned int threshold_mb,
+ char *rand_addr)
+{
+ long long remap_time = remap_region(test_case.config, threshold_mb,
+ rand_addr);
+
+ if (remap_time < 0) {
+ if (test_case.expect_failure)
+ ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
+ test_case.name);
+ else {
+ ksft_test_result_fail("%s\n", test_case.name);
+ *failures += 1;
+ }
+ } else {
+ /*
+ * Comparing mremap time is only applicable if entire region
+ * was faulted in.
+ */
+ if (threshold_mb == VALIDATION_NO_THRESHOLD ||
+ test_case.config.region_size <= threshold_mb * _1MB)
+ ksft_test_result_pass("%s\n\tmremap time: %12lldns\n",
+ test_case.name, remap_time);
+ else
+ ksft_test_result_pass("%s\n", test_case.name);
+ }
+}
+
+static void usage(const char *cmd)
+{
+ fprintf(stderr,
+ "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n"
+ "-t\t only validate threshold_mb of the remapped region\n"
+ " \t if 0 is supplied no threshold is used; all tests\n"
+ " \t are run and remapped regions validated fully.\n"
+ " \t The default threshold used is 4MB.\n"
+ "-p\t provide a seed to generate the random pattern for\n"
+ " \t validating the remapped region.\n", cmd);
+}
+
+static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
+ unsigned int *pattern_seed)
+{
+ const char *optstr = "t:p:";
+ int opt;
+
+ while ((opt = getopt(argc, argv, optstr)) != -1) {
+ switch (opt) {
+ case 't':
+ *threshold_mb = atoi(optarg);
+ break;
+ case 'p':
+ *pattern_seed = atoi(optarg);
+ break;
+ default:
+ usage(argv[0]);
+ return -1;
+ }
+ }
+
+ if (optind < argc) {
+ usage(argv[0]);
+ return -1;
+ }
+
+ return 0;
+}
+
+#define MAX_TEST 15
+#define MAX_PERF_TEST 3
+int main(int argc, char **argv)
+{
+ int failures = 0;
+ unsigned int i;
+ int run_perf_tests;
+ unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
+
+ /* hard-coded test configs */
+ size_t max_test_variable_region_size = _2GB;
+ size_t max_test_constant_region_size = _2MB;
+ size_t dest_preamble_size = 10 * _4MB;
+
+ unsigned int pattern_seed;
+ char *rand_addr;
+ size_t rand_size;
+ int num_expand_tests = 2;
+ int num_misc_tests = 2;
+ struct test test_cases[MAX_TEST] = {};
+ struct test perf_test_cases[MAX_PERF_TEST];
+ int page_size;
+ time_t t;
+ FILE *maps_fp;
+
+ pattern_seed = (unsigned int) time(&t);
+
+ if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0)
+ exit(EXIT_FAILURE);
+
+ ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
+ threshold_mb, pattern_seed);
+
+ /*
+ * set preallocated random array according to test configs; see the
+ * functions for the logic of setting the size
+ */
+ if (!threshold_mb)
+ rand_size = MAX(max_test_variable_region_size,
+ max_test_constant_region_size);
+ else
+ rand_size = MAX(MIN(threshold_mb * _1MB,
+ max_test_variable_region_size),
+ max_test_constant_region_size);
+ rand_size = MAX(dest_preamble_size, rand_size);
+
+ rand_addr = (char *)mmap(NULL, rand_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (rand_addr == MAP_FAILED) {
+ perror("mmap");
+ ksft_exit_fail_msg("cannot mmap rand_addr\n");
+ }
+
+ /* fill stream of random bytes */
+ srand(pattern_seed);
+ for (unsigned long i = 0; i < rand_size; ++i)
+ rand_addr[i] = (char) rand();
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ /* Expected mremap failures */
+ test_cases[0] = MAKE_TEST(page_size, page_size, page_size,
+ OVERLAPPING, EXPECT_FAILURE,
+ "mremap - Source and Destination Regions Overlapping");
+
+ test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
+ NON_OVERLAPPING, EXPECT_FAILURE,
+ "mremap - Destination Address Misaligned (1KB-aligned)");
+ test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
+ NON_OVERLAPPING, EXPECT_FAILURE,
+ "mremap - Source Address Misaligned (1KB-aligned)");
+
+ /* Src addr PTE aligned */
+ test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
+ NON_OVERLAPPING, EXPECT_SUCCESS,
+ "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
+
+ /* Src addr 1MB aligned */
+ test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
+ test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
+
+ /* Src addr PMD aligned */
+ test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
+ test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
+ test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
+
+ /* Src addr PUD aligned */
+ test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
+ test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
+ test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
+ test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+ /* Src and Dest addr 1MB aligned. 5MB mremap. */
+ test_cases[13] = MAKE_TEST(_1MB, _1MB, _5MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "5MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
+
+ /* Src and Dest addr 1MB aligned. 5MB mremap. */
+ test_cases[14] = MAKE_TEST(_1MB, _1MB, _5MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "5MB mremap - Source 1MB-aligned, Dest 1MB-aligned with 40MB Preamble");
+ test_cases[14].config.dest_preamble_size = 10 * _4MB;
+
+ perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "1GB mremap - Source PTE-aligned, Destination PTE-aligned");
+ /*
+ * mremap 1GB region - Page table level aligned time
+ * comparison.
+ */
+ perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
+ perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+ "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+ run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) ||
+ (threshold_mb * _1MB >= _1GB);
+
+ ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ?
+ ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests + num_misc_tests);
+
+ for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+ run_mremap_test_case(test_cases[i], &failures, threshold_mb,
+ rand_addr);
+
+ maps_fp = fopen("/proc/self/maps", "r");
+
+ if (maps_fp == NULL) {
+ munmap(rand_addr, rand_size);
+ ksft_exit_fail_msg("Failed to read /proc/self/maps: %s\n", strerror(errno));
+ }
+
+ mremap_expand_merge(maps_fp, page_size);
+ mremap_expand_merge_offset(maps_fp, page_size);
+
+ fclose(maps_fp);
+
+ mremap_move_within_range(pattern_seed, rand_addr);
+ mremap_move_1mb_from_start(pattern_seed, rand_addr);
+
+ if (run_perf_tests) {
+ ksft_print_msg("\n%s\n",
+ "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:");
+ for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++)
+ run_mremap_test_case(perf_test_cases[i], &failures,
+ threshold_mb,
+ rand_addr);
+ }
+
+ munmap(rand_addr, rand_size);
+
+ if (failures > 0)
+ ksft_exit_fail();
+ else
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/mseal_helpers.h b/tools/testing/selftests/mm/mseal_helpers.h
new file mode 100644
index 000000000000..0cfce31c76d2
--- /dev/null
+++ b/tools/testing/selftests/mm/mseal_helpers.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define FAIL_TEST_IF_FALSE(test_passed) \
+ do { \
+ if (!(test_passed)) { \
+ ksft_test_result_fail("%s: line:%d\n", \
+ __func__, __LINE__); \
+ return; \
+ } \
+ } while (0)
+
+#define SKIP_TEST_IF_FALSE(test_passed) \
+ do { \
+ if (!(test_passed)) { \
+ ksft_test_result_skip("%s: line:%d\n", \
+ __func__, __LINE__); \
+ return; \
+ } \
+ } while (0)
+
+#define REPORT_TEST_PASS() ksft_test_result_pass("%s\n", __func__)
+
+#ifndef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS 0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+#endif
+
+#ifndef PKEY_BITS_PER_PKEY
+#define PKEY_BITS_PER_PKEY 2
+#endif
+
+#ifndef PKEY_MASK
+#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+#endif
+
+#ifndef u64
+#define u64 unsigned long long
+#endif
diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c
new file mode 100644
index 000000000000..005f29c86484
--- /dev/null
+++ b/tools/testing/selftests/mm/mseal_test.c
@@ -0,0 +1,1989 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <asm-generic/unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "../kselftest.h"
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/vfs.h>
+#include <sys/stat.h>
+#include "mseal_helpers.h"
+
+static unsigned long get_vma_size(void *addr, int *prot)
+{
+ FILE *maps;
+ char line[256];
+ int size = 0;
+ uintptr_t addr_start, addr_end;
+ char protstr[5];
+ *prot = 0;
+
+ maps = fopen("/proc/self/maps", "r");
+ if (!maps)
+ return 0;
+
+ while (fgets(line, sizeof(line), maps)) {
+ if (sscanf(line, "%lx-%lx %4s", &addr_start, &addr_end, protstr) == 3) {
+ if (addr_start == (uintptr_t) addr) {
+ size = addr_end - addr_start;
+ if (protstr[0] == 'r')
+ *prot |= 0x4;
+ if (protstr[1] == 'w')
+ *prot |= 0x2;
+ if (protstr[2] == 'x')
+ *prot |= 0x1;
+ break;
+ }
+ }
+ }
+ fclose(maps);
+ return size;
+}
+
+/*
+ * define sys_xyx to call syscall directly.
+ */
+static int sys_mseal(void *start, size_t len)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_mseal, start, len, 0);
+ return sret;
+}
+
+static int sys_mprotect(void *ptr, size_t size, unsigned long prot)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_mprotect, ptr, size, prot);
+ return sret;
+}
+
+static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey);
+ return sret;
+}
+
+static int sys_munmap(void *ptr, size_t size)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_munmap, ptr, size);
+ return sret;
+}
+
+static int sys_madvise(void *start, size_t len, int types)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_madvise, start, len, types);
+ return sret;
+}
+
+static void *sys_mremap(void *addr, size_t old_len, size_t new_len,
+ unsigned long flags, void *new_addr)
+{
+ void *sret;
+
+ errno = 0;
+ sret = (void *) syscall(__NR_mremap, addr, old_len, new_len, flags, new_addr);
+ return sret;
+}
+
+static int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+ int ret = syscall(__NR_pkey_alloc, flags, init_val);
+
+ return ret;
+}
+
+static unsigned int __read_pkey_reg(void)
+{
+ unsigned int pkey_reg = 0;
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ unsigned int eax, edx;
+ unsigned int ecx = 0;
+
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (eax), "=d" (edx)
+ : "c" (ecx));
+ pkey_reg = eax;
+#endif
+ return pkey_reg;
+}
+
+static void __write_pkey_reg(u64 pkey_reg)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ unsigned int eax = pkey_reg;
+ unsigned int ecx = 0;
+ unsigned int edx = 0;
+
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (eax), "c" (ecx), "d" (edx));
+#endif
+}
+
+static unsigned long pkey_bit_position(int pkey)
+{
+ return pkey * PKEY_BITS_PER_PKEY;
+}
+
+static u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+ unsigned long shift = pkey_bit_position(pkey);
+
+ /* mask out bits from pkey in old value */
+ reg &= ~((u64)PKEY_MASK << shift);
+ /* OR in new bits for pkey */
+ reg |= (flags & PKEY_MASK) << shift;
+ return reg;
+}
+
+static void set_pkey(int pkey, unsigned long pkey_value)
+{
+ u64 new_pkey_reg;
+
+ new_pkey_reg = set_pkey_bits(__read_pkey_reg(), pkey, pkey_value);
+ __write_pkey_reg(new_pkey_reg);
+}
+
+static void setup_single_address(int size, void **ptrOut)
+{
+ void *ptr;
+
+ ptr = mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ *ptrOut = ptr;
+}
+
+static void setup_single_address_rw(int size, void **ptrOut)
+{
+ void *ptr;
+ unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE;
+
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0);
+ *ptrOut = ptr;
+}
+
+static int clean_single_address(void *ptr, int size)
+{
+ int ret;
+ ret = munmap(ptr, size);
+ return ret;
+}
+
+static int seal_single_address(void *ptr, int size)
+{
+ int ret;
+ ret = sys_mseal(ptr, size);
+ return ret;
+}
+
+bool seal_support(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+
+ ptr = mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (ptr == (void *) -1)
+ return false;
+
+ ret = sys_mseal(ptr, page_size);
+ if (ret < 0)
+ return false;
+
+ return true;
+}
+
+bool pkey_supported(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ int pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED);
+
+ if (pkey > 0)
+ return true;
+#endif
+ return false;
+}
+
+static void test_seal_addseal(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_unmapped_start(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* munmap 2 pages from ptr. */
+ ret = sys_munmap(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail because 2 pages from ptr are unmapped. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail because 2 pages from ptr are unmapped. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ ret = sys_mseal(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_unmapped_middle(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* munmap 2 pages from ptr + page. */
+ ret = sys_munmap(ptr + page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail, since middle 2 pages are unmapped. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail as well. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* we still can add seal to the first page and last page*/
+ ret = sys_mseal(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mseal(ptr + 3 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_unmapped_end(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap last 2 pages. */
+ ret = sys_munmap(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail since last 2 pages are unmapped. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail as well. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* The first 2 pages is not sealed, and can add seals */
+ ret = sys_mseal(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_multiple_vmas(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split the vma into 3. */
+ ret = sys_mprotect(ptr + page_size, 2 * page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will get applied to all 4 pages - 3 VMAs. */
+ ret = sys_mprotect(ptr, size, PROT_READ);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* use mprotect to split the vma into 3. */
+ ret = sys_mprotect(ptr + page_size, 2 * page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mseal get applied to all 4 pages - 3 VMAs. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_split_start(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split at middle */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the first page, this will split the VMA */
+ ret = sys_mseal(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* add seal to the remain 3 pages */
+ ret = sys_mseal(ptr + page_size, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_split_end(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split at middle */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the last page */
+ ret = sys_mseal(ptr + 3 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* Adding seals to the first 3 pages */
+ ret = sys_mseal(ptr, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_invalid_input(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(8 * page_size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ ret = clean_single_address(ptr + 4 * page_size, 4 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* invalid flag */
+ ret = syscall(__NR_mseal, ptr, size, 0x20);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* unaligned address */
+ ret = sys_mseal(ptr + 1, 2 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* length too big */
+ ret = sys_mseal(ptr, 5 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* length overflow */
+ ret = sys_mseal(ptr, UINT64_MAX/page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* start is not in a valid VMA */
+ ret = sys_mseal(ptr - page_size, 5 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_zero_length(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_mprotect(ptr, 0, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal 0 length will be OK, same as mprotect */
+ ret = sys_mseal(ptr, 0);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* verify the 4 pages are not sealed by previous call. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_zero_address(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int prot;
+
+ /* use mmap to change protection. */
+ ptr = mmap(0, size, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ FAIL_TEST_IF_FALSE(ptr == 0);
+
+ size = get_vma_size(ptr, &prot);
+ FAIL_TEST_IF_FALSE(size == 4 * page_size);
+
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* verify the 4 pages are sealed by previous call. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_twice(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* apply the same seal will be OK. idempotent. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_start_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* the first page is sealed. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* pages after the first page is not sealed. */
+ ret = sys_mprotect(ptr + page_size, page_size * 3,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_end_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr + page_size, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* first page is not sealed */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* last 3 page are sealed */
+ ret = sys_mprotect(ptr + page_size, page_size * 3,
+ PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_unalign_len(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, page_size * 2 - 1);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 2 pages are sealed. */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mprotect(ptr + page_size * 2, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_unalign_len_variant_2(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ if (seal) {
+ ret = seal_single_address(ptr, page_size * 2 + 1);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 3 pages are sealed. */
+ ret = sys_mprotect(ptr, page_size * 3, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mprotect(ptr + page_size * 3, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_two_vma(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = seal_single_address(ptr, page_size * 4);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mprotect(ptr + page_size * 2, page_size * 2,
+ PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_two_vma_with_split(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split as two vma. */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mseal can apply across 2 vma, also split them. */
+ if (seal) {
+ ret = seal_single_address(ptr + page_size, page_size * 2);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* the first page is not sealed. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* the second page is sealed. */
+ ret = sys_mprotect(ptr + page_size, page_size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* the third page is sealed. */
+ ret = sys_mprotect(ptr + 2 * page_size, page_size,
+ PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* the fouth page is not sealed. */
+ ret = sys_mprotect(ptr + 3 * page_size, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_partial_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* seal one page. */
+ if (seal) {
+ ret = seal_single_address(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mprotect first 2 page will fail, since the first page are sealed. */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_partial_mprotect_tail(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 2 * page_size;
+ int ret;
+ int prot;
+
+ /*
+ * Check if a partial mseal (that results in two vmas) works correctly.
+ * It might mprotect the first, but it'll never touch the second (msealed) vma.
+ */
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr + page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_mprotect(ptr, size, PROT_EXEC);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ }
+
+ REPORT_TEST_PASS();
+}
+
+
+static void test_seal_mprotect_two_vma_with_gap(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* use mprotect to split. */
+ ret = sys_mprotect(ptr + 3 * page_size, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* use munmap to free two pages in the middle */
+ ret = sys_munmap(ptr + page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail, because there is a gap in the address. */
+ /* notes, internally mprotect still updated the first page. */
+ ret = sys_mprotect(ptr, 4 * page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail as well. */
+ ret = sys_mseal(ptr, 4 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* the first page is not sealed. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret == 0);
+
+ /* the last page is not sealed. */
+ ret = sys_mprotect(ptr + 3 * page_size, page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret == 0);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_split(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal all 4 pages. */
+ if (seal) {
+ ret = sys_mseal(ptr, 4 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mprotect is sealed. */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+
+ ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mprotect_merge(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split one page. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal first two pages. */
+ if (seal) {
+ ret = sys_mseal(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 2 pages are sealed. */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* last 2 pages are not sealed. */
+ ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret == 0);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_munmap(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 4 pages are sealed. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+/*
+ * allocate 4 pages,
+ * use mprotect to split it as two VMAs
+ * seal the whole range
+ * munmap will fail on both
+ */
+static void test_seal_munmap_two_vma(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_munmap(ptr, page_size * 2);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr + page_size, page_size * 2);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+/*
+ * allocate a VMA with 4 pages.
+ * munmap the middle 2 pages.
+ * seal the whole 4 pages, will fail.
+ * munmap the first page will be OK.
+ * munmap the last page will be OK.
+ */
+static void test_seal_munmap_vma_with_gap(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_munmap(ptr + page_size, page_size * 2);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ /* can't have gap in the middle. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+ }
+
+ ret = sys_munmap(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr + page_size * 2, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_munmap_partial_across_vmas(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 2 * page_size;
+ int ret;
+ int prot;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr + page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ FAIL_TEST_IF_FALSE(get_vma_size(ptr + page_size, &prot) > 0);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_munmap_start_freed(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int prot;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap the first page. */
+ ret = sys_munmap(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the last 3 pages. */
+ if (seal) {
+ ret = sys_mseal(ptr + page_size, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* unmap from the first page. */
+ ret = sys_munmap(ptr, size);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size * 3);
+ } else {
+ /* note: this will be OK, even the first page is */
+ /* already unmapped. */
+ FAIL_TEST_IF_FALSE(!ret);
+
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 0);
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_munmap_end_freed(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap last page. */
+ ret = sys_munmap(ptr + page_size * 3, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the first 3 pages. */
+ if (seal) {
+ ret = sys_mseal(ptr, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* unmap all pages. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_munmap_middle_freed(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int prot;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap 2 pages in the middle. */
+ ret = sys_munmap(ptr + page_size, page_size * 2);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the first page. */
+ if (seal) {
+ ret = sys_mseal(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* munmap all 4 pages. */
+ ret = sys_munmap(ptr, size);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ size = get_vma_size(ptr, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+
+ size = get_vma_size(ptr + page_size * 3, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+ } else {
+ FAIL_TEST_IF_FALSE(!ret);
+
+ size = get_vma_size(ptr, &prot);
+ FAIL_TEST_IF_FALSE(size == 0);
+
+ size = get_vma_size(ptr + page_size * 3, &prot);
+ FAIL_TEST_IF_FALSE(size == 0);
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_shrink(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* shrink from 4 pages to 2 pages. */
+ ret2 = sys_mremap(ptr, size, 2 * page_size, 0, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == (void *) MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 != (void *) MAP_FAILED);
+
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_expand(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ /* ummap last 2 pages. */
+ ret = sys_munmap(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* expand from 2 page to 4 pages. */
+ ret2 = sys_mremap(ptr, 2 * page_size, 4 * page_size, 0, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_move(bool seal)
+{
+ void *ptr, *newPtr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newPtr);
+ FAIL_TEST_IF_FALSE(newPtr != (void *)-1);
+ ret = clean_single_address(newPtr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* move from ptr to fixed address. */
+ ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED);
+
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mmap_overwrite_prot(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* use mmap to change protection. */
+ ret2 = mmap(ptr, size, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mmap_expand(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 12 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ /* ummap last 4 pages. */
+ ret = sys_munmap(ptr + 8 * page_size, 4 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, 8 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* use mmap to expand. */
+ ret2 = mmap(ptr, size, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mmap_shrink(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 12 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* use mmap to shrink. */
+ ret2 = mmap(ptr, 8 * page_size, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_shrink_fixed(bool seal)
+{
+ void *ptr;
+ void *newAddr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newAddr);
+ FAIL_TEST_IF_FALSE(newAddr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move and shrink to fixed address */
+ ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED,
+ newAddr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == newAddr);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_expand_fixed(bool seal)
+{
+ void *ptr;
+ void *newAddr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(page_size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newAddr);
+ FAIL_TEST_IF_FALSE(newAddr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(newAddr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move and expand to fixed address */
+ ret2 = sys_mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED,
+ newAddr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == newAddr);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_move_fixed(bool seal)
+{
+ void *ptr;
+ void *newAddr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newAddr);
+ FAIL_TEST_IF_FALSE(newAddr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(newAddr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move to fixed address */
+ ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == newAddr);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_move_fixed_zero(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /*
+ * MREMAP_FIXED can move the mapping to zero address
+ */
+ ret2 = sys_mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED,
+ 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 == 0);
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_move_dontunmap(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move, and don't unmap src addr. */
+ ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ /* kernel will allocate a new address */
+ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED);
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_mremap_move_dontunmap_anyaddr(bool seal)
+{
+ void *ptr, *ptr2;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /*
+ * The new address is any address that not allocated.
+ * use allocate/free to similate that.
+ */
+ setup_single_address(size, &ptr2);
+ FAIL_TEST_IF_FALSE(ptr2 != (void *)-1);
+ ret = sys_munmap(ptr2, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /*
+ * remap to any address.
+ */
+ ret2 = sys_mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP,
+ (void *) ptr2);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ /* remap success and return ptr2 */
+ FAIL_TEST_IF_FALSE(ret2 == ptr2);
+ }
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_merge_and_split(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size;
+ int ret;
+ int prot;
+
+ /* (24 RO) */
+ setup_single_address(24 * page_size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect(NONE) to set out boundary */
+ /* (1 NONE) (22 RO) (1 NONE) */
+ ret = sys_mprotect(ptr, page_size, PROT_NONE);
+ FAIL_TEST_IF_FALSE(!ret);
+ ret = sys_mprotect(ptr + 23 * page_size, page_size, PROT_NONE);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 22 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 4);
+
+ /* use mseal to split from beginning */
+ /* (1 NONE) (1 RO_SEAL) (21 RO) (1 NONE) */
+ ret = sys_mseal(ptr + page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ size = get_vma_size(ptr + 2 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 21 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* use mseal to split from the end. */
+ /* (1 NONE) (1 RO_SEAL) (20 RO) (1 RO_SEAL) (1 NONE) */
+ ret = sys_mseal(ptr + 22 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + 22 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ size = get_vma_size(ptr + 2 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 20 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* merge with prev. */
+ /* (1 NONE) (2 RO_SEAL) (19 RO) (1 RO_SEAL) (1 NONE) */
+ ret = sys_mseal(ptr + 2 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 2 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* merge with after. */
+ /* (1 NONE) (2 RO_SEAL) (18 RO) (2 RO_SEALS) (1 NONE) */
+ ret = sys_mseal(ptr + 21 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + 21 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 2 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* split and merge from prev */
+ /* (1 NONE) (3 RO_SEAL) (17 RO) (2 RO_SEALS) (1 NONE) */
+ ret = sys_mseal(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + 1 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 3 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ ret = sys_munmap(ptr + page_size, page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+ ret = sys_mprotect(ptr + 2 * page_size, page_size, PROT_NONE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* split and merge from next */
+ /* (1 NONE) (3 RO_SEAL) (16 RO) (3 RO_SEALS) (1 NONE) */
+ ret = sys_mseal(ptr + 20 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ size = get_vma_size(ptr + 20 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 3 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* merge from middle of prev and middle of next. */
+ /* (1 NONE) (22 RO_SEAL) (1 NONE) */
+ ret = sys_mseal(ptr + 2 * page_size, 20 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 22 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_discard_ro_anon_on_rw(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address_rw(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't take effect on RW memory. */
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* base seal still apply. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_discard_ro_anon_on_pkey(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int pkey;
+
+ SKIP_TEST_IF_FALSE(pkey_supported());
+
+ setup_single_address_rw(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED);
+ FAIL_TEST_IF_FALSE(pkey > 0);
+
+ ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't take effect if PKRU allow write. */
+ set_pkey(pkey, PKEY_UNRESTRICTED);
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* sealing will take effect if PKRU deny write. */
+ set_pkey(pkey, PKEY_DISABLE_WRITE);
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* base seal still apply. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_discard_ro_anon_on_filebacked(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int fd;
+ unsigned long mapflags = MAP_PRIVATE;
+
+ fd = memfd_create("test", 0);
+ FAIL_TEST_IF_FALSE(fd > 0);
+
+ ret = fallocate(fd, 0, 0, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ptr = mmap(NULL, size, PROT_READ, mapflags, fd, 0);
+ FAIL_TEST_IF_FALSE(ptr != MAP_FAILED);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't apply for file backed mapping. */
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+ close(fd);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_discard_ro_anon_on_shared(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED;
+
+ ptr = mmap(NULL, size, PROT_READ, mapflags, -1, 0);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't apply for shared mapping. */
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_discard_ro_anon(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+static void test_seal_discard_across_vmas(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 2 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr + page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+
+static void test_seal_madvise_nodiscard(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /*
+ * Test a random madvise flag like MADV_RANDOM that does not touch page
+ * contents (and thus should work for msealed VMAs). RANDOM also happens to
+ * share bits with other discard-ish flags like REMOVE.
+ */
+ ret = sys_madvise(ptr, size, MADV_RANDOM);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ REPORT_TEST_PASS();
+}
+
+int main(void)
+{
+ bool test_seal = seal_support();
+
+ ksft_print_header();
+
+ if (!test_seal)
+ ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n");
+
+ if (!pkey_supported())
+ ksft_print_msg("PKEY not supported\n");
+
+ ksft_set_plan(88);
+
+ test_seal_addseal();
+ test_seal_unmapped_start();
+ test_seal_unmapped_middle();
+ test_seal_unmapped_end();
+ test_seal_multiple_vmas();
+ test_seal_split_start();
+ test_seal_split_end();
+ test_seal_invalid_input();
+ test_seal_zero_length();
+ test_seal_twice();
+
+ test_seal_mprotect(false);
+ test_seal_mprotect(true);
+
+ test_seal_start_mprotect(false);
+ test_seal_start_mprotect(true);
+
+ test_seal_end_mprotect(false);
+ test_seal_end_mprotect(true);
+
+ test_seal_mprotect_unalign_len(false);
+ test_seal_mprotect_unalign_len(true);
+
+ test_seal_mprotect_unalign_len_variant_2(false);
+ test_seal_mprotect_unalign_len_variant_2(true);
+
+ test_seal_mprotect_two_vma(false);
+ test_seal_mprotect_two_vma(true);
+
+ test_seal_mprotect_two_vma_with_split(false);
+ test_seal_mprotect_two_vma_with_split(true);
+
+ test_seal_mprotect_partial_mprotect(false);
+ test_seal_mprotect_partial_mprotect(true);
+
+ test_seal_mprotect_two_vma_with_gap();
+ test_seal_mprotect_two_vma_with_gap();
+
+ test_seal_mprotect_merge(false);
+ test_seal_mprotect_merge(true);
+
+ test_seal_mprotect_split(false);
+ test_seal_mprotect_split(true);
+
+ test_seal_mprotect_partial_mprotect_tail(false);
+ test_seal_mprotect_partial_mprotect_tail(true);
+
+ test_seal_munmap(false);
+ test_seal_munmap(true);
+ test_seal_munmap_two_vma(false);
+ test_seal_munmap_two_vma(true);
+ test_seal_munmap_vma_with_gap(false);
+ test_seal_munmap_vma_with_gap(true);
+ test_seal_munmap_partial_across_vmas(false);
+ test_seal_munmap_partial_across_vmas(true);
+
+ test_munmap_start_freed(false);
+ test_munmap_start_freed(true);
+ test_munmap_middle_freed(false);
+ test_munmap_middle_freed(true);
+ test_munmap_end_freed(false);
+ test_munmap_end_freed(true);
+
+ test_seal_mremap_shrink(false);
+ test_seal_mremap_shrink(true);
+ test_seal_mremap_expand(false);
+ test_seal_mremap_expand(true);
+ test_seal_mremap_move(false);
+ test_seal_mremap_move(true);
+
+ test_seal_mremap_shrink_fixed(false);
+ test_seal_mremap_shrink_fixed(true);
+ test_seal_mremap_expand_fixed(false);
+ test_seal_mremap_expand_fixed(true);
+ test_seal_mremap_move_fixed(false);
+ test_seal_mremap_move_fixed(true);
+ test_seal_mremap_move_dontunmap(false);
+ test_seal_mremap_move_dontunmap(true);
+ test_seal_mremap_move_fixed_zero(false);
+ test_seal_mremap_move_fixed_zero(true);
+ test_seal_mremap_move_dontunmap_anyaddr(false);
+ test_seal_mremap_move_dontunmap_anyaddr(true);
+ test_seal_madvise_nodiscard(false);
+ test_seal_madvise_nodiscard(true);
+ test_seal_discard_ro_anon(false);
+ test_seal_discard_ro_anon(true);
+ test_seal_discard_across_vmas(false);
+ test_seal_discard_across_vmas(true);
+ test_seal_discard_ro_anon_on_rw(false);
+ test_seal_discard_ro_anon_on_rw(true);
+ test_seal_discard_ro_anon_on_shared(false);
+ test_seal_discard_ro_anon_on_shared(true);
+ test_seal_discard_ro_anon_on_filebacked(false);
+ test_seal_discard_ro_anon_on_filebacked(true);
+ test_seal_mmap_overwrite_prot(false);
+ test_seal_mmap_overwrite_prot(true);
+ test_seal_mmap_expand(false);
+ test_seal_mmap_expand(true);
+ test_seal_mmap_shrink(false);
+ test_seal_mmap_shrink(true);
+
+ test_seal_merge_and_split();
+ test_seal_zero_address();
+
+ test_seal_discard_ro_anon_on_pkey(false);
+ test_seal_discard_ro_anon_on_pkey(true);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c
new file mode 100644
index 000000000000..431c1277d83a
--- /dev/null
+++ b/tools/testing/selftests/mm/on-fault-limit.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include "../kselftest.h"
+
+static void test_limit(void)
+{
+ struct rlimit lims;
+ void *map;
+
+ if (getrlimit(RLIMIT_MEMLOCK, &lims))
+ ksft_exit_fail_msg("getrlimit: %s\n", strerror(errno));
+
+ if (mlockall(MCL_ONFAULT | MCL_FUTURE))
+ ksft_exit_fail_msg("mlockall: %s\n", strerror(errno));
+
+ map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+
+ ksft_test_result(map == MAP_FAILED, "The map failed respecting mlock limits\n");
+
+ if (map != MAP_FAILED)
+ munmap(map, 2 * lims.rlim_max);
+ munlockall();
+}
+
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ if (!getuid())
+ ksft_test_result_skip("The test must be run from a normal user\n");
+ else
+ test_limit();
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile
new file mode 100644
index 000000000000..8c8bb39ffa28
--- /dev/null
+++ b/tools/testing/selftests/mm/page_frag/Makefile
@@ -0,0 +1,18 @@
+PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
+KDIR ?= /lib/modules/$(shell uname -r)/build
+
+ifeq ($(V),1)
+Q =
+else
+Q = @
+endif
+
+MODULES = page_frag_test.ko
+
+obj-m += page_frag_test.o
+
+all:
+ +$(Q)make -C $(KDIR) M=$(PAGE_FRAG_TEST_DIR) modules
+
+clean:
+ +$(Q)make -C $(KDIR) M=$(PAGE_FRAG_TEST_DIR) clean
diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c
new file mode 100644
index 000000000000..e806c1866e36
--- /dev/null
+++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for page_frag cache
+ *
+ * Copyright (C) 2024 Yunsheng Lin <linyunsheng@huawei.com>
+ */
+
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/completion.h>
+#include <linux/ptr_ring.h>
+#include <linux/kthread.h>
+#include <linux/page_frag_cache.h>
+
+#define TEST_FAILED_PREFIX "page_frag_test failed: "
+
+static struct ptr_ring ptr_ring;
+static int nr_objs = 512;
+static atomic_t nthreads;
+static struct completion wait;
+static struct page_frag_cache test_nc;
+static int test_popped;
+static int test_pushed;
+static bool force_exit;
+
+static int nr_test = 2000000;
+module_param(nr_test, int, 0);
+MODULE_PARM_DESC(nr_test, "number of iterations to test");
+
+static bool test_align;
+module_param(test_align, bool, 0);
+MODULE_PARM_DESC(test_align, "use align API for testing");
+
+static int test_alloc_len = 2048;
+module_param(test_alloc_len, int, 0);
+MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
+
+static int test_push_cpu;
+module_param(test_push_cpu, int, 0);
+MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
+
+static int test_pop_cpu;
+module_param(test_pop_cpu, int, 0);
+MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
+
+static int page_frag_pop_thread(void *arg)
+{
+ struct ptr_ring *ring = arg;
+
+ pr_info("page_frag pop test thread begins on cpu %d\n",
+ smp_processor_id());
+
+ while (test_popped < nr_test) {
+ void *obj = __ptr_ring_consume(ring);
+
+ if (obj) {
+ test_popped++;
+ page_frag_free(obj);
+ } else {
+ if (force_exit)
+ break;
+
+ cond_resched();
+ }
+ }
+
+ if (atomic_dec_and_test(&nthreads))
+ complete(&wait);
+
+ pr_info("page_frag pop test thread exits on cpu %d\n",
+ smp_processor_id());
+
+ return 0;
+}
+
+static int page_frag_push_thread(void *arg)
+{
+ struct ptr_ring *ring = arg;
+
+ pr_info("page_frag push test thread begins on cpu %d\n",
+ smp_processor_id());
+
+ while (test_pushed < nr_test && !force_exit) {
+ void *va;
+ int ret;
+
+ if (test_align) {
+ va = page_frag_alloc_align(&test_nc, test_alloc_len,
+ GFP_KERNEL, SMP_CACHE_BYTES);
+
+ if ((unsigned long)va & (SMP_CACHE_BYTES - 1)) {
+ force_exit = true;
+ WARN_ONCE(true, TEST_FAILED_PREFIX "unaligned va returned\n");
+ }
+ } else {
+ va = page_frag_alloc(&test_nc, test_alloc_len, GFP_KERNEL);
+ }
+
+ if (!va)
+ continue;
+
+ ret = __ptr_ring_produce(ring, va);
+ if (ret) {
+ page_frag_free(va);
+ cond_resched();
+ } else {
+ test_pushed++;
+ }
+ }
+
+ pr_info("page_frag push test thread exits on cpu %d\n",
+ smp_processor_id());
+
+ if (atomic_dec_and_test(&nthreads))
+ complete(&wait);
+
+ return 0;
+}
+
+static int __init page_frag_test_init(void)
+{
+ struct task_struct *tsk_push, *tsk_pop;
+ int last_pushed = 0, last_popped = 0;
+ ktime_t start;
+ u64 duration;
+ int ret;
+
+ page_frag_cache_init(&test_nc);
+ atomic_set(&nthreads, 2);
+ init_completion(&wait);
+
+ if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0 ||
+ !cpu_active(test_push_cpu) || !cpu_active(test_pop_cpu))
+ return -EINVAL;
+
+ ret = ptr_ring_init(&ptr_ring, nr_objs, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ tsk_push = kthread_create_on_cpu(page_frag_push_thread, &ptr_ring,
+ test_push_cpu, "page_frag_push");
+ if (IS_ERR(tsk_push))
+ return PTR_ERR(tsk_push);
+
+ tsk_pop = kthread_create_on_cpu(page_frag_pop_thread, &ptr_ring,
+ test_pop_cpu, "page_frag_pop");
+ if (IS_ERR(tsk_pop)) {
+ kthread_stop(tsk_push);
+ return PTR_ERR(tsk_pop);
+ }
+
+ start = ktime_get();
+ wake_up_process(tsk_push);
+ wake_up_process(tsk_pop);
+
+ pr_info("waiting for test to complete\n");
+
+ while (!wait_for_completion_timeout(&wait, msecs_to_jiffies(10000))) {
+ /* exit if there is no progress for push or pop size */
+ if (last_pushed == test_pushed || last_popped == test_popped) {
+ WARN_ONCE(true, TEST_FAILED_PREFIX "no progress\n");
+ force_exit = true;
+ continue;
+ }
+
+ last_pushed = test_pushed;
+ last_popped = test_popped;
+ pr_info("page_frag_test progress: pushed = %d, popped = %d\n",
+ test_pushed, test_popped);
+ }
+
+ if (force_exit) {
+ pr_err(TEST_FAILED_PREFIX "exit with error\n");
+ goto out;
+ }
+
+ duration = (u64)ktime_us_delta(ktime_get(), start);
+ pr_info("%d of iterations for %s testing took: %lluus\n", nr_test,
+ test_align ? "aligned" : "non-aligned", duration);
+
+out:
+ ptr_ring_cleanup(&ptr_ring, NULL);
+ page_frag_cache_drain(&test_nc);
+
+ return -EAGAIN;
+}
+
+static void __exit page_frag_test_exit(void)
+{
+}
+
+module_init(page_frag_test_init);
+module_exit(page_frag_test_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yunsheng Lin <linyunsheng@huawei.com>");
+MODULE_DESCRIPTION("Test module for page_frag");
diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
new file mode 100644
index 000000000000..b07acc86f4f0
--- /dev/null
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -0,0 +1,1674 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <malloc.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+#include <linux/types.h>
+#include <linux/memfd.h>
+#include <linux/userfaultfd.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <math.h>
+#include <asm/unistd.h>
+#include <pthread.h>
+#include <sys/resource.h>
+#include <assert.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+
+#define PAGEMAP_BITS_ALL (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE)
+#define PAGEMAP_NON_WRITTEN_BITS (PAGE_IS_WPALLOWED | PAGE_IS_FILE | \
+ PAGE_IS_PRESENT | PAGE_IS_SWAPPED | \
+ PAGE_IS_PFNZERO | PAGE_IS_HUGE)
+
+#define TEST_ITERATIONS 100
+#define PAGEMAP "/proc/self/pagemap"
+int pagemap_fd;
+int uffd;
+unsigned long page_size;
+unsigned int hpage_size;
+const char *progname;
+
+#define LEN(region) ((region.end - region.start)/page_size)
+
+static long pagemap_ioctl(void *start, int len, void *vec, int vec_len, int flag,
+ int max_pages, long required_mask, long anyof_mask, long excluded_mask,
+ long return_mask)
+{
+ struct pm_scan_arg arg;
+
+ arg.start = (uintptr_t)start;
+ arg.end = (uintptr_t)(start + len);
+ arg.vec = (uintptr_t)vec;
+ arg.vec_len = vec_len;
+ arg.flags = flag;
+ arg.size = sizeof(struct pm_scan_arg);
+ arg.max_pages = max_pages;
+ arg.category_mask = required_mask;
+ arg.category_anyof_mask = anyof_mask;
+ arg.category_inverted = excluded_mask;
+ arg.return_mask = return_mask;
+
+ return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
+}
+
+static long pagemap_ioc(void *start, int len, void *vec, int vec_len, int flag,
+ int max_pages, long required_mask, long anyof_mask, long excluded_mask,
+ long return_mask, long *walk_end)
+{
+ struct pm_scan_arg arg;
+ int ret;
+
+ arg.start = (uintptr_t)start;
+ arg.end = (uintptr_t)(start + len);
+ arg.vec = (uintptr_t)vec;
+ arg.vec_len = vec_len;
+ arg.flags = flag;
+ arg.size = sizeof(struct pm_scan_arg);
+ arg.max_pages = max_pages;
+ arg.category_mask = required_mask;
+ arg.category_anyof_mask = anyof_mask;
+ arg.category_inverted = excluded_mask;
+ arg.return_mask = return_mask;
+
+ ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
+
+ if (walk_end)
+ *walk_end = arg.walk_end;
+
+ return ret;
+}
+
+
+int init_uffd(void)
+{
+ struct uffdio_api uffdio_api;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
+ if (uffd == -1)
+ return uffd;
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api))
+ return -1;
+
+ if (!(uffdio_api.api & UFFDIO_REGISTER_MODE_WP) ||
+ !(uffdio_api.features & UFFD_FEATURE_WP_UNPOPULATED) ||
+ !(uffdio_api.features & UFFD_FEATURE_WP_ASYNC) ||
+ !(uffdio_api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM))
+ return -1;
+
+ return 0;
+}
+
+int wp_init(void *lpBaseAddress, long dwRegionSize)
+{
+ struct uffdio_register uffdio_register;
+ struct uffdio_writeprotect wp;
+
+ uffdio_register.range.start = (unsigned long)lpBaseAddress;
+ uffdio_register.range.len = dwRegionSize;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ ksft_exit_fail_msg("ioctl(UFFDIO_REGISTER) %d %s\n", errno, strerror(errno));
+
+ if (!(uffdio_register.ioctls & UFFDIO_WRITEPROTECT))
+ ksft_exit_fail_msg("ioctl set is incorrect\n");
+
+ wp.range.start = (unsigned long)lpBaseAddress;
+ wp.range.len = dwRegionSize;
+ wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+
+ if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
+ ksft_exit_fail_msg("ioctl(UFFDIO_WRITEPROTECT)\n");
+
+ return 0;
+}
+
+int wp_free(void *lpBaseAddress, long dwRegionSize)
+{
+ struct uffdio_register uffdio_register;
+
+ uffdio_register.range.start = (unsigned long)lpBaseAddress;
+ uffdio_register.range.len = dwRegionSize;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
+ ksft_exit_fail_msg("ioctl unregister failure\n");
+ return 0;
+}
+
+int wp_addr_range(void *lpBaseAddress, int dwRegionSize)
+{
+ if (pagemap_ioctl(lpBaseAddress, dwRegionSize, NULL, 0,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", 1, errno, strerror(errno));
+
+ return 0;
+}
+
+void *gethugetlb_mem(int size, int *shmid)
+{
+ char *mem;
+
+ if (shmid) {
+ *shmid = shmget(2, size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+ if (*shmid < 0)
+ return NULL;
+
+ mem = shmat(*shmid, 0, 0);
+ if (mem == (char *)-1) {
+ shmctl(*shmid, IPC_RMID, NULL);
+ ksft_exit_fail_msg("Shared memory attach failure\n");
+ }
+ } else {
+ mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_HUGETLB | MAP_PRIVATE, -1, 0);
+ if (mem == MAP_FAILED)
+ return NULL;
+ }
+
+ return mem;
+}
+
+int userfaultfd_tests(void)
+{
+ long mem_size, vec_size, written, num_pages = 16;
+ char *mem, *vec;
+
+ mem_size = num_pages * page_size;
+ mem = mmap(NULL, mem_size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+
+ wp_init(mem, mem_size);
+
+ /* Change protection of pages differently */
+ mprotect(mem, mem_size/8, PROT_READ|PROT_WRITE);
+ mprotect(mem + 1 * mem_size/8, mem_size/8, PROT_READ);
+ mprotect(mem + 2 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE);
+ mprotect(mem + 3 * mem_size/8, mem_size/8, PROT_READ);
+ mprotect(mem + 4 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE);
+ mprotect(mem + 5 * mem_size/8, mem_size/8, PROT_NONE);
+ mprotect(mem + 6 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE);
+ mprotect(mem + 7 * mem_size/8, mem_size/8, PROT_READ);
+
+ wp_addr_range(mem + (mem_size/16), mem_size - 2 * (mem_size/8));
+ wp_addr_range(mem, mem_size);
+
+ vec_size = mem_size/page_size;
+ vec = malloc(sizeof(struct page_region) * vec_size);
+
+ written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", __func__);
+
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+ free(vec);
+ return 0;
+}
+
+int get_reads(struct page_region *vec, int vec_size)
+{
+ int i, sum = 0;
+
+ for (i = 0; i < vec_size; i++)
+ sum += LEN(vec[i]);
+
+ return sum;
+}
+
+int sanity_tests_sd(void)
+{
+ unsigned long long mem_size, vec_size, i, total_pages = 0;
+ long ret, ret2, ret3;
+ int num_pages = 1000;
+ int total_writes, total_reads, reads, count;
+ struct page_region *vec, *vec2;
+ char *mem, *m[2];
+ long walk_end;
+
+ vec_size = num_pages/2;
+ mem_size = num_pages * page_size;
+
+ vec = malloc(sizeof(struct page_region) * vec_size);
+ if (!vec)
+ ksft_exit_fail_msg("error nomem\n");
+
+ vec2 = malloc(sizeof(struct page_region) * vec_size);
+ if (!vec2)
+ ksft_exit_fail_msg("error nomem\n");
+
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ /* 1. wrong operation */
+ ksft_test_result(pagemap_ioctl(mem, 0, vec, vec_size, 0,
+ 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0,
+ "%s Zero range size is valid\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, NULL, vec_size, 0,
+ 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) < 0,
+ "%s output buffer must be specified with size\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, 0, 0,
+ 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0,
+ "%s output buffer can be 0\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, 0, 0, 0,
+ 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0,
+ "%s output buffer can be 0\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, -1,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0,
+ "%s wrong flag specified\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC | 0xFF,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0,
+ "%s flag has extra bits specified\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0,
+ 0, 0, 0, 0, PAGE_IS_WRITTEN) >= 0,
+ "%s no selection mask is specified\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0,
+ 0, PAGE_IS_WRITTEN, PAGE_IS_WRITTEN, 0, 0) == 0,
+ "%s no return mask is specified\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0,
+ 0, PAGE_IS_WRITTEN, 0, 0, 0x1000) < 0,
+ "%s wrong return mask specified\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, 0xFFF, PAGE_IS_WRITTEN, 0, PAGE_IS_WRITTEN) < 0,
+ "%s mixture of correct and wrong flag\n", __func__);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, 0, 0, PAGEMAP_BITS_ALL, PAGE_IS_WRITTEN) >= 0,
+ "%s PAGEMAP_BITS_ALL can be specified with PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n",
+ __func__);
+
+ /* 2. Clear area with larger vec size */
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ ksft_test_result(ret >= 0, "%s Clear area with larger vec size\n", __func__);
+
+ /* 3. Repeated pattern of written and non-written pages */
+ for (i = 0; i < mem_size; i += 2 * page_size)
+ mem[i]++;
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0,
+ 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result((unsigned long long)ret == mem_size/(page_size * 2),
+ "%s Repeated pattern of written and non-written pages\n", __func__);
+
+ /* 4. Repeated pattern of written and non-written pages in parts */
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ num_pages/2 - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ ret2 = pagemap_ioctl(mem, mem_size, vec, 2, 0, 0, PAGE_IS_WRITTEN, 0, 0,
+ PAGE_IS_WRITTEN);
+ if (ret2 < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno));
+
+ ret3 = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret3 < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret3, errno, strerror(errno));
+
+ ksft_test_result((ret + ret3) == num_pages/2 && ret2 == 2,
+ "%s Repeated pattern of written and non-written pages in parts %ld %ld %ld\n",
+ __func__, ret, ret3, ret2);
+
+ /* 5. Repeated pattern of written and non-written pages max_pages */
+ for (i = 0; i < mem_size; i += 2 * page_size)
+ mem[i]++;
+ mem[(mem_size/page_size - 1) * page_size]++;
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ num_pages/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ ret2 = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret2 < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno));
+
+ ksft_test_result(ret == num_pages/2 && ret2 == 1,
+ "%s Repeated pattern of written and non-written pages max_pages\n",
+ __func__);
+
+ /* 6. only get 2 dirty pages and clear them as well */
+ vec_size = mem_size/page_size;
+ memset(mem, -1, mem_size);
+
+ /* get and clear second and third pages */
+ ret = pagemap_ioctl(mem + page_size, 2 * page_size, vec, 1,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ ret2 = pagemap_ioctl(mem, mem_size, vec2, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret2 < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret2, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec[0]) == 2 &&
+ vec[0].start == (uintptr_t)(mem + page_size) &&
+ ret2 == 2 && LEN(vec2[0]) == 1 && vec2[0].start == (uintptr_t)mem &&
+ LEN(vec2[1]) == vec_size - 3 &&
+ vec2[1].start == (uintptr_t)(mem + 3 * page_size),
+ "%s only get 2 written pages and clear them as well\n", __func__);
+
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ /* 7. Two regions */
+ m[0] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (m[0] == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+ m[1] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (m[1] == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+
+ wp_init(m[0], mem_size);
+ wp_init(m[1], mem_size);
+ wp_addr_range(m[0], mem_size);
+ wp_addr_range(m[1], mem_size);
+
+ memset(m[0], 'a', mem_size);
+ memset(m[1], 'b', mem_size);
+
+ wp_addr_range(m[0], mem_size);
+
+ ret = pagemap_ioctl(m[1], mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0,
+ PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec[0]) == mem_size/page_size,
+ "%s Two regions\n", __func__);
+
+ wp_free(m[0], mem_size);
+ wp_free(m[1], mem_size);
+ munmap(m[0], mem_size);
+ munmap(m[1], mem_size);
+
+ free(vec);
+ free(vec2);
+
+ /* 8. Smaller vec */
+ mem_size = 1050 * page_size;
+ vec_size = mem_size/(page_size*2);
+
+ vec = malloc(sizeof(struct page_region) * vec_size);
+ if (!vec)
+ ksft_exit_fail_msg("error nomem\n");
+
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ for (i = 0; i < mem_size/page_size; i += 2)
+ mem[i * page_size]++;
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ total_pages += ret;
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ total_pages += ret;
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ total_pages += ret;
+
+ ksft_test_result(total_pages == mem_size/(page_size*2), "%s Smaller max_pages\n", __func__);
+
+ free(vec);
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+ total_pages = 0;
+
+ /* 9. Smaller vec */
+ mem_size = 10000 * page_size;
+ vec_size = 50;
+
+ vec = malloc(sizeof(struct page_region) * vec_size);
+ if (!vec)
+ ksft_exit_fail_msg("error nomem\n");
+
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ for (count = 0; count < TEST_ITERATIONS; count++) {
+ total_writes = total_reads = 0;
+ walk_end = (long)mem;
+
+ for (i = 0; i < mem_size; i += page_size) {
+ if (rand() % 2) {
+ mem[i]++;
+ total_writes++;
+ }
+ }
+
+ while (total_reads < total_writes) {
+ ret = pagemap_ioc((void *)walk_end, mem_size-(walk_end - (long)mem), vec,
+ vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ if ((unsigned long)ret > vec_size)
+ break;
+
+ reads = get_reads(vec, ret);
+ total_reads += reads;
+ }
+
+ if (total_reads != total_writes)
+ break;
+ }
+
+ ksft_test_result(count == TEST_ITERATIONS, "Smaller vec\n");
+
+ free(vec);
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ /* 10. Walk_end tester */
+ vec_size = 1000;
+ mem_size = vec_size * page_size;
+
+ vec = malloc(sizeof(struct page_region) * vec_size);
+ if (!vec)
+ ksft_exit_fail_msg("error nomem\n");
+
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ memset(mem, 0, mem_size);
+
+ ret = pagemap_ioc(mem, 0, vec, vec_size, 0,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 0 && walk_end == (long)mem,
+ "Walk_end: Same start and end address\n");
+
+ ret = pagemap_ioc(mem, 0, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 0 && walk_end == (long)mem,
+ "Walk_end: Same start and end with WP\n");
+
+ ret = pagemap_ioc(mem, 0, vec, 0, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 0 && walk_end == (long)mem,
+ "Walk_end: Same start and end with 0 output buffer\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size),
+ "Walk_end: Big vec\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, 1, 0,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size),
+ "Walk_end: vec of minimum length\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, 1, 0,
+ vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size),
+ "Walk_end: Max pages specified\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size/2),
+ "Walk_end: Half max pages\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size),
+ "Walk_end: 1 max page\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ -1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size),
+ "Walk_end: max pages\n");
+
+ wp_addr_range(mem, mem_size);
+ for (i = 0; i < mem_size; i += 2 * page_size)
+ mem[i]++;
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size),
+ "Walk_end sparse: Big vec\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, 1, 0,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2),
+ "Walk_end sparse: vec of minimum length\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, 1, 0,
+ vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2),
+ "Walk_end sparse: Max pages specified\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size/2, 0,
+ vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size),
+ "Walk_end sparse: Max pages specified\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size),
+ "Walk_end sparse: Max pages specified\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result((unsigned long)ret == vec_size/2 && walk_end == (long)(mem + mem_size),
+ "Walk_endsparse : Half max pages\n");
+
+ ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0,
+ 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+ ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2),
+ "Walk_end: 1 max page\n");
+
+ free(vec);
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ return 0;
+}
+
+int base_tests(char *prefix, char *mem, unsigned long long mem_size, int skip)
+{
+ unsigned long long vec_size;
+ int written;
+ struct page_region *vec, *vec2;
+
+ if (skip) {
+ ksft_test_result_skip("%s all new pages must not be written (dirty)\n", prefix);
+ ksft_test_result_skip("%s all pages must be written (dirty)\n", prefix);
+ ksft_test_result_skip("%s all pages dirty other than first and the last one\n",
+ prefix);
+ ksft_test_result_skip("%s PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", prefix);
+ ksft_test_result_skip("%s only middle page dirty\n", prefix);
+ ksft_test_result_skip("%s only two middle pages dirty\n", prefix);
+ return 0;
+ }
+
+ vec_size = mem_size/page_size;
+ vec = malloc(sizeof(struct page_region) * vec_size);
+ vec2 = malloc(sizeof(struct page_region) * vec_size);
+
+ /* 1. all new pages must be not be written (dirty) */
+ written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", prefix);
+
+ /* 2. all pages must be written */
+ memset(mem, -1, mem_size);
+
+ written = pagemap_ioctl(mem, mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0,
+ PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written == 1 && LEN(vec[0]) == mem_size/page_size,
+ "%s all pages must be written (dirty)\n", prefix);
+
+ /* 3. all pages dirty other than first and the last one */
+ written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ memset(mem + page_size, 0, mem_size - (2 * page_size));
+
+ written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written == 1 && LEN(vec[0]) >= vec_size - 2 && LEN(vec[0]) <= vec_size,
+ "%s all pages dirty other than first and the last one\n", prefix);
+
+ written = pagemap_ioctl(mem, mem_size, vec, 1, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written == 0,
+ "%s PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", prefix);
+
+ /* 4. only middle page dirty */
+ written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ mem[vec_size/2 * page_size]++;
+
+ written = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN,
+ 0, 0, PAGE_IS_WRITTEN);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written == 1 && LEN(vec[0]) >= 1,
+ "%s only middle page dirty\n", prefix);
+
+ /* 5. only two middle pages dirty and walk over only middle pages */
+ written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN | PAGE_IS_HUGE);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ mem[vec_size/2 * page_size]++;
+ mem[(vec_size/2 + 1) * page_size]++;
+
+ written = pagemap_ioctl(&mem[vec_size/2 * page_size], 2 * page_size, vec, 1, 0,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN | PAGE_IS_HUGE);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written == 1 && vec[0].start == (uintptr_t)(&mem[vec_size/2 * page_size])
+ && LEN(vec[0]) == 2,
+ "%s only two middle pages dirty\n", prefix);
+
+ free(vec);
+ free(vec2);
+ return 0;
+}
+
+void *gethugepage(int map_size)
+{
+ int ret;
+ char *map;
+
+ map = memalign(hpage_size, map_size);
+ if (!map)
+ ksft_exit_fail_msg("memalign failed %d %s\n", errno, strerror(errno));
+
+ ret = madvise(map, map_size, MADV_HUGEPAGE);
+ if (ret)
+ return NULL;
+
+ memset(map, 0, map_size);
+
+ return map;
+}
+
+int hpage_unit_tests(void)
+{
+ char *map;
+ int ret, ret2;
+ size_t num_pages = 10;
+ unsigned long long map_size = hpage_size * num_pages;
+ unsigned long long vec_size = map_size/page_size;
+ struct page_region *vec, *vec2;
+
+ vec = malloc(sizeof(struct page_region) * vec_size);
+ vec2 = malloc(sizeof(struct page_region) * vec_size);
+ if (!vec || !vec2)
+ ksft_exit_fail_msg("malloc failed\n");
+
+ map = gethugepage(map_size);
+ if (map) {
+ wp_init(map, map_size);
+ wp_addr_range(map, map_size);
+
+ /* 1. all new huge page must not be written (dirty) */
+ ret = pagemap_ioctl(map, map_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 0, "%s all new huge page must not be written (dirty)\n",
+ __func__);
+
+ /* 2. all the huge page must not be written */
+ ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 0, "%s all the huge page must not be written\n", __func__);
+
+ /* 3. all the huge page must be written and clear dirty as well */
+ memset(map, -1, map_size);
+ ret = pagemap_ioctl(map, map_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && vec[0].start == (uintptr_t)map &&
+ LEN(vec[0]) == vec_size && vec[0].categories == PAGE_IS_WRITTEN,
+ "%s all the huge page must be written and clear\n", __func__);
+
+ /* 4. only middle page written */
+ wp_free(map, map_size);
+ free(map);
+ map = gethugepage(map_size);
+ wp_init(map, map_size);
+ wp_addr_range(map, map_size);
+ map[vec_size/2 * page_size]++;
+
+ ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec[0]) > 0,
+ "%s only middle page written\n", __func__);
+
+ wp_free(map, map_size);
+ free(map);
+ } else {
+ ksft_test_result_skip("%s all new huge page must be written\n", __func__);
+ ksft_test_result_skip("%s all the huge page must not be written\n", __func__);
+ ksft_test_result_skip("%s all the huge page must be written and clear\n", __func__);
+ ksft_test_result_skip("%s only middle page written\n", __func__);
+ }
+
+ /* 5. clear first half of huge page */
+ map = gethugepage(map_size);
+ if (map) {
+ wp_init(map, map_size);
+ wp_addr_range(map, map_size);
+
+ memset(map, 0, map_size);
+
+ wp_addr_range(map, map_size/2);
+
+ ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2 &&
+ vec[0].start == (uintptr_t)(map + map_size/2),
+ "%s clear first half of huge page\n", __func__);
+ wp_free(map, map_size);
+ free(map);
+ } else {
+ ksft_test_result_skip("%s clear first half of huge page\n", __func__);
+ }
+
+ /* 6. clear first half of huge page with limited buffer */
+ map = gethugepage(map_size);
+ if (map) {
+ wp_init(map, map_size);
+ wp_addr_range(map, map_size);
+
+ memset(map, 0, map_size);
+
+ ret = pagemap_ioctl(map, map_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2 &&
+ vec[0].start == (uintptr_t)(map + map_size/2),
+ "%s clear first half of huge page with limited buffer\n",
+ __func__);
+ wp_free(map, map_size);
+ free(map);
+ } else {
+ ksft_test_result_skip("%s clear first half of huge page with limited buffer\n",
+ __func__);
+ }
+
+ /* 7. clear second half of huge page */
+ map = gethugepage(map_size);
+ if (map) {
+ wp_init(map, map_size);
+ wp_addr_range(map, map_size);
+
+ memset(map, -1, map_size);
+
+ ret = pagemap_ioctl(map + map_size/2, map_size/2, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, vec_size/2,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2,
+ "%s clear second half huge page\n", __func__);
+ wp_free(map, map_size);
+ free(map);
+ } else {
+ ksft_test_result_skip("%s clear second half huge page\n", __func__);
+ }
+
+ /* 8. get half huge page */
+ map = gethugepage(map_size);
+ if (map) {
+ wp_init(map, map_size);
+ wp_addr_range(map, map_size);
+
+ memset(map, -1, map_size);
+ usleep(100);
+
+ ret = pagemap_ioctl(map, map_size, vec, 1,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ hpage_size/(2*page_size), PAGE_IS_WRITTEN, 0, 0,
+ PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec[0]) == hpage_size/(2*page_size),
+ "%s get half huge page\n", __func__);
+
+ ret2 = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
+ if (ret2 < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno));
+
+ ksft_test_result(ret2 == 1 && LEN(vec[0]) == (map_size - hpage_size/2)/page_size,
+ "%s get half huge page\n", __func__);
+
+ wp_free(map, map_size);
+ free(map);
+ } else {
+ ksft_test_result_skip("%s get half huge page\n", __func__);
+ ksft_test_result_skip("%s get half huge page\n", __func__);
+ }
+
+ free(vec);
+ free(vec2);
+ return 0;
+}
+
+int unmapped_region_tests(void)
+{
+ void *start = (void *)0x10000000;
+ int written, len = 0x00040000;
+ long vec_size = len / page_size;
+ struct page_region *vec = malloc(sizeof(struct page_region) * vec_size);
+
+ /* 1. Get written pages */
+ written = pagemap_ioctl(start, len, vec, vec_size, 0, 0,
+ PAGEMAP_NON_WRITTEN_BITS, 0, 0, PAGEMAP_NON_WRITTEN_BITS);
+ if (written < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno));
+
+ ksft_test_result(written >= 0, "%s Get status of pages\n", __func__);
+
+ free(vec);
+ return 0;
+}
+
+static void test_simple(void)
+{
+ int i;
+ char *map;
+ struct page_region vec;
+
+ map = aligned_alloc(page_size, page_size);
+ if (!map)
+ ksft_exit_fail_msg("aligned_alloc failed\n");
+
+ wp_init(map, page_size);
+ wp_addr_range(map, page_size);
+
+ for (i = 0 ; i < TEST_ITERATIONS; i++) {
+ if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 1) {
+ ksft_print_msg("written bit was 1, but should be 0 (i=%d)\n", i);
+ break;
+ }
+
+ wp_addr_range(map, page_size);
+ /* Write something to the page to get the written bit enabled on the page */
+ map[0]++;
+
+ if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 0) {
+ ksft_print_msg("written bit was 0, but should be 1 (i=%d)\n", i);
+ break;
+ }
+
+ wp_addr_range(map, page_size);
+ }
+ wp_free(map, page_size);
+ free(map);
+
+ ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__);
+}
+
+int sanity_tests(void)
+{
+ unsigned long long mem_size, vec_size;
+ long ret, fd, i, buf_size;
+ struct page_region *vec;
+ char *mem, *fmem;
+ struct stat sbuf;
+ char *tmp_buf;
+
+ /* 1. wrong operation */
+ mem_size = 10 * page_size;
+ vec_size = mem_size / page_size;
+
+ vec = malloc(sizeof(struct page_region) * vec_size);
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED || vec == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC,
+ 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) >= 0,
+ "%s WP op can be specified with !PAGE_IS_WRITTEN\n", __func__);
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) >= 0,
+ "%s required_mask specified\n", __func__);
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ 0, PAGEMAP_BITS_ALL, 0, PAGEMAP_BITS_ALL) >= 0,
+ "%s anyof_mask specified\n", __func__);
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ 0, 0, PAGEMAP_BITS_ALL, PAGEMAP_BITS_ALL) >= 0,
+ "%s excluded_mask specified\n", __func__);
+ ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ PAGEMAP_BITS_ALL, PAGEMAP_BITS_ALL, 0,
+ PAGEMAP_BITS_ALL) >= 0,
+ "%s required_mask and anyof_mask specified\n", __func__);
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ /* 2. Get sd and present pages with anyof_mask */
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ memset(mem, 0, mem_size);
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ 0, PAGEMAP_BITS_ALL, 0, PAGEMAP_BITS_ALL);
+ ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size &&
+ (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) ==
+ (PAGE_IS_WRITTEN | PAGE_IS_PRESENT),
+ "%s Get sd and present pages with anyof_mask\n", __func__);
+
+ /* 3. Get sd and present pages with required_mask */
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL);
+ ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size &&
+ (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) ==
+ (PAGE_IS_WRITTEN | PAGE_IS_PRESENT),
+ "%s Get all the pages with required_mask\n", __func__);
+
+ /* 4. Get sd and present pages with required_mask and anyof_mask */
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, PAGE_IS_PRESENT, 0, PAGEMAP_BITS_ALL);
+ ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size &&
+ (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) ==
+ (PAGE_IS_WRITTEN | PAGE_IS_PRESENT),
+ "%s Get sd and present pages with required_mask and anyof_mask\n",
+ __func__);
+
+ /* 5. Don't get sd pages */
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, 0, PAGE_IS_WRITTEN, PAGEMAP_BITS_ALL);
+ ksft_test_result(ret == 0, "%s Don't get sd pages\n", __func__);
+
+ /* 6. Don't get present pages */
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0,
+ PAGE_IS_PRESENT, 0, PAGE_IS_PRESENT, PAGEMAP_BITS_ALL);
+ ksft_test_result(ret == 0, "%s Don't get present pages\n", __func__);
+
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ /* 8. Find written present pages with return mask */
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ memset(mem, 0, mem_size);
+
+ ret = pagemap_ioctl(mem, mem_size, vec, vec_size,
+ PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0,
+ 0, PAGEMAP_BITS_ALL, 0, PAGE_IS_WRITTEN);
+ ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size &&
+ vec[0].categories == PAGE_IS_WRITTEN,
+ "%s Find written present pages with return mask\n", __func__);
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ /* 9. Memory mapped file */
+ fd = open(progname, O_RDONLY);
+ if (fd < 0)
+ ksft_exit_fail_msg("%s Memory mapped file\n", __func__);
+
+ ret = stat(progname, &sbuf);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno));
+
+ fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (fmem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
+
+ tmp_buf = malloc(sbuf.st_size);
+ memcpy(tmp_buf, fmem, sbuf.st_size);
+
+ ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0,
+ 0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS);
+
+ ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem &&
+ LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) &&
+ (vec[0].categories & PAGE_IS_FILE),
+ "%s Memory mapped file\n", __func__);
+
+ munmap(fmem, sbuf.st_size);
+ close(fd);
+
+ /* 10. Create and read/write to a memory mapped file */
+ buf_size = page_size * 10;
+
+ fd = open(__FILE__".tmp2", O_RDWR | O_CREAT, 0666);
+ if (fd < 0)
+ ksft_exit_fail_msg("Read/write to memory: %s\n",
+ strerror(errno));
+
+ for (i = 0; i < buf_size; i++)
+ if (write(fd, "c", 1) < 0)
+ ksft_exit_fail_msg("Create and read/write to a memory mapped file\n");
+
+ fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ if (fmem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
+
+ wp_init(fmem, buf_size);
+ wp_addr_range(fmem, buf_size);
+
+ for (i = 0; i < buf_size; i++)
+ fmem[i] = 'z';
+
+ msync(fmem, buf_size, MS_SYNC);
+
+ ret = pagemap_ioctl(fmem, buf_size, vec, vec_size, 0, 0,
+ PAGE_IS_WRITTEN, PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_FILE, 0,
+ PAGEMAP_BITS_ALL);
+
+ ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem &&
+ LEN(vec[0]) == (buf_size/page_size) &&
+ (vec[0].categories & PAGE_IS_WRITTEN),
+ "%s Read/write to memory\n", __func__);
+
+ wp_free(fmem, buf_size);
+ munmap(fmem, buf_size);
+ close(fd);
+
+ free(vec);
+ return 0;
+}
+
+int mprotect_tests(void)
+{
+ int ret;
+ char *mem, *mem2;
+ struct page_region vec;
+ int pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+
+ if (pagemap_fd < 0) {
+ fprintf(stderr, "open() failed\n");
+ exit(1);
+ }
+
+ /* 1. Map two pages */
+ mem = mmap(0, 2 * page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+ wp_init(mem, 2 * page_size);
+ wp_addr_range(mem, 2 * page_size);
+
+ /* Populate both pages. */
+ memset(mem, 1, 2 * page_size);
+
+ ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN,
+ 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec) == 2, "%s Both pages written\n", __func__);
+
+ /* 2. Start tracking */
+ wp_addr_range(mem, 2 * page_size);
+
+ ksft_test_result(pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0,
+ PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 0,
+ "%s Both pages are not written (dirty)\n", __func__);
+
+ /* 3. Remap the second page */
+ mem2 = mmap(mem + page_size, page_size, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANON|MAP_FIXED, -1, 0);
+ if (mem2 == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+ wp_init(mem2, page_size);
+ wp_addr_range(mem2, page_size);
+
+ /* Protect + unprotect. */
+ mprotect(mem, page_size, PROT_NONE);
+ mprotect(mem, 2 * page_size, PROT_READ);
+ mprotect(mem, 2 * page_size, PROT_READ|PROT_WRITE);
+
+ /* Modify both pages. */
+ memset(mem, 2, 2 * page_size);
+
+ /* Protect + unprotect. */
+ mprotect(mem, page_size, PROT_NONE);
+ mprotect(mem, page_size, PROT_READ);
+ mprotect(mem, page_size, PROT_READ|PROT_WRITE);
+
+ ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN,
+ 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec) == 2,
+ "%s Both pages written after remap and mprotect\n", __func__);
+
+ /* 4. Clear and make the pages written */
+ wp_addr_range(mem, 2 * page_size);
+
+ memset(mem, 'A', 2 * page_size);
+
+ ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN,
+ 0, 0, PAGE_IS_WRITTEN);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ ksft_test_result(ret == 1 && LEN(vec) == 2,
+ "%s Clear and make the pages written\n", __func__);
+
+ wp_free(mem, 2 * page_size);
+ munmap(mem, 2 * page_size);
+ return 0;
+}
+
+/* transact test */
+static const unsigned int nthreads = 6, pages_per_thread = 32, access_per_thread = 8;
+static pthread_barrier_t start_barrier, end_barrier;
+static unsigned int extra_thread_faults;
+static unsigned int iter_count = 1000;
+static volatile int finish;
+
+static ssize_t get_dirty_pages_reset(char *mem, unsigned int count,
+ int reset, int page_size)
+{
+ struct pm_scan_arg arg = {0};
+ struct page_region rgns[256];
+ unsigned long long i, j;
+ long ret;
+ int cnt;
+
+ arg.size = sizeof(struct pm_scan_arg);
+ arg.start = (uintptr_t)mem;
+ arg.max_pages = count;
+ arg.end = (uintptr_t)(mem + count * page_size);
+ arg.vec = (uintptr_t)rgns;
+ arg.vec_len = sizeof(rgns) / sizeof(*rgns);
+ if (reset)
+ arg.flags |= PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC;
+ arg.category_mask = PAGE_IS_WRITTEN;
+ arg.return_mask = PAGE_IS_WRITTEN;
+
+ ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
+ if (ret < 0)
+ ksft_exit_fail_msg("ioctl failed\n");
+
+ cnt = 0;
+ for (i = 0; i < (unsigned long)ret; ++i) {
+ if (rgns[i].categories != PAGE_IS_WRITTEN)
+ ksft_exit_fail_msg("wrong flags\n");
+
+ for (j = 0; j < LEN(rgns[i]); ++j)
+ cnt++;
+ }
+
+ return cnt;
+}
+
+void *thread_proc(void *mem)
+{
+ int *m = mem;
+ long curr_faults, faults;
+ struct rusage r;
+ unsigned int i;
+ int ret;
+
+ if (getrusage(RUSAGE_THREAD, &r))
+ ksft_exit_fail_msg("getrusage\n");
+
+ curr_faults = r.ru_minflt;
+
+ while (!finish) {
+ ret = pthread_barrier_wait(&start_barrier);
+ if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD)
+ ksft_exit_fail_msg("pthread_barrier_wait\n");
+
+ for (i = 0; i < access_per_thread; ++i)
+ __atomic_add_fetch(m + i * (0x1000 / sizeof(*m)), 1, __ATOMIC_SEQ_CST);
+
+ ret = pthread_barrier_wait(&end_barrier);
+ if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD)
+ ksft_exit_fail_msg("pthread_barrier_wait\n");
+
+ if (getrusage(RUSAGE_THREAD, &r))
+ ksft_exit_fail_msg("getrusage\n");
+
+ faults = r.ru_minflt - curr_faults;
+ if (faults < access_per_thread)
+ ksft_exit_fail_msg("faults < access_per_thread");
+
+ __atomic_add_fetch(&extra_thread_faults, faults - access_per_thread,
+ __ATOMIC_SEQ_CST);
+ curr_faults = r.ru_minflt;
+ }
+
+ return NULL;
+}
+
+static void transact_test(int page_size)
+{
+ unsigned int i, count, extra_pages;
+ unsigned int c;
+ pthread_t th;
+ char *mem;
+ int ret;
+
+ if (pthread_barrier_init(&start_barrier, NULL, nthreads + 1))
+ ksft_exit_fail_msg("pthread_barrier_init\n");
+
+ if (pthread_barrier_init(&end_barrier, NULL, nthreads + 1))
+ ksft_exit_fail_msg("pthread_barrier_init\n");
+
+ mem = mmap(NULL, 0x1000 * nthreads * pages_per_thread, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("Error mmap %s.\n", strerror(errno));
+
+ wp_init(mem, 0x1000 * nthreads * pages_per_thread);
+ wp_addr_range(mem, 0x1000 * nthreads * pages_per_thread);
+
+ memset(mem, 0, 0x1000 * nthreads * pages_per_thread);
+
+ count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size);
+ ksft_test_result(count > 0, "%s count %u\n", __func__, count);
+ count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size);
+ ksft_test_result(count == 0, "%s count %u\n", __func__, count);
+
+ finish = 0;
+ for (i = 0; i < nthreads; ++i)
+ pthread_create(&th, NULL, thread_proc, mem + 0x1000 * i * pages_per_thread);
+
+ extra_pages = 0;
+ for (i = 0; i < iter_count; ++i) {
+ count = 0;
+
+ ret = pthread_barrier_wait(&start_barrier);
+ if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD)
+ ksft_exit_fail_msg("pthread_barrier_wait\n");
+
+ count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1,
+ page_size);
+
+ ret = pthread_barrier_wait(&end_barrier);
+ if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD)
+ ksft_exit_fail_msg("pthread_barrier_wait\n");
+
+ if (count > nthreads * access_per_thread)
+ ksft_exit_fail_msg("Too big count %u expected %u, iter %u\n",
+ count, nthreads * access_per_thread, i);
+
+ c = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size);
+ count += c;
+
+ if (c > nthreads * access_per_thread) {
+ ksft_test_result_fail(" %s count > nthreads\n", __func__);
+ return;
+ }
+
+ if (count != nthreads * access_per_thread) {
+ /*
+ * The purpose of the test is to make sure that no page updates are lost
+ * when the page updates and read-resetting soft dirty flags are performed
+ * in parallel. However, it is possible that the application will get the
+ * soft dirty flags twice on the two consecutive read-resets. This seems
+ * unavoidable as soft dirty flag is handled in software through page faults
+ * in kernel. While the updating the flags is supposed to be synchronized
+ * between page fault handling and read-reset, it is possible that
+ * read-reset happens after page fault PTE update but before the application
+ * re-executes write instruction. So read-reset gets the flag, clears write
+ * access and application gets page fault again for the same write.
+ */
+ if (count < nthreads * access_per_thread) {
+ ksft_test_result_fail("Lost update, iter %u, %u vs %u.\n", i, count,
+ nthreads * access_per_thread);
+ return;
+ }
+
+ extra_pages += count - nthreads * access_per_thread;
+ }
+ }
+
+ pthread_barrier_wait(&start_barrier);
+ finish = 1;
+ pthread_barrier_wait(&end_barrier);
+
+ ksft_test_result_pass("%s Extra pages %u (%.1lf%%), extra thread faults %u.\n", __func__,
+ extra_pages,
+ 100.0 * extra_pages / (iter_count * nthreads * access_per_thread),
+ extra_thread_faults);
+}
+
+int main(int __attribute__((unused)) argc, char *argv[])
+{
+ int shmid, buf_size, fd, i, ret;
+ unsigned long long mem_size;
+ char *mem, *map, *fmem;
+ struct stat sbuf;
+
+ progname = argv[0];
+
+ ksft_print_header();
+
+ if (init_uffd())
+ ksft_exit_pass();
+
+ ksft_set_plan(115);
+
+ page_size = getpagesize();
+ hpage_size = read_pmd_pagesize();
+
+ pagemap_fd = open(PAGEMAP, O_RDONLY);
+ if (pagemap_fd < 0)
+ return -EINVAL;
+
+ /* 1. Sanity testing */
+ sanity_tests_sd();
+
+ /* 2. Normal page testing */
+ mem_size = 10 * page_size;
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ base_tests("Page testing:", mem, mem_size, 0);
+
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ /* 3. Large page testing */
+ mem_size = 512 * 10 * page_size;
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (mem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem\n");
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ base_tests("Large Page testing:", mem, mem_size, 0);
+
+ wp_free(mem, mem_size);
+ munmap(mem, mem_size);
+
+ /* 4. Huge page testing */
+ map = gethugepage(hpage_size);
+ if (map) {
+ wp_init(map, hpage_size);
+ wp_addr_range(map, hpage_size);
+ base_tests("Huge page testing:", map, hpage_size, 0);
+ wp_free(map, hpage_size);
+ free(map);
+ } else {
+ base_tests("Huge page testing:", NULL, 0, 1);
+ }
+
+ /* 5. SHM Hugetlb page testing */
+ mem_size = 2*1024*1024;
+ mem = gethugetlb_mem(mem_size, &shmid);
+ if (mem) {
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ base_tests("Hugetlb shmem testing:", mem, mem_size, 0);
+
+ wp_free(mem, mem_size);
+ shmctl(shmid, IPC_RMID, NULL);
+ } else {
+ base_tests("Hugetlb shmem testing:", NULL, 0, 1);
+ }
+
+ /* 6. Hugetlb page testing */
+ mem = gethugetlb_mem(mem_size, NULL);
+ if (mem) {
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ base_tests("Hugetlb mem testing:", mem, mem_size, 0);
+
+ wp_free(mem, mem_size);
+ } else {
+ base_tests("Hugetlb mem testing:", NULL, 0, 1);
+ }
+
+ /* 7. File Hugetlb testing */
+ mem_size = 2*1024*1024;
+ fd = memfd_create("uffd-test", MFD_HUGETLB | MFD_NOEXEC_SEAL);
+ if (fd < 0)
+ ksft_exit_fail_msg("uffd-test creation failed %d %s\n", errno, strerror(errno));
+ mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (mem != MAP_FAILED) {
+ wp_init(mem, mem_size);
+ wp_addr_range(mem, mem_size);
+
+ base_tests("Hugetlb shmem testing:", mem, mem_size, 0);
+
+ wp_free(mem, mem_size);
+ shmctl(shmid, IPC_RMID, NULL);
+ } else {
+ base_tests("Hugetlb shmem testing:", NULL, 0, 1);
+ }
+ close(fd);
+
+ /* 8. File memory testing */
+ buf_size = page_size * 10;
+
+ fd = open(__FILE__".tmp0", O_RDWR | O_CREAT, 0777);
+ if (fd < 0)
+ ksft_exit_fail_msg("Create and read/write to a memory mapped file: %s\n",
+ strerror(errno));
+
+ for (i = 0; i < buf_size; i++)
+ if (write(fd, "c", 1) < 0)
+ ksft_exit_fail_msg("Create and read/write to a memory mapped file\n");
+
+ ret = stat(__FILE__".tmp0", &sbuf);
+ if (ret < 0)
+ ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno));
+
+ fmem = mmap(NULL, sbuf.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ if (fmem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
+
+ wp_init(fmem, sbuf.st_size);
+ wp_addr_range(fmem, sbuf.st_size);
+
+ base_tests("File memory testing:", fmem, sbuf.st_size, 0);
+
+ wp_free(fmem, sbuf.st_size);
+ munmap(fmem, sbuf.st_size);
+ close(fd);
+
+ /* 9. File memory testing */
+ buf_size = page_size * 10;
+
+ fd = memfd_create(__FILE__".tmp00", MFD_NOEXEC_SEAL);
+ if (fd < 0)
+ ksft_exit_fail_msg("Create and read/write to a memory mapped file: %s\n",
+ strerror(errno));
+
+ if (ftruncate(fd, buf_size))
+ ksft_exit_fail_msg("Error ftruncate\n");
+
+ for (i = 0; i < buf_size; i++)
+ if (write(fd, "c", 1) < 0)
+ ksft_exit_fail_msg("Create and read/write to a memory mapped file\n");
+
+ fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ if (fmem == MAP_FAILED)
+ ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
+
+ wp_init(fmem, buf_size);
+ wp_addr_range(fmem, buf_size);
+
+ base_tests("File anonymous memory testing:", fmem, buf_size, 0);
+
+ wp_free(fmem, buf_size);
+ munmap(fmem, buf_size);
+ close(fd);
+
+ /* 10. Huge page tests */
+ hpage_unit_tests();
+
+ /* 11. Iterative test */
+ test_simple();
+
+ /* 12. Mprotect test */
+ mprotect_tests();
+
+ /* 13. Transact test */
+ transact_test(page_size);
+
+ /* 14. Sanity testing */
+ sanity_tests();
+
+ /*15. Unmapped address test */
+ unmapped_region_tests();
+
+ /* 16. Userfaultfd tests */
+ userfaultfd_tests();
+
+ close(pagemap_fd);
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c
new file mode 100644
index 000000000000..866ac023baf5
--- /dev/null
+++ b/tools/testing/selftests/mm/pfnmap.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Basic VM_PFNMAP tests relying on mmap() of '/dev/mem'
+ *
+ * Copyright 2025, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "../kselftest_harness.h"
+#include "vm_util.h"
+
+static sigjmp_buf sigjmp_buf_env;
+
+static void signal_handler(int sig)
+{
+ siglongjmp(sigjmp_buf_env, -EFAULT);
+}
+
+static int test_read_access(char *addr, size_t size, size_t pagesize)
+{
+ size_t offs;
+ int ret;
+
+ if (signal(SIGSEGV, signal_handler) == SIG_ERR)
+ return -EINVAL;
+
+ ret = sigsetjmp(sigjmp_buf_env, 1);
+ if (!ret) {
+ for (offs = 0; offs < size; offs += pagesize)
+ /* Force a read that the compiler cannot optimize out. */
+ *((volatile char *)(addr + offs));
+ }
+ if (signal(SIGSEGV, SIG_DFL) == SIG_ERR)
+ return -EINVAL;
+
+ return ret;
+}
+
+static int find_ram_target(off_t *phys_addr,
+ unsigned long long pagesize)
+{
+ unsigned long long start, end;
+ char line[80], *end_ptr;
+ FILE *file;
+
+ /* Search /proc/iomem for the first suitable "System RAM" range. */
+ file = fopen("/proc/iomem", "r");
+ if (!file)
+ return -errno;
+
+ while (fgets(line, sizeof(line), file)) {
+ /* Ignore any child nodes. */
+ if (!isalnum(line[0]))
+ continue;
+
+ if (!strstr(line, "System RAM\n"))
+ continue;
+
+ start = strtoull(line, &end_ptr, 16);
+ /* Skip over the "-" */
+ end_ptr++;
+ /* Make end "exclusive". */
+ end = strtoull(end_ptr, NULL, 16) + 1;
+
+ /* Actual addresses are not exported */
+ if (!start && !end)
+ break;
+
+ /* We need full pages. */
+ start = (start + pagesize - 1) & ~(pagesize - 1);
+ end &= ~(pagesize - 1);
+
+ if (start != (off_t)start)
+ break;
+
+ /* We need two pages. */
+ if (end > start + 2 * pagesize) {
+ fclose(file);
+ *phys_addr = start;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+FIXTURE(pfnmap)
+{
+ off_t phys_addr;
+ size_t pagesize;
+ int dev_mem_fd;
+ char *addr1;
+ size_t size1;
+ char *addr2;
+ size_t size2;
+};
+
+FIXTURE_SETUP(pfnmap)
+{
+ self->pagesize = getpagesize();
+
+ /* We'll require two physical pages throughout our tests ... */
+ if (find_ram_target(&self->phys_addr, self->pagesize))
+ SKIP(return, "Cannot find ram target in '/proc/iomem'\n");
+
+ self->dev_mem_fd = open("/dev/mem", O_RDONLY);
+ if (self->dev_mem_fd < 0)
+ SKIP(return, "Cannot open '/dev/mem'\n");
+
+ self->size1 = self->pagesize * 2;
+ self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED,
+ self->dev_mem_fd, self->phys_addr);
+ if (self->addr1 == MAP_FAILED)
+ SKIP(return, "Cannot mmap '/dev/mem'\n");
+
+ /* ... and want to be able to read from them. */
+ if (test_read_access(self->addr1, self->size1, self->pagesize))
+ SKIP(return, "Cannot read-access mmap'ed '/dev/mem'\n");
+
+ self->size2 = 0;
+ self->addr2 = MAP_FAILED;
+}
+
+FIXTURE_TEARDOWN(pfnmap)
+{
+ if (self->addr2 != MAP_FAILED)
+ munmap(self->addr2, self->size2);
+ if (self->addr1 != MAP_FAILED)
+ munmap(self->addr1, self->size1);
+ if (self->dev_mem_fd >= 0)
+ close(self->dev_mem_fd);
+}
+
+TEST_F(pfnmap, madvise_disallowed)
+{
+ int advices[] = {
+ MADV_DONTNEED,
+ MADV_DONTNEED_LOCKED,
+ MADV_FREE,
+ MADV_WIPEONFORK,
+ MADV_COLD,
+ MADV_PAGEOUT,
+ MADV_POPULATE_READ,
+ MADV_POPULATE_WRITE,
+ };
+ int i;
+
+ /* All these advices must be rejected. */
+ for (i = 0; i < ARRAY_SIZE(advices); i++) {
+ EXPECT_LT(madvise(self->addr1, self->pagesize, advices[i]), 0);
+ EXPECT_EQ(errno, EINVAL);
+ }
+}
+
+TEST_F(pfnmap, munmap_split)
+{
+ /*
+ * Unmap the first page. This munmap() call is not really expected to
+ * fail, but we might be able to trigger other internal issues.
+ */
+ ASSERT_EQ(munmap(self->addr1, self->pagesize), 0);
+
+ /*
+ * Remap the first page while the second page is still mapped. This
+ * makes sure that any PAT tracking on x86 will allow for mmap()'ing
+ * a page again while some parts of the first mmap() are still
+ * around.
+ */
+ self->size2 = self->pagesize;
+ self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED,
+ self->dev_mem_fd, self->phys_addr);
+ ASSERT_NE(self->addr2, MAP_FAILED);
+}
+
+TEST_F(pfnmap, mremap_fixed)
+{
+ char *ret;
+
+ /* Reserve a destination area. */
+ self->size2 = self->size1;
+ self->addr2 = mmap(NULL, self->size2, PROT_READ, MAP_ANON | MAP_PRIVATE,
+ -1, 0);
+ ASSERT_NE(self->addr2, MAP_FAILED);
+
+ /* mremap() over our destination. */
+ ret = mremap(self->addr1, self->size1, self->size2,
+ MREMAP_FIXED | MREMAP_MAYMOVE, self->addr2);
+ ASSERT_NE(ret, MAP_FAILED);
+}
+
+TEST_F(pfnmap, mremap_shrink)
+{
+ char *ret;
+
+ /* Shrinking is expected to work. */
+ ret = mremap(self->addr1, self->size1, self->size1 - self->pagesize, 0);
+ ASSERT_NE(ret, MAP_FAILED);
+}
+
+TEST_F(pfnmap, mremap_expand)
+{
+ /*
+ * Growing is not expected to work, and getting it right would
+ * be challenging. So this test primarily serves as an early warning
+ * that something that probably should never work suddenly works.
+ */
+ self->size2 = self->size1 + self->pagesize;
+ self->addr2 = mremap(self->addr1, self->size1, self->size2, MREMAP_MAYMOVE);
+ ASSERT_EQ(self->addr2, MAP_FAILED);
+}
+
+TEST_F(pfnmap, fork)
+{
+ pid_t pid;
+ int ret;
+
+ /* fork() a child and test if the child can access the pages. */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (!pid) {
+ EXPECT_EQ(test_read_access(self->addr1, self->size1,
+ self->pagesize), 0);
+ exit(0);
+ }
+
+ wait(&ret);
+ if (WIFEXITED(ret))
+ ret = WEXITSTATUS(ret);
+ else
+ ret = -EINVAL;
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h
new file mode 100644
index 000000000000..8e9685e03c44
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey-arm64.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Arm Ltd.
+ */
+
+#ifndef _PKEYS_ARM64_H
+#define _PKEYS_ARM64_H
+
+#include "vm_util.h"
+/* for signal frame parsing */
+#include "../arm64/signal/testcases/testcases.h"
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 288
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 289
+# define SYS_pkey_free 290
+#endif
+#define MCONTEXT_IP(mc) mc.pc
+#define MCONTEXT_TRAPNO(mc) -1
+
+#define PKEY_MASK 0xf
+
+#define POE_NONE 0x0
+#define POE_X 0x2
+#define POE_RX 0x3
+#define POE_RWX 0x7
+
+#define NR_PKEYS 8
+#define NR_RESERVED_PKEYS 1 /* pkey-0 */
+
+#define PKEY_REG_ALLOW_ALL 0x77777777
+#define PKEY_REG_ALLOW_NONE 0x0
+
+#define PKEY_BITS_PER_PKEY 4
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+#undef HPAGE_SIZE
+#define HPAGE_SIZE default_huge_page_size()
+
+/* 4-byte instructions * 16384 = 64K page */
+#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
+
+static inline u64 __read_pkey_reg(void)
+{
+ u64 pkey_reg = 0;
+
+ // POR_EL0
+ asm volatile("mrs %0, S3_3_c10_c2_4" : "=r" (pkey_reg));
+
+ return pkey_reg;
+}
+
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+ u64 por = pkey_reg;
+
+ dprintf4("%s() changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+
+ // POR_EL0
+ asm volatile("msr S3_3_c10_c2_4, %0\nisb" :: "r" (por) :);
+
+ dprintf4("%s() pkey register after changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+}
+
+static inline int cpu_has_pkeys(void)
+{
+ /* No simple way to determine this */
+ return 1;
+}
+
+static inline u32 pkey_bit_position(int pkey)
+{
+ return pkey * PKEY_BITS_PER_PKEY;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+ return NR_RESERVED_PKEYS;
+}
+
+static inline void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+}
+
+static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+ return PTR_ERR_ENOTSUP;
+}
+
+#define set_pkey_bits set_pkey_bits
+static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+ u32 shift = pkey_bit_position(pkey);
+ u64 new_val = POE_RWX;
+
+ /* mask out bits from pkey in old value */
+ reg &= ~((u64)PKEY_MASK << shift);
+
+ if (flags & PKEY_DISABLE_ACCESS)
+ new_val = POE_X;
+ else if (flags & PKEY_DISABLE_WRITE)
+ new_val = POE_RX;
+
+ /* OR in new bits for pkey */
+ reg |= new_val << shift;
+
+ return reg;
+}
+
+#define get_pkey_bits get_pkey_bits
+static inline u64 get_pkey_bits(u64 reg, int pkey)
+{
+ u32 shift = pkey_bit_position(pkey);
+ /*
+ * shift down the relevant bits to the lowest four, then
+ * mask off all the other higher bits
+ */
+ u32 perm = (reg >> shift) & PKEY_MASK;
+
+ if (perm == POE_X)
+ return PKEY_DISABLE_ACCESS;
+ if (perm == POE_RX)
+ return PKEY_DISABLE_WRITE;
+ return 0;
+}
+
+static inline void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
+{
+ struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt);
+ struct poe_context *poe_ctx =
+ (struct poe_context *) get_header(ctx, POE_MAGIC,
+ sizeof(uctxt->uc_mcontext), NULL);
+ if (poe_ctx)
+ poe_ctx->por_el0 = pkey;
+}
+
+#endif /* _PKEYS_ARM64_H */
diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
new file mode 100644
index 000000000000..ea404f80e6cb
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey-helpers.h
@@ -0,0 +1,222 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+#include <linux/mman.h>
+#include <linux/types.h>
+
+#include "../kselftest.h"
+
+/* Define some kernel-like types */
+typedef __u8 u8;
+typedef __u16 u16;
+typedef __u32 u32;
+typedef __u64 u64;
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+extern int dprint_in_signal;
+
+extern int test_nr;
+extern int iteration_nr;
+
+#ifdef __GNUC__
+__printf(1, 2)
+#endif
+static inline void sigsafe_printf(const char *format, ...)
+{
+ va_list ap;
+
+ if (!dprint_in_signal) {
+ va_start(ap, format);
+ vprintf(format, ap);
+ va_end(ap);
+ } else {
+ int ret;
+ /*
+ * No printf() functions are signal-safe.
+ * They deadlock easily. Write the format
+ * string to get some output, even if
+ * incomplete.
+ */
+ ret = write(1, format, strlen(format));
+ if (ret < 0)
+ exit(1);
+ }
+}
+#define dprintf_level(level, args...) do { \
+ if (level <= DEBUG_LEVEL) \
+ sigsafe_printf(args); \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern void abort_hooks(void);
+#define pkey_assert(condition) do { \
+ if (!(condition)) { \
+ dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+ __FILE__, __LINE__, \
+ test_nr, iteration_nr); \
+ dprintf0("errno at assert: %d", errno); \
+ abort_hooks(); \
+ exit(__LINE__); \
+ } \
+} while (0)
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+#ifndef noinline
+# define noinline __attribute__((noinline))
+#endif
+#ifndef __maybe_unused
+# define __maybe_unused __attribute__((__unused__))
+#endif
+
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
+int sys_pkey_free(unsigned long pkey);
+int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey);
+
+/* For functions called from protection_keys.c only */
+noinline int read_ptr(int *ptr);
+void expected_pkey_fault(int pkey);
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey);
+void record_pkey_malloc(void *ptr, long size, int prot);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#include "pkey-x86.h"
+#elif defined(__powerpc64__) /* arch */
+#include "pkey-powerpc.h"
+#elif defined(__aarch64__) /* arch */
+#include "pkey-arm64.h"
+#else /* arch */
+#error Architecture not supported
+#endif /* arch */
+
+#ifndef PKEY_MASK
+#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+#endif
+
+/*
+ * FIXME: Remove once the generic PKEY_UNRESTRICTED definition is merged.
+ */
+#ifndef PKEY_UNRESTRICTED
+#define PKEY_UNRESTRICTED 0x0
+#endif
+
+#ifndef set_pkey_bits
+static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+ u32 shift = pkey_bit_position(pkey);
+ /* mask out bits from pkey in old value */
+ reg &= ~((u64)PKEY_MASK << shift);
+ /* OR in new bits for pkey */
+ reg |= (flags & PKEY_MASK) << shift;
+ return reg;
+}
+#endif
+
+#ifndef get_pkey_bits
+static inline u64 get_pkey_bits(u64 reg, int pkey)
+{
+ u32 shift = pkey_bit_position(pkey);
+ /*
+ * shift down the relevant bits to the lowest two, then
+ * mask off all the other higher bits
+ */
+ return ((reg >> shift) & PKEY_MASK);
+}
+#endif
+
+extern u64 shadow_pkey_reg;
+
+static inline u64 _read_pkey_reg(int line)
+{
+ u64 pkey_reg = __read_pkey_reg();
+
+ dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
+ " shadow: %016llx\n",
+ line, pkey_reg, shadow_pkey_reg);
+ assert(pkey_reg == shadow_pkey_reg);
+
+ return pkey_reg;
+}
+
+#define read_pkey_reg() _read_pkey_reg(__LINE__)
+
+static inline void write_pkey_reg(u64 pkey_reg)
+{
+ dprintf4("%s() changing %016llx to %016llx\n", __func__,
+ __read_pkey_reg(), pkey_reg);
+ /* will do the shadow check for us: */
+ read_pkey_reg();
+ __write_pkey_reg(pkey_reg);
+ shadow_pkey_reg = pkey_reg;
+ dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
+ pkey_reg, __read_pkey_reg());
+}
+
+#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to) \
+ ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to) \
+ ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
+{
+#ifdef si_pkey
+ return &si->si_pkey;
+#else
+ return (u32 *)(((u8 *)si) + si_pkey_offset);
+#endif
+}
+
+static inline int kernel_has_pkeys(void)
+{
+ /* try allocating a key and see if it succeeds */
+ int ret = sys_pkey_alloc(0, PKEY_UNRESTRICTED);
+ if (ret <= 0) {
+ return 0;
+ }
+ sys_pkey_free(ret);
+ return 1;
+}
+
+static inline int is_pkeys_supported(void)
+{
+ /* check if the cpu supports pkeys */
+ if (!cpu_has_pkeys()) {
+ dprintf1("SKIP: %s: no CPU support\n", __func__);
+ return 0;
+ }
+
+ /* check if the kernel supports pkeys */
+ if (!kernel_has_pkeys()) {
+ dprintf1("SKIP: %s: no kernel support\n", __func__);
+ return 0;
+ }
+
+ return 1;
+}
+
+#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h
new file mode 100644
index 000000000000..17bf2d1b0192
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey-powerpc.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_POWERPC_H
+#define _PKEYS_POWERPC_H
+
+#include <sys/stat.h>
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 384
+# define SYS_pkey_free 385
+#endif
+#define REG_IP_IDX PT_NIP
+#define MCONTEXT_IP(mc) mc.gp_regs[REG_IP_IDX]
+#define MCONTEXT_TRAPNO(mc) mc.gp_regs[REG_TRAPNO]
+#define REG_TRAPNO PT_TRAP
+#define MCONTEXT_FPREGS
+#define gregs gp_regs
+#define fpregs fp_regs
+#define si_pkey_offset 0x20
+
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+
+#define NR_PKEYS 32
+#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey
+ and 24 other keys that cannot be
+ represented in the PTE */
+#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0,
+ pkey-1 and exec-only key */
+#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1,
+ pkey-31 and exec-only key */
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL << 24)
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+
+static inline u32 pkey_bit_position(int pkey)
+{
+ return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+}
+
+static inline u64 __read_pkey_reg(void)
+{
+ u64 pkey_reg;
+
+ asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
+
+ return pkey_reg;
+}
+
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+ u64 amr = pkey_reg;
+
+ dprintf4("%s() changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+
+ asm volatile("isync; mtspr 0xd, %0; isync"
+ : : "r" ((unsigned long)(amr)) : "memory");
+
+ dprintf4("%s() pkey register after changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+}
+
+static inline int cpu_has_pkeys(void)
+{
+ /* No simple way to determine this */
+ return 1;
+}
+
+static inline bool arch_is_powervm()
+{
+ struct stat buf;
+
+ if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) &&
+ (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) &&
+ (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) )
+ return true;
+
+ return false;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+ if (sysconf(_SC_PAGESIZE) == 4096)
+ return NR_RESERVED_PKEYS_4K;
+ else
+ if (arch_is_powervm())
+ return NR_RESERVED_PKEYS_64K_4KEYS;
+ else
+ return NR_RESERVED_PKEYS_64K_3KEYS;
+}
+
+static inline void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+ /*
+ * powerpc does not allow userspace to change permissions of exec-only
+ * keys since those keys are not allocated by userspace. The signal
+ * handler wont be able to reset the permissions, which means the code
+ * will infinitely continue to segfault here.
+ */
+ return;
+}
+
+#define REPEAT_8(s) s s s s s s s s
+#define REPEAT_64(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) \
+ REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s)
+#define REPEAT_512(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) \
+ REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s)
+#define REPEAT_4096(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) \
+ REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s)
+#define REPEAT_16384(s) REPEAT_4096(s) REPEAT_4096(s) \
+ REPEAT_4096(s) REPEAT_4096(s)
+
+/* 4-byte instructions * 16384 = 64K page */
+#define __page_o_noops() asm(REPEAT_16384("nop\n"))
+
+static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int ret;
+
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+
+ ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+ if (ret) {
+ perror("subpage_perm");
+ return PTR_ERR_ENOTSUP;
+ }
+
+ ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+ pkey_assert(!ret);
+ record_pkey_malloc(ptr, size, prot);
+
+ dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+ return ptr;
+}
+
+#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h
new file mode 100644
index 000000000000..f7ecd335df1e
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey-x86.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_X86_H
+#define _PKEYS_X86_H
+
+#ifdef __i386__
+
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+
+#else
+
+#define REG_IP_IDX REG_RIP
+#define si_pkey_offset 0x20
+
+#endif
+
+#define MCONTEXT_IP(mc) mc.gregs[REG_IP_IDX]
+#define MCONTEXT_TRAPNO(mc) mc.gregs[REG_TRAPNO]
+#define MCONTEXT_FPREGS
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS 0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE 0x2
+#endif
+
+#define NR_PKEYS 16
+#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL<<21)
+#define PAGE_SIZE 4096
+#define MB (1<<20)
+
+#define PKEY_REG_ALLOW_NONE 0x55555555
+
+static inline void __page_o_noops(void)
+{
+ /* 8-bytes of instruction * 512 bytes = 1 page */
+ asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
+}
+
+static inline u64 __read_pkey_reg(void)
+{
+ unsigned int eax, edx;
+ unsigned int ecx = 0;
+ unsigned pkey_reg;
+
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (eax), "=d" (edx)
+ : "c" (ecx));
+ pkey_reg = eax;
+ return pkey_reg;
+}
+
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+ unsigned int eax = pkey_reg;
+ unsigned int ecx = 0;
+ unsigned int edx = 0;
+
+ dprintf4("%s() changing %016llx to %016llx\n", __func__,
+ __read_pkey_reg(), pkey_reg);
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (eax), "c" (ecx), "d" (edx));
+ assert(pkey_reg == __read_pkey_reg());
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
+
+static inline int cpu_has_pkeys(void)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+
+ __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx);
+
+ if (!(ecx & X86_FEATURE_PKU)) {
+ dprintf2("cpu does not have PKU\n");
+ return 0;
+ }
+ if (!(ecx & X86_FEATURE_OSPKE)) {
+ dprintf2("cpu does not have OSPKE\n");
+ return 0;
+ }
+ return 1;
+}
+
+static inline int cpu_max_xsave_size(void)
+{
+ unsigned long XSTATE_CPUID = 0xd;
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+
+ __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx);
+ return ecx;
+}
+
+static inline u32 pkey_bit_position(int pkey)
+{
+ return pkey * PKEY_BITS_PER_PKEY;
+}
+
+#define XSTATE_PKEY_BIT (9)
+#define XSTATE_PKEY 0x200
+#define XSTATE_BV_OFFSET 512
+
+static inline int pkey_reg_xstate_offset(void)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+ int xstate_offset;
+ int xstate_size = 0;
+ unsigned long XSTATE_CPUID = 0xd;
+ int leaf;
+
+ /* assume that XSTATE_PKEY is set in XCR0 */
+ leaf = XSTATE_PKEY_BIT;
+ {
+ __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
+
+ if (leaf == XSTATE_PKEY_BIT) {
+ xstate_offset = ebx;
+ xstate_size = eax;
+ }
+ }
+
+ if (xstate_size == 0) {
+ printf("could not find size/offset of PKEY in xsave state\n");
+ return 0;
+ }
+
+ return xstate_offset;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+ return NR_RESERVED_PKEYS;
+}
+
+static inline void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+ int ptr_contents;
+
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+ return PTR_ERR_ENOTSUP;
+}
+
+#endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c
new file mode 100644
index 000000000000..b5e076a564c9
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c
@@ -0,0 +1,546 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
+ *
+ * The testcases in this file exercise various flows related to signal handling,
+ * using an alternate signal stack, with the default pkey (pkey 0) disabled.
+ *
+ * Compile with:
+ * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm
+ * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <linux/mman.h>
+#include <errno.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <limits.h>
+
+#include "pkey-helpers.h"
+
+#define STACK_SIZE PTHREAD_STACK_MIN
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+static siginfo_t siginfo = {0};
+
+/*
+ * We need to use inline assembly instead of glibc's syscall because glibc's
+ * syscall will attempt to access the PLT in order to call a library function
+ * which is protected by MPK 0 which we don't have access to.
+ */
+static inline __always_inline
+long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6)
+{
+ unsigned long ret;
+#ifdef __x86_64__
+ register long r10 asm("r10") = a4;
+ register long r8 asm("r8") = a5;
+ register long r9 asm("r9") = a6;
+ asm volatile ("syscall"
+ : "=a"(ret)
+ : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9)
+ : "rcx", "r11", "memory");
+#elif defined __i386__
+ asm volatile ("int $0x80"
+ : "=a"(ret)
+ : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5)
+ : "memory");
+#elif defined __aarch64__
+ register long x0 asm("x0") = a1;
+ register long x1 asm("x1") = a2;
+ register long x2 asm("x2") = a3;
+ register long x3 asm("x3") = a4;
+ register long x4 asm("x4") = a5;
+ register long x5 asm("x5") = a6;
+ register long x8 asm("x8") = n;
+ asm volatile ("svc #0"
+ : "=r"(x0)
+ : "r"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(x8)
+ : "memory");
+ ret = x0;
+#else
+# error syscall_raw() not implemented
+#endif
+ return ret;
+}
+
+static inline long clone_raw(unsigned long flags, void *stack,
+ int *parent_tid, int *child_tid)
+{
+ long a1 = flags;
+ long a2 = (long)stack;
+ long a3 = (long)parent_tid;
+#if defined(__x86_64__) || defined(__i386)
+ long a4 = (long)child_tid;
+ long a5 = 0;
+#elif defined(__aarch64__)
+ long a4 = 0;
+ long a5 = (long)child_tid;
+#else
+# error clone_raw() not implemented
+#endif
+
+ return syscall_raw(SYS_clone, a1, a2, a3, a4, a5, 0);
+}
+
+/*
+ * Returns the most restrictive pkey register value that can be used by the
+ * tests.
+ */
+static inline u64 pkey_reg_restrictive_default(void)
+{
+ /*
+ * Disallow everything except execution on pkey 0, so that each caller
+ * doesn't need to enable it explicitly (the selftest code runs with
+ * its code mapped with pkey 0).
+ */
+ return set_pkey_bits(PKEY_REG_ALLOW_NONE, 0, PKEY_DISABLE_ACCESS);
+}
+
+static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext)
+{
+ pthread_mutex_lock(&mutex);
+
+ memcpy(&siginfo, info, sizeof(siginfo_t));
+
+ pthread_cond_signal(&cond);
+ pthread_mutex_unlock(&mutex);
+
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+}
+
+static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext)
+{
+ pthread_mutex_lock(&mutex);
+
+ memcpy(&siginfo, info, sizeof(siginfo_t));
+
+ pthread_cond_signal(&cond);
+ pthread_mutex_unlock(&mutex);
+}
+
+static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext)
+{
+ /*
+ * pkru should be the init_pkru value which enabled MPK 0 so
+ * we can use library functions.
+ */
+ printf("%s invoked.\n", __func__);
+}
+
+static void raise_sigusr2(void)
+{
+ pid_t tid = 0;
+
+ tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0);
+
+ syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0);
+
+ /*
+ * We should return from the signal handler here and be able to
+ * return to the interrupted thread.
+ */
+}
+
+static void *thread_segv_with_pkey0_disabled(void *ptr)
+{
+ /* Disable MPK 0 (and all others too) */
+ __write_pkey_reg(pkey_reg_restrictive_default());
+
+ /* Segfault (with SEGV_MAPERR) */
+ *(volatile int *)NULL = 1;
+ return NULL;
+}
+
+static void *thread_segv_pkuerr_stack(void *ptr)
+{
+ /* Disable MPK 0 (and all others too) */
+ __write_pkey_reg(pkey_reg_restrictive_default());
+
+ /* After we disable MPK 0, we can't access the stack to return */
+ return NULL;
+}
+
+static void *thread_segv_maperr_ptr(void *ptr)
+{
+ stack_t *stack = ptr;
+ u64 pkey_reg;
+
+ /*
+ * Setup alternate signal stack, which should be pkey_mprotect()ed by
+ * MPK 0. The thread's stack cannot be used for signals because it is
+ * not accessible by the default init_pkru value of 0x55555554.
+ */
+ syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
+
+ /* Disable MPK 0. Only MPK 1 is enabled. */
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
+
+ /* Segfault */
+ *(volatile int *)NULL = 1;
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ return NULL;
+}
+
+/*
+ * Verify that the sigsegv handler is invoked when pkey 0 is disabled.
+ * Note that the new thread stack and the alternate signal stack is
+ * protected by MPK 0.
+ */
+static void test_sigsegv_handler_with_pkey0_disabled(void)
+{
+ struct sigaction sa;
+ pthread_attr_t attr;
+ pthread_t thr;
+
+ sa.sa_flags = SA_SIGINFO;
+
+ sa.sa_sigaction = sigsegv_handler;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+ pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL);
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ ksft_test_result(siginfo.si_signo == SIGSEGV &&
+ siginfo.si_code == SEGV_MAPERR &&
+ siginfo.si_addr == NULL,
+ "%s\n", __func__);
+}
+
+/*
+ * Verify that the sigsegv handler is invoked when pkey 0 is disabled.
+ * Note that the new thread stack and the alternate signal stack is
+ * protected by MPK 0, which renders them inaccessible when MPK 0
+ * is disabled. So just the return from the thread should cause a
+ * segfault with SEGV_PKUERR.
+ */
+static void test_sigsegv_handler_cannot_access_stack(void)
+{
+ struct sigaction sa;
+ pthread_attr_t attr;
+ pthread_t thr;
+
+ sa.sa_flags = SA_SIGINFO;
+
+ sa.sa_sigaction = sigsegv_handler;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+ pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL);
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ ksft_test_result(siginfo.si_signo == SIGSEGV &&
+ siginfo.si_code == SEGV_PKUERR,
+ "%s\n", __func__);
+}
+
+/*
+ * Verify that the sigsegv handler that uses an alternate signal stack
+ * is correctly invoked for a thread which uses a non-zero MPK to protect
+ * its own stack, and disables all other MPKs (including 0).
+ */
+static void test_sigsegv_handler_with_different_pkey_for_stack(void)
+{
+ struct sigaction sa;
+ static stack_t sigstack;
+ void *stack;
+ int pkey;
+ int parent_pid = 0;
+ int child_pid = 0;
+ u64 pkey_reg;
+
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+
+ sa.sa_sigaction = sigsegv_handler;
+
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ assert(stack != MAP_FAILED);
+
+ /* Allow access to MPK 0 and MPK 1 */
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
+
+ /* Protect the new stack with MPK 1 */
+ pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED);
+ sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey);
+
+ /* Set up alternate signal stack that will use the default MPK */
+ sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ sigstack.ss_flags = 0;
+ sigstack.ss_size = STACK_SIZE;
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ /* Use clone to avoid newer glibcs using rseq on new threads */
+ long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
+ CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+ CLONE_DETACHED,
+ stack + STACK_SIZE,
+ &parent_pid,
+ &child_pid);
+
+ if (ret < 0) {
+ errno = -ret;
+ perror("clone");
+ } else if (ret == 0) {
+ thread_segv_maperr_ptr(&sigstack);
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ }
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ ksft_test_result(siginfo.si_signo == SIGSEGV &&
+ siginfo.si_code == SEGV_MAPERR &&
+ siginfo.si_addr == NULL,
+ "%s\n", __func__);
+}
+
+/*
+ * Verify that the PKRU value set by the application is correctly
+ * restored upon return from signal handling.
+ */
+static void test_pkru_preserved_after_sigusr1(void)
+{
+ struct sigaction sa;
+ u64 pkey_reg;
+
+ /* Allow access to MPK 0 and an arbitrary set of keys */
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 3, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 7, PKEY_UNRESTRICTED);
+
+ sa.sa_flags = SA_SIGINFO;
+
+ sa.sa_sigaction = sigusr1_handler;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGUSR1, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ memset(&siginfo, 0, sizeof(siginfo));
+
+ __write_pkey_reg(pkey_reg);
+
+ raise(SIGUSR1);
+
+ pthread_mutex_lock(&mutex);
+ while (siginfo.si_signo == 0)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+
+ /* Ensure the pkru value is the same after returning from signal. */
+ ksft_test_result(pkey_reg == __read_pkey_reg() &&
+ siginfo.si_signo == SIGUSR1,
+ "%s\n", __func__);
+}
+
+static noinline void *thread_sigusr2_self(void *ptr)
+{
+ /*
+ * A const char array like "Resuming after SIGUSR2" won't be stored on
+ * the stack and the code could access it via an offset from the program
+ * counter. This makes sure it's on the function's stack frame.
+ */
+ char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ',
+ 'a', 'f', 't', 'e', 'r', ' ',
+ 'S', 'I', 'G', 'U', 'S', 'R', '2',
+ '.', '.', '.', '\n', '\0'};
+ stack_t *stack = ptr;
+ u64 pkey_reg;
+
+ /*
+ * Setup alternate signal stack, which should be pkey_mprotect()ed by
+ * MPK 0. The thread's stack cannot be used for signals because it is
+ * not accessible by the default init_pkru value of 0x55555554.
+ */
+ syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
+
+ /* Disable MPK 0. Only MPK 2 is enabled. */
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
+
+ raise_sigusr2();
+
+ /* Do something, to show the thread resumed execution after the signal */
+ syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0);
+
+ /*
+ * We can't return to test_pkru_sigreturn because it
+ * will attempt to use a %rbp value which is on the stack
+ * of the main thread.
+ */
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ return NULL;
+}
+
+/*
+ * Verify that sigreturn is able to restore altstack even if the thread had
+ * disabled pkey 0.
+ */
+static void test_pkru_sigreturn(void)
+{
+ struct sigaction sa = {0};
+ static stack_t sigstack;
+ void *stack;
+ int pkey;
+ int parent_pid = 0;
+ int child_pid = 0;
+ u64 pkey_reg;
+
+ sa.sa_handler = SIG_DFL;
+ sa.sa_flags = 0;
+ sigemptyset(&sa.sa_mask);
+
+ /*
+ * For this testcase, we do not want to handle SIGSEGV. Reset handler
+ * to default so that the application can crash if it receives SIGSEGV.
+ */
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+ sa.sa_sigaction = sigusr2_handler;
+ sigemptyset(&sa.sa_mask);
+
+ if (sigaction(SIGUSR2, &sa, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ assert(stack != MAP_FAILED);
+
+ /*
+ * Allow access to MPK 0 and MPK 2. The child thread (to be created
+ * later in this flow) will have its stack protected by MPK 2, whereas
+ * the current thread's stack is protected by the default MPK 0. Hence
+ * both need to be enabled.
+ */
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
+
+ /* Protect the stack with MPK 2 */
+ pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED);
+ sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey);
+
+ /* Set up alternate signal stack that will use the default MPK */
+ sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ sigstack.ss_flags = 0;
+ sigstack.ss_size = STACK_SIZE;
+
+ /* Use clone to avoid newer glibcs using rseq on new threads */
+ long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
+ CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+ CLONE_DETACHED,
+ stack + STACK_SIZE,
+ &parent_pid,
+ &child_pid);
+
+ if (ret < 0) {
+ errno = -ret;
+ perror("clone");
+ } else if (ret == 0) {
+ thread_sigusr2_self(&sigstack);
+ syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0);
+ }
+
+ child_pid = ret;
+ /* Check that thread exited */
+ do {
+ sched_yield();
+ ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0);
+ } while (ret != -ESRCH && ret != -EINVAL);
+
+ ksft_test_result_pass("%s\n", __func__);
+}
+
+static void (*pkey_tests[])(void) = {
+ test_sigsegv_handler_with_pkey0_disabled,
+ test_sigsegv_handler_cannot_access_stack,
+ test_sigsegv_handler_with_different_pkey_for_stack,
+ test_pkru_preserved_after_sigusr1,
+ test_pkru_sigreturn
+};
+
+int main(int argc, char *argv[])
+{
+ int i;
+
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(pkey_tests));
+
+ if (!is_pkeys_supported())
+ ksft_exit_skip("pkeys not supported\n");
+
+ for (i = 0; i < ARRAY_SIZE(pkey_tests); i++)
+ (*pkey_tests[i])();
+
+ ksft_finished();
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/pkey_util.c b/tools/testing/selftests/mm/pkey_util.c
new file mode 100644
index 000000000000..255b332f7a08
--- /dev/null
+++ b/tools/testing/selftests/mm/pkey_util.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define __SANE_USERSPACE_TYPES__
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "pkey-helpers.h"
+
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+ int ret = syscall(SYS_pkey_alloc, flags, init_val);
+ dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
+ __func__, flags, init_val, ret, errno);
+ return ret;
+}
+
+int sys_pkey_free(unsigned long pkey)
+{
+ int ret = syscall(SYS_pkey_free, pkey);
+ dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
+ return ret;
+}
+
+int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int sret;
+
+ dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
+ ptr, size, orig_prot, pkey);
+
+ errno = 0;
+ sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey);
+ if (errno) {
+ dprintf2("SYS_mprotect_key sret: %d\n", sret);
+ dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
+ dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
+ if (DEBUG_LEVEL >= 2)
+ perror("SYS_mprotect_pkey");
+ }
+ return sret;
+}
diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
new file mode 100644
index 000000000000..23ebec367015
--- /dev/null
+++ b/tools/testing/selftests/mm/protection_keys.c
@@ -0,0 +1,1794 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
+ *
+ * There are examples in here of:
+ * * how to set protection keys on memory
+ * * how to set/clear bits in pkey registers (the rights register)
+ * * how to handle SEGV_PKUERR signals and extract pkey-relevant
+ * information from the siginfo
+ *
+ * Things to add:
+ * make sure KSM and KSM COW breaking works
+ * prefault pages in at malloc, or not
+ * protect MPX bounds tables with protection keys?
+ * make sure VMA splitting/merging is working correctly
+ * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
+ * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
+ * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
+ *
+ * Compile like this:
+ * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <errno.h>
+#include <linux/elf.h>
+#include <linux/futex.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <setjmp.h>
+
+#include "pkey-helpers.h"
+
+int iteration_nr = 1;
+int test_nr;
+
+u64 shadow_pkey_reg;
+int dprint_in_signal;
+
+noinline int read_ptr(int *ptr)
+{
+ /* Keep GCC from optimizing this away somehow */
+ barrier();
+ return *ptr;
+}
+
+static void cat_into_file(char *str, char *file)
+{
+ int fd = open(file, O_RDWR);
+ int ret;
+
+ dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
+ /*
+ * these need to be raw because they are called under
+ * pkey_assert()
+ */
+ if (fd < 0) {
+ fprintf(stderr, "error opening '%s'\n", str);
+ perror("error: ");
+ exit(__LINE__);
+ }
+
+ ret = write(fd, str, strlen(str));
+ if (ret != strlen(str)) {
+ perror("write to file failed");
+ fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
+ exit(__LINE__);
+ }
+ close(fd);
+}
+
+#if CONTROL_TRACING > 0
+static int warned_tracing;
+static int tracing_root_ok(void)
+{
+ if (geteuid() != 0) {
+ if (!warned_tracing)
+ fprintf(stderr, "WARNING: not run as root, "
+ "can not do tracing control\n");
+ warned_tracing = 1;
+ return 0;
+ }
+ return 1;
+}
+#endif
+
+static void tracing_on(void)
+{
+#if CONTROL_TRACING > 0
+#define TRACEDIR "/sys/kernel/tracing"
+ char pidstr[32];
+
+ if (!tracing_root_ok())
+ return;
+
+ sprintf(pidstr, "%d", getpid());
+ cat_into_file("0", TRACEDIR "/tracing_on");
+ cat_into_file("\n", TRACEDIR "/trace");
+ if (1) {
+ cat_into_file("function_graph", TRACEDIR "/current_tracer");
+ cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
+ } else {
+ cat_into_file("nop", TRACEDIR "/current_tracer");
+ }
+ cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
+ cat_into_file("1", TRACEDIR "/tracing_on");
+ dprintf1("enabled tracing\n");
+#endif
+}
+
+static void tracing_off(void)
+{
+#if CONTROL_TRACING > 0
+ if (!tracing_root_ok())
+ return;
+ cat_into_file("0", "/sys/kernel/tracing/tracing_on");
+#endif
+}
+
+void abort_hooks(void)
+{
+ fprintf(stderr, "running %s()...\n", __func__);
+ tracing_off();
+#ifdef SLEEP_ON_ABORT
+ sleep(SLEEP_ON_ABORT);
+#endif
+}
+
+/*
+ * This attempts to have roughly a page of instructions followed by a few
+ * instructions that do a write, and another page of instructions. That
+ * way, we are pretty sure that the write is in the second page of
+ * instructions and has at least a page of padding behind it.
+ *
+ * *That* lets us be sure to madvise() away the write instruction, which
+ * will then fault, which makes sure that the fault code handles
+ * execute-only memory properly.
+ */
+#if defined(__powerpc64__) || defined(__aarch64__)
+/* This way, both 4K and 64K alignment are maintained */
+__attribute__((__aligned__(65536)))
+#else
+__attribute__((__aligned__(PAGE_SIZE)))
+#endif
+static void lots_o_noops_around_write(int *write_to_me)
+{
+ dprintf3("running %s()\n", __func__);
+ __page_o_noops();
+ /* Assume this happens in the second page of instructions: */
+ *write_to_me = __LINE__;
+ /* pad out by another page: */
+ __page_o_noops();
+ dprintf3("%s() done\n", __func__);
+}
+
+static void dump_mem(void *dumpme, int len_bytes)
+{
+ char *c = (void *)dumpme;
+ int i;
+
+ for (i = 0; i < len_bytes; i += sizeof(u64)) {
+ u64 *ptr = (u64 *)(c + i);
+ dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
+ }
+}
+
+static u32 hw_pkey_get(int pkey, unsigned long flags)
+{
+ u64 pkey_reg = __read_pkey_reg();
+
+ dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+ __func__, pkey, flags, 0, 0);
+ dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
+
+ return (u32) get_pkey_bits(pkey_reg, pkey);
+}
+
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+ u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+ u64 old_pkey_reg = __read_pkey_reg();
+ u64 new_pkey_reg;
+
+ /* make sure that 'rights' only contains the bits we expect: */
+ assert(!(rights & ~mask));
+
+ /* modify bits accordingly in old pkey_reg and assign it */
+ new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
+
+ __write_pkey_reg(new_pkey_reg);
+
+ dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
+ " pkey_reg now: %016llx old_pkey_reg: %016llx\n",
+ __func__, pkey, rights, flags, 0, __read_pkey_reg(),
+ old_pkey_reg);
+ return 0;
+}
+
+static void pkey_disable_set(int pkey, int flags)
+{
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights;
+
+ dprintf1("START->%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights |= flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
+ assert(!ret);
+ /* pkey_reg and flags have the same format */
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+ dprintf1("%s(%d) shadow: 0x%016llx\n",
+ __func__, pkey, shadow_pkey_reg);
+
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
+ __func__, pkey, read_pkey_reg());
+ dprintf1("END<---%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+}
+
+static void pkey_disable_clear(int pkey, int flags)
+{
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights = hw_pkey_get(pkey, syscall_flags);
+
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights &= ~flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, 0);
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
+ pkey, read_pkey_reg());
+}
+
+__maybe_unused static void pkey_write_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
+}
+__maybe_unused static void pkey_write_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
+}
+__maybe_unused static void pkey_access_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
+}
+__maybe_unused static void pkey_access_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
+}
+
+static char *si_code_str(int si_code)
+{
+ if (si_code == SEGV_MAPERR)
+ return "SEGV_MAPERR";
+ if (si_code == SEGV_ACCERR)
+ return "SEGV_ACCERR";
+ if (si_code == SEGV_BNDERR)
+ return "SEGV_BNDERR";
+ if (si_code == SEGV_PKUERR)
+ return "SEGV_PKUERR";
+ return "UNKNOWN";
+}
+
+static int pkey_faults;
+static int last_si_pkey = -1;
+static void signal_handler(int signum, siginfo_t *si, void *vucontext)
+{
+ ucontext_t *uctxt = vucontext;
+ int trapno;
+ unsigned long ip;
+#ifdef MCONTEXT_FPREGS
+ char *fpregs;
+#endif
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ u32 *pkey_reg_ptr;
+ int pkey_reg_offset;
+#endif /* arch */
+ u64 siginfo_pkey;
+ u32 *si_pkey_ptr;
+
+ dprint_in_signal = 1;
+ dprintf1(">>>>===============SIGSEGV============================\n");
+ dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+ __func__, __LINE__,
+ __read_pkey_reg(), shadow_pkey_reg);
+
+ trapno = MCONTEXT_TRAPNO(uctxt->uc_mcontext);
+ ip = MCONTEXT_IP(uctxt->uc_mcontext);
+#ifdef MCONTEXT_FPREGS
+ fpregs = (char *) uctxt->uc_mcontext.fpregs;
+#endif
+
+ dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
+ __func__, trapno, ip, si_code_str(si->si_code),
+ si->si_code);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#ifdef __i386__
+ /*
+ * 32-bit has some extra padding so that userspace can tell whether
+ * the XSTATE header is present in addition to the "legacy" FPU
+ * state. We just assume that it is here.
+ */
+ fpregs += 0x70;
+#endif /* i386 */
+ pkey_reg_offset = pkey_reg_xstate_offset();
+ pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
+
+ /*
+ * If we got a PKEY fault, we *HAVE* to have at least one bit set in
+ * here.
+ */
+ dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
+ if (DEBUG_LEVEL > 4)
+ dump_mem(pkey_reg_ptr - 128, 256);
+ pkey_assert(*pkey_reg_ptr);
+#endif /* arch */
+
+ dprintf1("siginfo: %p\n", si);
+#ifdef MCONTEXT_FPREGS
+ dprintf1(" fpregs: %p\n", fpregs);
+#endif
+
+ if ((si->si_code == SEGV_MAPERR) ||
+ (si->si_code == SEGV_ACCERR) ||
+ (si->si_code == SEGV_BNDERR)) {
+ printf("non-PK si_code, exiting...\n");
+ exit(4);
+ }
+
+ si_pkey_ptr = siginfo_get_pkey_ptr(si);
+ dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
+ dump_mem((u8 *)si_pkey_ptr - 8, 24);
+ siginfo_pkey = *si_pkey_ptr;
+ pkey_assert(siginfo_pkey < NR_PKEYS);
+ last_si_pkey = siginfo_pkey;
+
+ /*
+ * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+ * checking
+ */
+ dprintf1("signal pkey_reg from pkey_reg: %016llx\n",
+ __read_pkey_reg());
+ dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
+ *(u64 *)pkey_reg_ptr = 0x00000000;
+ dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
+#elif defined(__powerpc64__) /* arch */
+ /* restore access and let the faulting instruction continue */
+ pkey_access_allow(siginfo_pkey);
+#elif defined(__aarch64__)
+ aarch64_write_signal_pkey(uctxt, PKEY_REG_ALLOW_ALL);
+#endif /* arch */
+ pkey_faults++;
+ dprintf1("<<<<==================================================\n");
+ dprint_in_signal = 0;
+}
+
+static void sig_chld(int x)
+{
+ dprint_in_signal = 1;
+ dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
+ dprint_in_signal = 0;
+}
+
+static void setup_sigsegv_handler(void)
+{
+ int r, rs;
+ struct sigaction newact;
+ struct sigaction oldact;
+
+ /* #PF is mapped to sigsegv */
+ int signum = SIGSEGV;
+
+ newact.sa_handler = 0;
+ newact.sa_sigaction = signal_handler;
+
+ /*sigset_t - signals to block while in the handler */
+ /* get the old signal mask. */
+ rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
+ pkey_assert(rs == 0);
+
+ /* call sa_sigaction, not sa_handler*/
+ newact.sa_flags = SA_SIGINFO;
+
+ newact.sa_restorer = 0; /* void(*)(), obsolete */
+ r = sigaction(signum, &newact, &oldact);
+ r = sigaction(SIGALRM, &newact, &oldact);
+ pkey_assert(r == 0);
+}
+
+static void setup_handlers(void)
+{
+ signal(SIGCHLD, &sig_chld);
+ setup_sigsegv_handler();
+}
+
+static pid_t fork_lazy_child(void)
+{
+ pid_t forkret;
+
+ forkret = fork();
+ pkey_assert(forkret >= 0);
+ dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+ if (!forkret) {
+ /* in the child */
+ while (1) {
+ dprintf1("child sleeping...\n");
+ sleep(30);
+ }
+ }
+ return forkret;
+}
+
+static int alloc_pkey(void)
+{
+ int ret;
+ unsigned long init_val = PKEY_UNRESTRICTED;
+
+ dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+ __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
+ ret = sys_pkey_alloc(0, init_val);
+ /*
+ * pkey_alloc() sets PKEY register, so we need to reflect it in
+ * shadow_pkey_reg:
+ */
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ if (ret > 0) {
+ /* clear both the bits: */
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+ ~PKEY_MASK);
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__,
+ __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ /*
+ * move the new state in from init_val
+ * (remember, we cheated and init_val == pkey_reg format)
+ */
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+ init_val);
+ }
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
+ /* for shadow checking: */
+ read_pkey_reg();
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ return ret;
+}
+
+/*
+ * I had a bug where pkey bits could be set by mprotect() but
+ * not cleared. This ensures we get lots of random bit sets
+ * and clears on the vma and pte pkey bits.
+ */
+static int alloc_random_pkey(void)
+{
+ int max_nr_pkey_allocs;
+ int ret;
+ int i;
+ int alloced_pkeys[NR_PKEYS];
+ int nr_alloced = 0;
+ int random_index;
+ memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+
+ /* allocate every possible key and make a note of which ones we got */
+ max_nr_pkey_allocs = NR_PKEYS;
+ for (i = 0; i < max_nr_pkey_allocs; i++) {
+ int new_pkey = alloc_pkey();
+ if (new_pkey < 0)
+ break;
+ alloced_pkeys[nr_alloced++] = new_pkey;
+ }
+
+ pkey_assert(nr_alloced > 0);
+ /* select a random one out of the allocated ones */
+ random_index = rand() % nr_alloced;
+ ret = alloced_pkeys[random_index];
+ /* now zero it out so we don't free it next */
+ alloced_pkeys[random_index] = 0;
+
+ /* go through the allocated ones that we did not want and free them */
+ for (i = 0; i < nr_alloced; i++) {
+ int free_ret;
+ if (!alloced_pkeys[i])
+ continue;
+ free_ret = sys_pkey_free(alloced_pkeys[i]);
+ pkey_assert(!free_ret);
+ }
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n", __func__,
+ __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+ return ret;
+}
+
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int nr_iterations = random() % 100;
+ int ret;
+
+ while (0) {
+ int rpkey = alloc_random_pkey();
+ ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+ dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+ ptr, size, orig_prot, pkey, ret);
+ if (nr_iterations-- < 0)
+ break;
+
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ sys_pkey_free(rpkey);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ }
+ pkey_assert(pkey < NR_PKEYS);
+
+ ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+ dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+ ptr, size, orig_prot, pkey, ret);
+ pkey_assert(!ret);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n", __func__,
+ __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+ return ret;
+}
+
+struct pkey_malloc_record {
+ void *ptr;
+ long size;
+ int prot;
+};
+struct pkey_malloc_record *pkey_malloc_records;
+struct pkey_malloc_record *pkey_last_malloc_record;
+static long nr_pkey_malloc_records;
+void record_pkey_malloc(void *ptr, long size, int prot)
+{
+ long i;
+ struct pkey_malloc_record *rec = NULL;
+
+ for (i = 0; i < nr_pkey_malloc_records; i++) {
+ rec = &pkey_malloc_records[i];
+ /* find a free record */
+ if (rec)
+ break;
+ }
+ if (!rec) {
+ /* every record is full */
+ size_t old_nr_records = nr_pkey_malloc_records;
+ size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
+ size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
+ dprintf2("new_nr_records: %zd\n", new_nr_records);
+ dprintf2("new_size: %zd\n", new_size);
+ pkey_malloc_records = realloc(pkey_malloc_records, new_size);
+ pkey_assert(pkey_malloc_records != NULL);
+ rec = &pkey_malloc_records[nr_pkey_malloc_records];
+ /*
+ * realloc() does not initialize memory, so zero it from
+ * the first new record all the way to the end.
+ */
+ for (i = 0; i < new_nr_records - old_nr_records; i++)
+ memset(rec + i, 0, sizeof(*rec));
+ }
+ dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
+ (int)(rec - pkey_malloc_records), rec, ptr, size);
+ rec->ptr = ptr;
+ rec->size = size;
+ rec->prot = prot;
+ pkey_last_malloc_record = rec;
+ nr_pkey_malloc_records++;
+}
+
+static void free_pkey_malloc(void *ptr)
+{
+ long i;
+ int ret;
+ dprintf3("%s(%p)\n", __func__, ptr);
+ for (i = 0; i < nr_pkey_malloc_records; i++) {
+ struct pkey_malloc_record *rec = &pkey_malloc_records[i];
+ dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
+ ptr, i, rec, rec->ptr, rec->size);
+ if ((ptr < rec->ptr) ||
+ (ptr >= rec->ptr + rec->size))
+ continue;
+
+ dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
+ ptr, i, rec, rec->ptr, rec->size);
+ nr_pkey_malloc_records--;
+ ret = munmap(rec->ptr, rec->size);
+ dprintf3("munmap ret: %d\n", ret);
+ pkey_assert(!ret);
+ dprintf3("clearing rec->ptr, rec: %p\n", rec);
+ rec->ptr = NULL;
+ dprintf3("done clearing rec->ptr, rec: %p\n", rec);
+ return;
+ }
+ pkey_assert(false);
+}
+
+static void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int ret;
+
+ read_pkey_reg();
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+ pkey_assert(!ret);
+ record_pkey_malloc(ptr, size, prot);
+ read_pkey_reg();
+
+ dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+ return ptr;
+}
+
+static void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
+{
+ int ret;
+ void *ptr;
+
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ /*
+ * Guarantee we can fit at least one huge page in the resulting
+ * allocation by allocating space for 2:
+ */
+ size = ALIGN_UP(size, HPAGE_SIZE * 2);
+ ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ record_pkey_malloc(ptr, size, prot);
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ dprintf1("unaligned ptr: %p\n", ptr);
+ ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
+ dprintf1(" aligned ptr: %p\n", ptr);
+ ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
+ dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
+ ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
+ dprintf1("MADV_WILLNEED ret: %d\n", ret);
+ memset(ptr, 0, HPAGE_SIZE);
+
+ dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
+ return ptr;
+}
+
+static int hugetlb_setup_ok;
+#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
+#define GET_NR_HUGE_PAGES 10
+static void setup_hugetlbfs(void)
+{
+ int err;
+ int fd;
+ char buf[256];
+ long hpagesz_kb;
+ long hpagesz_mb;
+
+ if (geteuid() != 0) {
+ fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
+ return;
+ }
+
+ cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
+
+ /*
+ * Now go make sure that we got the pages and that they
+ * are PMD-level pages. Someone might have made PUD-level
+ * pages the default.
+ */
+ hpagesz_kb = HPAGE_SIZE / 1024;
+ hpagesz_mb = hpagesz_kb / 1024;
+ sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
+ fd = open(buf, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
+ hpagesz_mb, strerror(errno));
+ return;
+ }
+
+ /* -1 to guarantee leaving the trailing \0 */
+ err = read(fd, buf, sizeof(buf)-1);
+ close(fd);
+ if (err <= 0) {
+ fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
+ hpagesz_mb, strerror(errno));
+ return;
+ }
+
+ if (atoi(buf) != GET_NR_HUGE_PAGES) {
+ fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
+ hpagesz_mb, buf, GET_NR_HUGE_PAGES);
+ return;
+ }
+
+ hugetlb_setup_ok = 1;
+}
+
+static void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
+
+ if (!hugetlb_setup_ok)
+ return PTR_ERR_ENOTSUP;
+
+ dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
+ size = ALIGN_UP(size, HPAGE_SIZE * 2);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ record_pkey_malloc(ptr, size, prot);
+
+ dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
+ return ptr;
+}
+
+static void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
+
+ malloc_pkey_with_mprotect,
+ malloc_pkey_with_mprotect_subpage,
+ malloc_pkey_anon_huge,
+ malloc_pkey_hugetlb
+};
+
+static void *malloc_pkey(long size, int prot, u16 pkey)
+{
+ void *ret;
+ static int malloc_type;
+ int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
+
+ pkey_assert(pkey < NR_PKEYS);
+
+ while (1) {
+ pkey_assert(malloc_type < nr_malloc_types);
+
+ ret = pkey_malloc[malloc_type](size, prot, pkey);
+ pkey_assert(ret != (void *)-1);
+
+ malloc_type++;
+ if (malloc_type >= nr_malloc_types)
+ malloc_type = (random()%nr_malloc_types);
+
+ /* try again if the malloc_type we tried is unsupported */
+ if (ret == PTR_ERR_ENOTSUP)
+ continue;
+
+ break;
+ }
+
+ dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
+ size, prot, pkey, ret);
+ return ret;
+}
+
+static int last_pkey_faults;
+#define UNKNOWN_PKEY -2
+void expected_pkey_fault(int pkey)
+{
+ dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
+ __func__, last_pkey_faults, pkey_faults);
+ dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
+ pkey_assert(last_pkey_faults + 1 == pkey_faults);
+
+ /*
+ * For exec-only memory, we do not know the pkey in
+ * advance, so skip this check.
+ */
+ if (pkey != UNKNOWN_PKEY)
+ pkey_assert(last_si_pkey == pkey);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ /*
+ * The signal handler shold have cleared out PKEY register to let the
+ * test program continue. We now have to restore it.
+ */
+ if (__read_pkey_reg() != 0)
+#elif defined(__aarch64__)
+ if (__read_pkey_reg() != PKEY_REG_ALLOW_ALL)
+#else
+ if (__read_pkey_reg() != shadow_pkey_reg)
+#endif /* arch */
+ pkey_assert(0);
+
+ __write_pkey_reg(shadow_pkey_reg);
+ dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
+ "nuked it\n", __func__, shadow_pkey_reg);
+ last_pkey_faults = pkey_faults;
+ last_si_pkey = -1;
+}
+
+#define do_not_expect_pkey_fault(msg) do { \
+ if (last_pkey_faults != pkey_faults) \
+ dprintf0("unexpected PKey fault: %s\n", msg); \
+ pkey_assert(last_pkey_faults == pkey_faults); \
+} while (0)
+
+static int test_fds[10] = { -1 };
+static int nr_test_fds;
+static void __save_test_fd(int fd)
+{
+ pkey_assert(fd >= 0);
+ pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
+ test_fds[nr_test_fds] = fd;
+ nr_test_fds++;
+}
+
+static int get_test_read_fd(void)
+{
+ int test_fd = open("/etc/passwd", O_RDONLY);
+ __save_test_fd(test_fd);
+ return test_fd;
+}
+
+static void close_test_fds(void)
+{
+ int i;
+
+ for (i = 0; i < nr_test_fds; i++) {
+ if (test_fds[i] < 0)
+ continue;
+ close(test_fds[i]);
+ test_fds[i] = -1;
+ }
+ nr_test_fds = 0;
+}
+
+static void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
+{
+ int i, err;
+ int max_nr_pkey_allocs;
+ int alloced_pkeys[NR_PKEYS];
+ int nr_alloced = 0;
+ long size;
+
+ pkey_assert(pkey_last_malloc_record);
+ size = pkey_last_malloc_record->size;
+ /*
+ * This is a bit of a hack. But mprotect() requires
+ * huge-page-aligned sizes when operating on hugetlbfs.
+ * So, make sure that we use something that's a multiple
+ * of a huge page when we can.
+ */
+ if (size >= HPAGE_SIZE)
+ size = HPAGE_SIZE;
+
+ /* allocate every possible key and make sure key-0 never got allocated */
+ max_nr_pkey_allocs = NR_PKEYS;
+ for (i = 0; i < max_nr_pkey_allocs; i++) {
+ int new_pkey = alloc_pkey();
+ pkey_assert(new_pkey != 0);
+
+ if (new_pkey < 0)
+ break;
+ alloced_pkeys[nr_alloced++] = new_pkey;
+ }
+ /* free all the allocated keys */
+ for (i = 0; i < nr_alloced; i++) {
+ int free_ret;
+
+ if (!alloced_pkeys[i])
+ continue;
+ free_ret = sys_pkey_free(alloced_pkeys[i]);
+ pkey_assert(!free_ret);
+ }
+
+ /* attach key-0 in various modes */
+ err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
+ pkey_assert(!err);
+}
+
+static void test_read_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling write access to PKEY[1], doing read\n");
+ pkey_write_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ dprintf1("\n");
+}
+static void test_read_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
+ read_pkey_reg();
+ pkey_access_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+static void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+ pkey, ptr);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("reading ptr before disabling the read : %d\n",
+ ptr_contents);
+ read_pkey_reg();
+ pkey_access_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+static void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ *ptr = __LINE__;
+ dprintf1("disabling write access; after accessing the page, "
+ "to PKEY[%02d], doing write\n", pkey);
+ pkey_write_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
+static void test_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
+ pkey_write_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+static void test_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
+ pkey_access_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
+static void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ *ptr = __LINE__;
+ dprintf1("disabling access; after accessing the page, "
+ " to PKEY[%02d], doing write\n", pkey);
+ pkey_access_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
+static void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int ret;
+ int test_fd = get_test_read_fd();
+
+ dprintf1("disabling access to PKEY[%02d], "
+ "having kernel read() to buffer\n", pkey);
+ pkey_access_deny(pkey);
+ ret = read(test_fd, ptr, 1);
+ dprintf1("read ret: %d\n", ret);
+ pkey_assert(ret);
+}
+
+static void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ret;
+ int test_fd = get_test_read_fd();
+
+ pkey_write_deny(pkey);
+ ret = read(test_fd, ptr, 100);
+ dprintf1("read ret: %d\n", ret);
+ if (ret < 0 && (DEBUG_LEVEL > 0))
+ perror("verbose read result (OK for this to be bad)");
+ pkey_assert(ret);
+}
+
+static void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int pipe_ret, vmsplice_ret;
+ struct iovec iov;
+ int pipe_fds[2];
+
+ pipe_ret = pipe(pipe_fds);
+
+ pkey_assert(pipe_ret == 0);
+ dprintf1("disabling access to PKEY[%02d], "
+ "having kernel vmsplice from buffer\n", pkey);
+ pkey_access_deny(pkey);
+ iov.iov_base = ptr;
+ iov.iov_len = PAGE_SIZE;
+ vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
+ dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
+ pkey_assert(vmsplice_ret == -1);
+
+ close(pipe_fds[0]);
+ close(pipe_fds[1]);
+}
+
+static void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ignored = 0xdada;
+ int futex_ret;
+ int some_int = __LINE__;
+
+ dprintf1("disabling write to PKEY[%02d], "
+ "doing futex gunk in buffer\n", pkey);
+ *ptr = some_int;
+ pkey_write_deny(pkey);
+ futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
+ &ignored, ignored);
+ if (DEBUG_LEVEL > 0)
+ perror("futex");
+ dprintf1("futex() ret: %d\n", futex_ret);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+static void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
+{
+ int err;
+ int i;
+
+ /* Note: 0 is the default pkey, so don't mess with it */
+ for (i = 1; i < NR_PKEYS; i++) {
+ if (pkey == i)
+ continue;
+
+ dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
+ err = sys_pkey_free(i);
+ pkey_assert(err);
+
+ err = sys_pkey_free(i);
+ pkey_assert(err);
+
+ err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
+ pkey_assert(err);
+ }
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+static void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
+{
+ int err;
+ int bad_pkey = NR_PKEYS+99;
+
+ /* pass a known-invalid pkey in: */
+ err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
+ pkey_assert(err);
+}
+
+static void become_child(void)
+{
+ pid_t forkret;
+
+ forkret = fork();
+ pkey_assert(forkret >= 0);
+ dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+ if (!forkret) {
+ /* in the child */
+ return;
+ }
+ exit(0);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+static void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
+{
+ int err;
+ int allocated_pkeys[NR_PKEYS] = {0};
+ int nr_allocated_pkeys = 0;
+ int i;
+
+ for (i = 0; i < NR_PKEYS*3; i++) {
+ int new_pkey;
+ dprintf1("%s() alloc loop: %d\n", __func__, i);
+ new_pkey = alloc_pkey();
+ dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, err, __read_pkey_reg(),
+ shadow_pkey_reg);
+ read_pkey_reg(); /* for shadow checking */
+ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
+ if ((new_pkey == -1) && (errno == ENOSPC)) {
+ dprintf2("%s() failed to allocate pkey after %d tries\n",
+ __func__, nr_allocated_pkeys);
+ } else {
+ /*
+ * Ensure the number of successes never
+ * exceeds the number of keys supported
+ * in the hardware.
+ */
+ pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+ allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+ }
+
+ /*
+ * Make sure that allocation state is properly
+ * preserved across fork().
+ */
+ if (i == NR_PKEYS*2)
+ become_child();
+ }
+
+ dprintf3("%s()::%d\n", __func__, __LINE__);
+
+ /*
+ * On x86:
+ * There are 16 pkeys supported in hardware. Three are
+ * allocated by the time we get here:
+ * 1. The default key (0)
+ * 2. One possibly consumed by an execute-only mapping.
+ * 3. One allocated by the test code and passed in via
+ * 'pkey' to this function.
+ * Ensure that we can allocate at least another 13 (16-3).
+ *
+ * On powerpc:
+ * There are either 5, 28, 29 or 32 pkeys supported in
+ * hardware depending on the page size (4K or 64K) and
+ * platform (powernv or powervm). Four are allocated by
+ * the time we get here. These include pkey-0, pkey-1,
+ * exec-only pkey and the one allocated by the test code.
+ * Ensure that we can allocate the remaining.
+ */
+ pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
+
+ for (i = 0; i < nr_allocated_pkeys; i++) {
+ err = sys_pkey_free(allocated_pkeys[i]);
+ pkey_assert(!err);
+ read_pkey_reg(); /* for shadow checking */
+ }
+}
+
+static void arch_force_pkey_reg_init(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ u64 *buf;
+
+ /*
+ * All keys should be allocated and set to allow reads and
+ * writes, so the register should be all 0. If not, just
+ * skip the test.
+ */
+ if (read_pkey_reg())
+ return;
+
+ /*
+ * Just allocate an absurd about of memory rather than
+ * doing the XSAVE size enumeration dance.
+ */
+ buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+
+ /* These __builtins require compiling with -mxsave */
+
+ /* XSAVE to build a valid buffer: */
+ __builtin_ia32_xsave(buf, XSTATE_PKEY);
+ /* Clear XSTATE_BV[PKRU]: */
+ buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
+ /* XRSTOR will likely get PKRU back to the init state: */
+ __builtin_ia32_xrstor(buf, XSTATE_PKEY);
+
+ munmap(buf, 1*MB);
+#endif
+}
+
+
+/*
+ * This is mostly useless on ppc for now. But it will not
+ * hurt anything and should give some better coverage as
+ * a long-running test that continually checks the pkey
+ * register.
+ */
+static void test_pkey_init_state(int *ptr, u16 pkey)
+{
+ int err;
+ int allocated_pkeys[NR_PKEYS] = {0};
+ int nr_allocated_pkeys = 0;
+ int i;
+
+ for (i = 0; i < NR_PKEYS; i++) {
+ int new_pkey = alloc_pkey();
+
+ if (new_pkey < 0)
+ continue;
+ allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+ }
+
+ dprintf3("%s()::%d\n", __func__, __LINE__);
+
+ arch_force_pkey_reg_init();
+
+ /*
+ * Loop for a bit, hoping to get exercise the kernel
+ * context switch code.
+ */
+ for (i = 0; i < 1000000; i++)
+ read_pkey_reg();
+
+ for (i = 0; i < nr_allocated_pkeys; i++) {
+ err = sys_pkey_free(allocated_pkeys[i]);
+ pkey_assert(!err);
+ read_pkey_reg(); /* for shadow checking */
+ }
+}
+
+/*
+ * pkey 0 is special. It is allocated by default, so you do not
+ * have to call pkey_alloc() to use it first. Make sure that it
+ * is usable.
+ */
+static void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
+{
+ long size;
+ int prot;
+
+ assert(pkey_last_malloc_record);
+ size = pkey_last_malloc_record->size;
+ /*
+ * This is a bit of a hack. But mprotect() requires
+ * huge-page-aligned sizes when operating on hugetlbfs.
+ * So, make sure that we use something that's a multiple
+ * of a huge page when we can.
+ */
+ if (size >= HPAGE_SIZE)
+ size = HPAGE_SIZE;
+ prot = pkey_last_malloc_record->prot;
+
+ /* Use pkey 0 */
+ mprotect_pkey(ptr, size, prot, 0);
+
+ /* Make sure that we can set it back to the original pkey. */
+ mprotect_pkey(ptr, size, prot, pkey);
+}
+
+static void test_ptrace_of_child(int *ptr, u16 pkey)
+{
+ __attribute__((__unused__)) int peek_result;
+ pid_t child_pid;
+ void *ignored = 0;
+ long ret;
+ int status;
+ /*
+ * This is the "control" for our little expermient. Make sure
+ * we can always access it when ptracing.
+ */
+ int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
+ int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
+
+ /*
+ * Fork a child which is an exact copy of this process, of course.
+ * That means we can do all of our tests via ptrace() and then plain
+ * memory access and ensure they work differently.
+ */
+ child_pid = fork_lazy_child();
+ dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
+
+ ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
+ if (ret)
+ perror("attach");
+ dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
+ pkey_assert(ret != -1);
+ ret = waitpid(child_pid, &status, WUNTRACED);
+ if ((ret != child_pid) || !(WIFSTOPPED(status))) {
+ fprintf(stderr, "weird waitpid result %ld stat %x\n",
+ ret, status);
+ pkey_assert(0);
+ }
+ dprintf2("waitpid ret: %ld\n", ret);
+ dprintf2("waitpid status: %d\n", status);
+
+ pkey_access_deny(pkey);
+ pkey_write_deny(pkey);
+
+ /* Write access, untested for now:
+ ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
+ pkey_assert(ret != -1);
+ dprintf1("poke at %p: %ld\n", peek_at, ret);
+ */
+
+ /*
+ * Try to access the pkey-protected "ptr" via ptrace:
+ */
+ ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
+ /* expect it to work, without an error: */
+ pkey_assert(ret != -1);
+ /* Now access from the current task, and expect an exception: */
+ peek_result = read_ptr(ptr);
+ expected_pkey_fault(pkey);
+
+ /*
+ * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
+ */
+ ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
+ /* expect it to work, without an error: */
+ pkey_assert(ret != -1);
+ /* Now access from the current task, and expect NO exception: */
+ peek_result = read_ptr(plain_ptr);
+ do_not_expect_pkey_fault("read plain pointer after ptrace");
+
+ ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
+ pkey_assert(ret != -1);
+
+ ret = kill(child_pid, SIGKILL);
+ pkey_assert(ret != -1);
+
+ wait(&status);
+
+ free(plain_ptr_unaligned);
+}
+
+static void *get_pointer_to_instructions(void)
+{
+ void *p1;
+
+ p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
+ dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
+ /* lots_o_noops_around_write should be page-aligned already */
+ assert(p1 == &lots_o_noops_around_write);
+
+ /* Point 'p1' at the *second* page of the function: */
+ p1 += PAGE_SIZE;
+
+ /*
+ * Try to ensure we fault this in on next touch to ensure
+ * we get an instruction fault as opposed to a data one
+ */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+
+ return p1;
+}
+
+static void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+{
+ void *p1;
+ int scratch;
+ int ptr_contents;
+ int ret;
+
+ p1 = get_pointer_to_instructions();
+ lots_o_noops_around_write(&scratch);
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+ ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
+ pkey_assert(!ret);
+ pkey_access_deny(pkey);
+
+ dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+ /*
+ * Make sure this is an *instruction* fault
+ */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+ lots_o_noops_around_write(&scratch);
+ do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+ expect_fault_on_read_execonly_key(p1, pkey);
+
+ // Reset back to PROT_EXEC | PROT_READ for architectures that support
+ // non-PKEY execute-only permissions.
+ ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC | PROT_READ, (u64)pkey);
+ pkey_assert(!ret);
+}
+
+static void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
+{
+ void *p1;
+ int scratch;
+ int ptr_contents;
+ int ret;
+
+ dprintf1("%s() start\n", __func__);
+
+ p1 = get_pointer_to_instructions();
+ lots_o_noops_around_write(&scratch);
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+ /* Use a *normal* mprotect(), not mprotect_pkey(): */
+ ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
+ pkey_assert(!ret);
+
+ /*
+ * Reset the shadow, assuming that the above mprotect()
+ * correctly changed PKRU, but to an unknown value since
+ * the actual allocated pkey is unknown.
+ */
+ shadow_pkey_reg = __read_pkey_reg();
+
+ dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+ /* Make sure this is an *instruction* fault */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+ lots_o_noops_around_write(&scratch);
+ do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+ expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
+
+ /*
+ * Put the memory back to non-PROT_EXEC. Should clear the
+ * exec-only pkey off the VMA and allow it to be readable
+ * again. Go to PROT_NONE first to check for a kernel bug
+ * that did not clear the pkey when doing PROT_NONE.
+ */
+ ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
+ pkey_assert(!ret);
+
+ ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
+ pkey_assert(!ret);
+ ptr_contents = read_ptr(p1);
+ do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+static void test_ptrace_modifies_pkru(int *ptr, u16 pkey)
+{
+ u32 new_pkru;
+ pid_t child;
+ int status, ret;
+ int pkey_offset = pkey_reg_xstate_offset();
+ size_t xsave_size = cpu_max_xsave_size();
+ void *xsave;
+ u32 *pkey_register;
+ u64 *xstate_bv;
+ struct iovec iov;
+
+ new_pkru = ~read_pkey_reg();
+ /* Don't make PROT_EXEC mappings inaccessible */
+ new_pkru &= ~3;
+
+ child = fork();
+ pkey_assert(child >= 0);
+ dprintf3("[%d] fork() ret: %d\n", getpid(), child);
+ if (!child) {
+ ptrace(PTRACE_TRACEME, 0, 0, 0);
+ /* Stop and allow the tracer to modify PKRU directly */
+ raise(SIGSTOP);
+
+ /*
+ * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+ * checking
+ */
+ if (__read_pkey_reg() != new_pkru)
+ exit(1);
+
+ /* Stop and allow the tracer to clear XSTATE_BV for PKRU */
+ raise(SIGSTOP);
+
+ if (__read_pkey_reg() != 0)
+ exit(1);
+
+ /* Stop and allow the tracer to examine PKRU */
+ raise(SIGSTOP);
+
+ exit(0);
+ }
+
+ pkey_assert(child == waitpid(child, &status, 0));
+ dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+ pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+ xsave = (void *)malloc(xsave_size);
+ pkey_assert(xsave > 0);
+
+ /* Modify the PKRU register directly */
+ iov.iov_base = xsave;
+ iov.iov_len = xsave_size;
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+ pkey_assert(ret == 0);
+
+ pkey_register = (u32 *)(xsave + pkey_offset);
+ pkey_assert(*pkey_register == read_pkey_reg());
+
+ *pkey_register = new_pkru;
+
+ ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+ pkey_assert(ret == 0);
+
+ /* Test that the modification is visible in ptrace before any execution */
+ memset(xsave, 0xCC, xsave_size);
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+ pkey_assert(ret == 0);
+ pkey_assert(*pkey_register == new_pkru);
+
+ /* Execute the tracee */
+ ret = ptrace(PTRACE_CONT, child, 0, 0);
+ pkey_assert(ret == 0);
+
+ /* Test that the tracee saw the PKRU value change */
+ pkey_assert(child == waitpid(child, &status, 0));
+ dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+ pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+ /* Test that the modification is visible in ptrace after execution */
+ memset(xsave, 0xCC, xsave_size);
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+ pkey_assert(ret == 0);
+ pkey_assert(*pkey_register == new_pkru);
+
+ /* Clear the PKRU bit from XSTATE_BV */
+ xstate_bv = (u64 *)(xsave + 512);
+ *xstate_bv &= ~(1 << 9);
+
+ ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+ pkey_assert(ret == 0);
+
+ /* Test that the modification is visible in ptrace before any execution */
+ memset(xsave, 0xCC, xsave_size);
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+ pkey_assert(ret == 0);
+ pkey_assert(*pkey_register == 0);
+
+ ret = ptrace(PTRACE_CONT, child, 0, 0);
+ pkey_assert(ret == 0);
+
+ /* Test that the tracee saw the PKRU value go to 0 */
+ pkey_assert(child == waitpid(child, &status, 0));
+ dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+ pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+ /* Test that the modification is visible in ptrace after execution */
+ memset(xsave, 0xCC, xsave_size);
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+ pkey_assert(ret == 0);
+ pkey_assert(*pkey_register == 0);
+
+ ret = ptrace(PTRACE_CONT, child, 0, 0);
+ pkey_assert(ret == 0);
+ pkey_assert(child == waitpid(child, &status, 0));
+ dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+ pkey_assert(WIFEXITED(status));
+ pkey_assert(WEXITSTATUS(status) == 0);
+ free(xsave);
+}
+#endif
+
+#if defined(__aarch64__)
+static void test_ptrace_modifies_pkru(int *ptr, u16 pkey)
+{
+ pid_t child;
+ int status, ret;
+ struct iovec iov;
+ u64 trace_pkey;
+ /* Just a random pkey value.. */
+ u64 new_pkey = (POE_X << PKEY_BITS_PER_PKEY * 2) |
+ (POE_NONE << PKEY_BITS_PER_PKEY) |
+ POE_RWX;
+
+ child = fork();
+ pkey_assert(child >= 0);
+ dprintf3("[%d] fork() ret: %d\n", getpid(), child);
+ if (!child) {
+ ptrace(PTRACE_TRACEME, 0, 0, 0);
+
+ /* Stop and allow the tracer to modify PKRU directly */
+ raise(SIGSTOP);
+
+ /*
+ * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+ * checking
+ */
+ if (__read_pkey_reg() != new_pkey)
+ exit(1);
+
+ raise(SIGSTOP);
+
+ exit(0);
+ }
+
+ pkey_assert(child == waitpid(child, &status, 0));
+ dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+ pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+ iov.iov_base = &trace_pkey;
+ iov.iov_len = 8;
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov);
+ pkey_assert(ret == 0);
+ pkey_assert(trace_pkey == read_pkey_reg());
+
+ trace_pkey = new_pkey;
+
+ ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_ARM_POE, &iov);
+ pkey_assert(ret == 0);
+
+ /* Test that the modification is visible in ptrace before any execution */
+ memset(&trace_pkey, 0, sizeof(trace_pkey));
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov);
+ pkey_assert(ret == 0);
+ pkey_assert(trace_pkey == new_pkey);
+
+ /* Execute the tracee */
+ ret = ptrace(PTRACE_CONT, child, 0, 0);
+ pkey_assert(ret == 0);
+
+ /* Test that the tracee saw the PKRU value change */
+ pkey_assert(child == waitpid(child, &status, 0));
+ dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+ pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+ /* Test that the modification is visible in ptrace after execution */
+ memset(&trace_pkey, 0, sizeof(trace_pkey));
+ ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_ARM_POE, &iov);
+ pkey_assert(ret == 0);
+ pkey_assert(trace_pkey == new_pkey);
+
+ ret = ptrace(PTRACE_CONT, child, 0, 0);
+ pkey_assert(ret == 0);
+ pkey_assert(child == waitpid(child, &status, 0));
+ dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+ pkey_assert(WIFEXITED(status));
+ pkey_assert(WEXITSTATUS(status) == 0);
+}
+#endif
+
+static void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
+{
+ int size = PAGE_SIZE;
+ int sret;
+
+ if (cpu_has_pkeys()) {
+ dprintf1("SKIP: %s: no CPU support\n", __func__);
+ return;
+ }
+
+ sret = syscall(__NR_pkey_mprotect, ptr, size, PROT_READ, pkey);
+ pkey_assert(sret < 0);
+}
+
+static void (*pkey_tests[])(int *ptr, u16 pkey) = {
+ test_read_of_write_disabled_region,
+ test_read_of_access_disabled_region,
+ test_read_of_access_disabled_region_with_page_already_mapped,
+ test_write_of_write_disabled_region,
+ test_write_of_write_disabled_region_with_page_already_mapped,
+ test_write_of_access_disabled_region,
+ test_write_of_access_disabled_region_with_page_already_mapped,
+ test_kernel_write_of_access_disabled_region,
+ test_kernel_write_of_write_disabled_region,
+ test_kernel_gup_of_access_disabled_region,
+ test_kernel_gup_write_to_write_disabled_region,
+ test_executing_on_unreadable_memory,
+ test_implicit_mprotect_exec_only_memory,
+ test_mprotect_with_pkey_0,
+ test_ptrace_of_child,
+ test_pkey_init_state,
+ test_pkey_syscalls_on_non_allocated_pkey,
+ test_pkey_syscalls_bad_args,
+ test_pkey_alloc_exhaust,
+ test_pkey_alloc_free_attach_pkey0,
+#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
+ test_ptrace_modifies_pkru,
+#endif
+};
+
+static void run_tests_once(void)
+{
+ int *ptr;
+ int prot = PROT_READ|PROT_WRITE;
+
+ for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
+ int pkey;
+ int orig_pkey_faults = pkey_faults;
+
+ dprintf1("======================\n");
+ dprintf1("test %d preparing...\n", test_nr);
+
+ tracing_on();
+ pkey = alloc_random_pkey();
+ dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
+ ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
+ dprintf1("test %d starting...\n", test_nr);
+ pkey_tests[test_nr](ptr, pkey);
+ dprintf1("freeing test memory: %p\n", ptr);
+ free_pkey_malloc(ptr);
+ sys_pkey_free(pkey);
+
+ dprintf1("pkey_faults: %d\n", pkey_faults);
+ dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
+
+ tracing_off();
+ close_test_fds();
+
+ printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
+ dprintf1("======================\n\n");
+ }
+ iteration_nr++;
+}
+
+static void pkey_setup_shadow(void)
+{
+ shadow_pkey_reg = __read_pkey_reg();
+}
+
+int main(void)
+{
+ int nr_iterations = 22;
+ int pkeys_supported = is_pkeys_supported();
+
+ srand((unsigned int)time(NULL));
+
+ setup_handlers();
+
+ printf("has pkeys: %d\n", pkeys_supported);
+
+ if (!pkeys_supported) {
+ int size = PAGE_SIZE;
+ int *ptr;
+
+ printf("running PKEY tests for unsupported CPU/OS\n");
+
+ ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ assert(ptr != (void *)-1);
+ test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
+ exit(0);
+ }
+
+ pkey_setup_shadow();
+ printf("startup pkey_reg: %016llx\n", read_pkey_reg());
+ setup_hugetlbfs();
+
+ while (nr_iterations-- > 0)
+ run_tests_once();
+
+ printf("done (all tests OK)\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
new file mode 100755
index 000000000000..dddd1dd8af14
--- /dev/null
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -0,0 +1,527 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Please run as root
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+count_total=0
+count_pass=0
+count_fail=0
+count_skip=0
+exitcode=0
+
+usage() {
+ cat <<EOF
+usage: ${BASH_SOURCE[0]:-$0} [ options ]
+
+ -a: run all tests, including extra ones (other than destructive ones)
+ -t: specify specific categories to tests to run
+ -h: display this message
+ -n: disable TAP output
+ -d: run destructive tests
+
+The default behavior is to run required tests only. If -a is specified,
+will run all tests.
+
+Alternatively, specific groups tests can be run by passing a string
+to the -t argument containing one or more of the following categories
+separated by spaces:
+- mmap
+ tests for mmap(2)
+- gup_test
+ tests for gup
+- userfaultfd
+ tests for userfaultfd(2)
+- compaction
+ a test for the patch "Allow compaction of unevictable pages"
+- mlock
+ tests for mlock(2)
+- mremap
+ tests for mremap(2)
+- hugevm
+ tests for very large virtual address space
+- vmalloc
+ vmalloc smoke tests
+- hmm
+ hmm smoke tests
+- madv_guard
+ test madvise(2) MADV_GUARD_INSTALL and MADV_GUARD_REMOVE options
+- madv_populate
+ test memadvise(2) MADV_POPULATE_{READ,WRITE} options
+- memfd_secret
+ test memfd_secret(2)
+- process_mrelease
+ test process_mrelease(2)
+- ksm
+ ksm tests that do not require >=2 NUMA nodes
+- ksm_numa
+ ksm tests that require >=2 NUMA nodes
+- pkey
+ memory protection key tests
+- soft_dirty
+ test soft dirty page bit semantics
+- pagemap
+ test pagemap_scan IOCTL
+- pfnmap
+ tests for VM_PFNMAP handling
+- cow
+ test copy-on-write semantics
+- thp
+ test transparent huge pages
+- hugetlb
+ test hugetlbfs huge pages
+- migration
+ invoke move_pages(2) to exercise the migration entry code
+ paths in the kernel
+- mkdirty
+ test handling of code that might set PTE/PMD dirty in
+ read-only VMAs
+- mdwe
+ test prctl(PR_SET_MDWE, ...)
+- page_frag
+ test handling of page fragment allocation and freeing
+- vma_merge
+ test VMA merge cases behave as expected
+
+example: ./run_vmtests.sh -t "hmm mmap ksm"
+EOF
+ exit 0
+}
+
+RUN_ALL=false
+RUN_DESTRUCTIVE=false
+TAP_PREFIX="# "
+
+while getopts "aht:n" OPT; do
+ case ${OPT} in
+ "a") RUN_ALL=true ;;
+ "h") usage ;;
+ "t") VM_SELFTEST_ITEMS=${OPTARG} ;;
+ "n") TAP_PREFIX= ;;
+ "d") RUN_DESTRUCTIVE=true ;;
+ esac
+done
+shift $((OPTIND -1))
+
+# default behavior: run all tests
+VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
+
+test_selected() {
+ if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
+ # If no VM_SELFTEST_ITEMS are specified, run all tests
+ return 0
+ fi
+ # If test selected argument is one of the test items
+ if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then
+ return 0
+ else
+ return 1
+ fi
+}
+
+run_gup_matrix() {
+ # -t: thp=on, -T: thp=off, -H: hugetlb=on
+ local hugetlb_mb=$(( needmem_KB / 1024 ))
+
+ for huge in -t -T "-H -m $hugetlb_mb"; do
+ # -u: gup-fast, -U: gup-basic, -a: pin-fast, -b: pin-basic, -L: pin-longterm
+ for test_cmd in -u -U -a -b -L; do
+ # -w: write=1, -W: write=0
+ for write in -w -W; do
+ # -S: shared
+ for share in -S " "; do
+ # -n: How many pages to fetch together? 512 is special
+ # because it's default thp size (or 2M on x86), 123 to
+ # just test partial gup when hit a huge in whatever form
+ for num in "-n 1" "-n 512" "-n 123"; do
+ CATEGORY="gup_test" run_test ./gup_test \
+ $huge $test_cmd $write $share $num
+ done
+ done
+ done
+ done
+ done
+}
+
+# get huge pagesize and freepages from /proc/meminfo
+while read -r name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs="$size"
+ fi
+ if [ "$name" = "Hugepagesize:" ]; then
+ hpgsize_KB="$size"
+ fi
+done < /proc/meminfo
+
+# Simple hugetlbfs tests have a hardcoded minimum requirement of
+# huge pages totaling 256MB (262144KB) in size. The userfaultfd
+# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take
+# both of these requirements into account and attempt to increase
+# number of huge pages available.
+nr_cpus=$(nproc)
+uffd_min_KB=$((hpgsize_KB * nr_cpus * 2))
+hugetlb_min_KB=$((256 * 1024))
+if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then
+ needmem_KB=$uffd_min_KB
+else
+ needmem_KB=$hugetlb_min_KB
+fi
+
+# set proper nr_hugepages
+if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
+ nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+ needpgs=$((needmem_KB / hpgsize_KB))
+ tries=2
+ while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do
+ lackpgs=$((needpgs - freepgs))
+ echo 3 > /proc/sys/vm/drop_caches
+ if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then
+ echo "Please run this test as root"
+ exit $ksft_skip
+ fi
+ while read -r name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs=$size
+ fi
+ done < /proc/meminfo
+ tries=$((tries - 1))
+ done
+ if [ "$freepgs" -lt "$needpgs" ]; then
+ printf "Not enough huge pages available (%d < %d)\n" \
+ "$freepgs" "$needpgs"
+ fi
+ HAVE_HUGEPAGES=1
+else
+ echo "no hugetlbfs support in kernel?"
+ HAVE_HUGEPAGES=0
+fi
+
+# filter 64bit architectures
+ARCH64STR="arm64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64"
+if [ -z "$ARCH" ]; then
+ ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/')
+fi
+VADDR64=0
+echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1
+
+tap_prefix() {
+ sed -e "s/^/${TAP_PREFIX}/"
+}
+
+tap_output() {
+ if [[ ! -z "$TAP_PREFIX" ]]; then
+ read str
+ echo $str
+ fi
+}
+
+pretty_name() {
+ echo "$*" | sed -e 's/^\(bash \)\?\.\///'
+}
+
+# Usage: run_test [test binary] [arbitrary test arguments...]
+run_test() {
+ if test_selected ${CATEGORY}; then
+ local skip=0
+
+ # On memory constrainted systems some tests can fail to allocate hugepages.
+ # perform some cleanup before the test for a higher success rate.
+ if [ ${CATEGORY} == "thp" -o ${CATEGORY} == "hugetlb" ]; then
+ if [ "${HAVE_HUGEPAGES}" = "1" ]; then
+ echo 3 > /proc/sys/vm/drop_caches
+ sleep 2
+ echo 1 > /proc/sys/vm/compact_memory
+ sleep 2
+ else
+ echo "hugepages not supported" | tap_prefix
+ skip=1
+ fi
+ fi
+
+ local test=$(pretty_name "$*")
+ local title="running $*"
+ local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
+ printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" | tap_prefix
+
+ if [ "${skip}" != "1" ]; then
+ ("$@" 2>&1) | tap_prefix
+ local ret=${PIPESTATUS[0]}
+ else
+ local ret=$ksft_skip
+ fi
+ count_total=$(( count_total + 1 ))
+ if [ $ret -eq 0 ]; then
+ count_pass=$(( count_pass + 1 ))
+ echo "[PASS]" | tap_prefix
+ echo "ok ${count_total} ${test}" | tap_output
+ elif [ $ret -eq $ksft_skip ]; then
+ count_skip=$(( count_skip + 1 ))
+ echo "[SKIP]" | tap_prefix
+ echo "ok ${count_total} ${test} # SKIP" | tap_output
+ exitcode=$ksft_skip
+ else
+ count_fail=$(( count_fail + 1 ))
+ echo "[FAIL]" | tap_prefix
+ echo "not ok ${count_total} ${test} # exit=$ret" | tap_output
+ exitcode=1
+ fi
+ fi # test_selected
+}
+
+echo "TAP version 13" | tap_output
+
+CATEGORY="hugetlb" run_test ./hugepage-mmap
+
+shmmax=$(cat /proc/sys/kernel/shmmax)
+shmall=$(cat /proc/sys/kernel/shmall)
+echo 268435456 > /proc/sys/kernel/shmmax
+echo 4194304 > /proc/sys/kernel/shmall
+CATEGORY="hugetlb" run_test ./hugepage-shm
+echo "$shmmax" > /proc/sys/kernel/shmmax
+echo "$shmall" > /proc/sys/kernel/shmall
+
+CATEGORY="hugetlb" run_test ./map_hugetlb
+CATEGORY="hugetlb" run_test ./hugepage-mremap
+CATEGORY="hugetlb" run_test ./hugepage-vmemmap
+CATEGORY="hugetlb" run_test ./hugetlb-madvise
+CATEGORY="hugetlb" run_test ./hugetlb_dio
+
+if [ "${HAVE_HUGEPAGES}" = "1" ]; then
+ nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages)
+ # For this test, we need one and just one huge page
+ echo 1 > /proc/sys/vm/nr_hugepages
+ CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv
+ CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map
+ # Restore the previous number of huge pages, since further tests rely on it
+ echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages
+fi
+
+if test_selected "hugetlb"; then
+ echo "NOTE: These hugetlb tests provide minimal coverage. Use" | tap_prefix
+ echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" | tap_prefix
+ echo " hugetlb regression testing." | tap_prefix
+fi
+
+CATEGORY="mmap" run_test ./map_fixed_noreplace
+
+if $RUN_ALL; then
+ run_gup_matrix
+else
+ # get_user_pages_fast() benchmark
+ CATEGORY="gup_test" run_test ./gup_test -u
+ # pin_user_pages_fast() benchmark
+ CATEGORY="gup_test" run_test ./gup_test -a
+fi
+# Dump pages 0, 19, and 4096, using pin_user_pages:
+CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
+CATEGORY="gup_test" run_test ./gup_longterm
+
+CATEGORY="userfaultfd" run_test ./uffd-unit-tests
+uffd_stress_bin=./uffd-stress
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16
+# Hugetlb tests require source and destination huge pages. Pass in half
+# the size of the free pages we have, which is used for *each*.
+# uffd-stress expects a region expressed in MiB, so we adjust
+# half_ufd_size_MB accordingly.
+half_ufd_size_MB=$(((freepgs * hpgsize_KB) / 1024 / 2))
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16
+CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16
+# uffd-wp-mremap requires at least one page of each size.
+have_all_size_hugepgs=true
+declare -A nr_size_hugepgs
+for f in /sys/kernel/mm/hugepages/**/nr_hugepages; do
+ old=$(cat $f)
+ nr_size_hugepgs["$f"]="$old"
+ if [ "$old" == 0 ]; then
+ echo 1 > "$f"
+ fi
+ if [ $(cat "$f") == 0 ]; then
+ have_all_size_hugepgs=false
+ break
+ fi
+done
+if $have_all_size_hugepgs; then
+ CATEGORY="userfaultfd" run_test ./uffd-wp-mremap
+else
+ echo "# SKIP ./uffd-wp-mremap"
+fi
+
+#cleanup
+for f in "${!nr_size_hugepgs[@]}"; do
+ echo "${nr_size_hugepgs["$f"]}" > "$f"
+done
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
+
+CATEGORY="compaction" run_test ./compaction_test
+
+if command -v sudo &> /dev/null && sudo -u nobody ls ./on-fault-limit >/dev/null;
+then
+ CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
+else
+ echo "# SKIP ./on-fault-limit"
+fi
+
+CATEGORY="mmap" run_test ./map_populate
+
+CATEGORY="mlock" run_test ./mlock-random-test
+
+CATEGORY="mlock" run_test ./mlock2-tests
+
+CATEGORY="process_mrelease" run_test ./mrelease_test
+
+CATEGORY="mremap" run_test ./mremap_test
+
+CATEGORY="hugetlb" run_test ./thuge-gen
+CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2
+CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2
+if $RUN_DESTRUCTIVE; then
+nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages)
+enable_soft_offline=$(cat /proc/sys/vm/enable_soft_offline)
+echo 8 > /proc/sys/vm/nr_hugepages
+CATEGORY="hugetlb" run_test ./hugetlb-soft-offline
+echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages
+echo "$enable_soft_offline" > /proc/sys/vm/enable_soft_offline
+CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison
+fi
+
+if [ $VADDR64 -ne 0 ]; then
+
+ # set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel
+ # allows high virtual address allocation requests independent
+ # of platform's physical memory.
+
+ if [ -x ./virtual_address_range ]; then
+ prev_policy=$(cat /proc/sys/vm/overcommit_memory)
+ echo 1 > /proc/sys/vm/overcommit_memory
+ CATEGORY="hugevm" run_test ./virtual_address_range
+ echo $prev_policy > /proc/sys/vm/overcommit_memory
+ fi
+
+ # va high address boundary switch test
+ ARCH_ARM64="arm64"
+ prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages)
+ if [ "$ARCH" == "$ARCH_ARM64" ]; then
+ echo 6 > /proc/sys/vm/nr_hugepages
+ fi
+ CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh
+ if [ "$ARCH" == "$ARCH_ARM64" ]; then
+ echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages
+ fi
+fi # VADDR64
+
+# vmalloc stability smoke test
+CATEGORY="vmalloc" run_test bash ./test_vmalloc.sh smoke
+
+CATEGORY="mremap" run_test ./mremap_dontunmap
+
+CATEGORY="hmm" run_test bash ./test_hmm.sh smoke
+
+# MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests
+CATEGORY="madv_guard" run_test ./guard-regions
+
+# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+CATEGORY="madv_populate" run_test ./madv_populate
+
+CATEGORY="vma_merge" run_test ./merge
+
+if [ -x ./memfd_secret ]
+then
+(echo 0 > /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix
+CATEGORY="memfd_secret" run_test ./memfd_secret
+fi
+
+# KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100
+if [ "${HAVE_HUGEPAGES}" = "1" ]; then
+ CATEGORY="ksm" run_test ./ksm_tests -H -s 100
+fi
+# KSM KSM_MERGE_TIME test with size of 100
+CATEGORY="ksm" run_test ./ksm_tests -P -s 100
+# KSM MADV_MERGEABLE test with 10 identical pages
+CATEGORY="ksm" run_test ./ksm_tests -M -p 10
+# KSM unmerge test
+CATEGORY="ksm" run_test ./ksm_tests -U
+# KSM test with 10 zero pages and use_zero_pages = 0
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
+# KSM test with 10 zero pages and use_zero_pages = 1
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 1
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 0
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
+
+CATEGORY="ksm" run_test ./ksm_functional_tests
+
+# protection_keys tests
+nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+if [ -x ./protection_keys_32 ]
+then
+ CATEGORY="pkey" run_test ./protection_keys_32
+fi
+
+if [ -x ./protection_keys_64 ]
+then
+ CATEGORY="pkey" run_test ./protection_keys_64
+fi
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
+
+if [ -x ./soft-dirty ]
+then
+ CATEGORY="soft_dirty" run_test ./soft-dirty
+fi
+
+CATEGORY="pagemap" run_test ./pagemap_ioctl
+
+CATEGORY="pfnmap" run_test ./pfnmap
+
+# COW tests
+CATEGORY="cow" run_test ./cow
+
+CATEGORY="thp" run_test ./khugepaged
+
+CATEGORY="thp" run_test ./khugepaged -s 2
+
+CATEGORY="thp" run_test ./transhuge-stress -d 20
+
+# Try to create XFS if not provided
+if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then
+ if [ "${HAVE_HUGEPAGES}" = "1" ]; then
+ if test_selected "thp"; then
+ if grep xfs /proc/filesystems &>/dev/null; then
+ XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX)
+ SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX)
+ truncate -s 314572800 ${XFS_IMG}
+ mkfs.xfs -q ${XFS_IMG}
+ mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+ MOUNTED_XFS=1
+ fi
+ fi
+ fi
+fi
+
+CATEGORY="thp" run_test ./split_huge_page_test ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+
+if [ -n "${MOUNTED_XFS}" ]; then
+ umount ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+ rmdir ${SPLIT_HUGE_PAGE_TEST_XFS_PATH}
+ rm -f ${XFS_IMG}
+fi
+
+CATEGORY="migration" run_test ./migration
+
+CATEGORY="mkdirty" run_test ./mkdirty
+
+CATEGORY="mdwe" run_test ./mdwe_test
+
+CATEGORY="page_frag" run_test ./test_page_frag.sh smoke
+
+CATEGORY="page_frag" run_test ./test_page_frag.sh aligned
+
+CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned
+
+echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" | tap_prefix
+echo "1..${count_total}" | tap_output
+
+exit $exitcode
diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings
new file mode 100644
index 000000000000..e2206265f67c
--- /dev/null
+++ b/tools/testing/selftests/mm/settings
@@ -0,0 +1 @@
+timeout=900
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
new file mode 100644
index 000000000000..8e1462ce0532
--- /dev/null
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <sys/mman.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define PAGEMAP_FILE_PATH "/proc/self/pagemap"
+#define TEST_ITERATIONS 10000
+
+static void test_simple(int pagemap_fd, int pagesize)
+{
+ int i;
+ char *map;
+
+ map = aligned_alloc(pagesize, pagesize);
+ if (!map)
+ ksft_exit_fail_msg("mmap failed\n");
+
+ clear_softdirty();
+
+ for (i = 0 ; i < TEST_ITERATIONS; i++) {
+ if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
+ ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
+ break;
+ }
+
+ clear_softdirty();
+ // Write something to the page to get the dirty bit enabled on the page
+ map[0]++;
+
+ if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
+ ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
+ break;
+ }
+
+ clear_softdirty();
+ }
+ free(map);
+
+ ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__);
+}
+
+static void test_vma_reuse(int pagemap_fd, int pagesize)
+{
+ char *map, *map2;
+
+ map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed");
+
+ // The kernel always marks new regions as soft dirty
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+ "Test %s dirty bit of allocated page\n", __func__);
+
+ clear_softdirty();
+ munmap(map, pagesize);
+
+ map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
+ if (map2 == MAP_FAILED)
+ ksft_exit_fail_msg("mmap failed");
+
+ // Dirty bit is set for new regions even if they are reused
+ if (map == map2)
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1,
+ "Test %s dirty bit of reused address page\n", __func__);
+ else
+ ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__);
+
+ munmap(map2, pagesize);
+}
+
+static void test_hugepage(int pagemap_fd, int pagesize)
+{
+ char *map;
+ int i, ret;
+ size_t hpage_len = read_pmd_pagesize();
+
+ if (!hpage_len)
+ ksft_exit_fail_msg("Reading PMD pagesize failed");
+
+ map = memalign(hpage_len, hpage_len);
+ if (!map)
+ ksft_exit_fail_msg("memalign failed\n");
+
+ ret = madvise(map, hpage_len, MADV_HUGEPAGE);
+ if (ret)
+ ksft_exit_fail_msg("madvise failed %d\n", ret);
+
+ for (i = 0; i < hpage_len; i++)
+ map[i] = (char)i;
+
+ if (check_huge_anon(map, 1, hpage_len)) {
+ ksft_test_result_pass("Test %s huge page allocation\n", __func__);
+
+ clear_softdirty();
+ for (i = 0 ; i < TEST_ITERATIONS ; i++) {
+ if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
+ ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
+ break;
+ }
+
+ clear_softdirty();
+ // Write something to the page to get the dirty bit enabled on the page
+ map[0]++;
+
+ if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
+ ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
+ break;
+ }
+ clear_softdirty();
+ }
+
+ ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__);
+ } else {
+ // hugepage allocation failed. skip these tests
+ ksft_test_result_skip("Test %s huge page allocation\n", __func__);
+ ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
+ }
+ free(map);
+}
+
+static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
+{
+ const char *type[] = {"file", "anon"};
+ const char *fname = "./soft-dirty-test-file";
+ int test_fd = 0;
+ char *map;
+
+ if (anon) {
+ map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ if (!map)
+ ksft_exit_fail_msg("anon mmap failed\n");
+ } else {
+ test_fd = open(fname, O_RDWR | O_CREAT, 0664);
+ if (test_fd < 0) {
+ ksft_test_result_skip("Test %s open() file failed\n", __func__);
+ return;
+ }
+ unlink(fname);
+ ftruncate(test_fd, pagesize);
+ map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+ MAP_SHARED, test_fd, 0);
+ if (!map)
+ ksft_exit_fail_msg("file mmap failed\n");
+ }
+
+ *map = 1;
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+ "Test %s-%s dirty bit of new written page\n",
+ __func__, type[anon]);
+ clear_softdirty();
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+ "Test %s-%s soft-dirty clear after clear_refs\n",
+ __func__, type[anon]);
+ mprotect(map, pagesize, PROT_READ);
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+ "Test %s-%s soft-dirty clear after marking RO\n",
+ __func__, type[anon]);
+ mprotect(map, pagesize, PROT_READ|PROT_WRITE);
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+ "Test %s-%s soft-dirty clear after marking RW\n",
+ __func__, type[anon]);
+ *map = 2;
+ ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+ "Test %s-%s soft-dirty after rewritten\n",
+ __func__, type[anon]);
+
+ munmap(map, pagesize);
+
+ if (!anon)
+ close(test_fd);
+}
+
+static void test_mprotect_anon(int pagemap_fd, int pagesize)
+{
+ test_mprotect(pagemap_fd, pagesize, true);
+}
+
+static void test_mprotect_file(int pagemap_fd, int pagesize)
+{
+ test_mprotect(pagemap_fd, pagesize, false);
+}
+
+int main(int argc, char **argv)
+{
+ int pagemap_fd;
+ int pagesize;
+
+ ksft_print_header();
+ ksft_set_plan(15);
+
+ pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
+
+ pagesize = getpagesize();
+
+ test_simple(pagemap_fd, pagesize);
+ test_vma_reuse(pagemap_fd, pagesize);
+ test_hugepage(pagemap_fd, pagesize);
+ test_mprotect_anon(pagemap_fd, pagesize);
+ test_mprotect_file(pagemap_fd, pagesize);
+
+ close(pagemap_fd);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
new file mode 100644
index 000000000000..aa7400ed0e99
--- /dev/null
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -0,0 +1,572 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via <debugfs>/split_huge_pages interface.
+ */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <malloc.h>
+#include <stdbool.h>
+#include <time.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+uint64_t pagesize;
+unsigned int pageshift;
+uint64_t pmd_pagesize;
+
+#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
+#define SMAP_PATH "/proc/self/smaps"
+#define INPUT_MAX 80
+
+#define PID_FMT "%d,0x%lx,0x%lx,%d"
+#define PATH_FMT "%s,0x%lx,0x%lx,%d"
+
+#define PFN_MASK ((1UL<<55)-1)
+#define KPF_THP (1UL<<22)
+
+int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+{
+ uint64_t paddr;
+ uint64_t page_flags;
+
+ if (pagemap_file) {
+ pread(pagemap_file, &paddr, sizeof(paddr),
+ ((long)vaddr >> pageshift) * sizeof(paddr));
+
+ if (kpageflags_file) {
+ pread(kpageflags_file, &page_flags, sizeof(page_flags),
+ (paddr & PFN_MASK) * sizeof(page_flags));
+
+ return !!(page_flags & KPF_THP);
+ }
+ }
+ return 0;
+}
+
+static void write_file(const char *path, const char *buf, size_t buflen)
+{
+ int fd;
+ ssize_t numwritten;
+
+ fd = open(path, O_WRONLY);
+ if (fd == -1)
+ ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno));
+
+ numwritten = write(fd, buf, buflen - 1);
+ close(fd);
+ if (numwritten < 1)
+ ksft_exit_fail_msg("Write failed\n");
+}
+
+static void write_debugfs(const char *fmt, ...)
+{
+ char input[INPUT_MAX];
+ int ret;
+ va_list argp;
+
+ va_start(argp, fmt);
+ ret = vsnprintf(input, INPUT_MAX, fmt, argp);
+ va_end(argp);
+
+ if (ret >= INPUT_MAX)
+ ksft_exit_fail_msg("%s: Debugfs input is too long\n", __func__);
+
+ write_file(SPLIT_DEBUGFS, input, ret + 1);
+}
+
+static char *allocate_zero_filled_hugepage(size_t len)
+{
+ char *result;
+ size_t i;
+
+ result = memalign(pmd_pagesize, len);
+ if (!result) {
+ printf("Fail to allocate memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ madvise(result, len, MADV_HUGEPAGE);
+
+ for (i = 0; i < len; i++)
+ result[i] = (char)0;
+
+ return result;
+}
+
+static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len)
+{
+ unsigned long rss_anon_before, rss_anon_after;
+ size_t i;
+
+ if (!check_huge_anon(one_page, 4, pmd_pagesize))
+ ksft_exit_fail_msg("No THP is allocated\n");
+
+ rss_anon_before = rss_anon();
+ if (!rss_anon_before)
+ ksft_exit_fail_msg("No RssAnon is allocated before split\n");
+
+ /* split all THPs */
+ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+ (uint64_t)one_page + len, 0);
+
+ for (i = 0; i < len; i++)
+ if (one_page[i] != (char)0)
+ ksft_exit_fail_msg("%ld byte corrupted\n", i);
+
+ if (!check_huge_anon(one_page, 0, pmd_pagesize))
+ ksft_exit_fail_msg("Still AnonHugePages not split\n");
+
+ rss_anon_after = rss_anon();
+ if (rss_anon_after >= rss_anon_before)
+ ksft_exit_fail_msg("Incorrect RssAnon value. Before: %ld After: %ld\n",
+ rss_anon_before, rss_anon_after);
+}
+
+void split_pmd_zero_pages(void)
+{
+ char *one_page;
+ int nr_hpages = 4;
+ size_t len = nr_hpages * pmd_pagesize;
+
+ one_page = allocate_zero_filled_hugepage(len);
+ verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len);
+ ksft_test_result_pass("Split zero filled huge pages successful\n");
+ free(one_page);
+}
+
+void split_pmd_thp_to_order(int order)
+{
+ char *one_page;
+ size_t len = 4 * pmd_pagesize;
+ size_t i;
+
+ one_page = memalign(pmd_pagesize, len);
+ if (!one_page)
+ ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
+
+ madvise(one_page, len, MADV_HUGEPAGE);
+
+ for (i = 0; i < len; i++)
+ one_page[i] = (char)i;
+
+ if (!check_huge_anon(one_page, 4, pmd_pagesize))
+ ksft_exit_fail_msg("No THP is allocated\n");
+
+ /* split all THPs */
+ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+ (uint64_t)one_page + len, order);
+
+ for (i = 0; i < len; i++)
+ if (one_page[i] != (char)i)
+ ksft_exit_fail_msg("%ld byte corrupted\n", i);
+
+
+ if (!check_huge_anon(one_page, 0, pmd_pagesize))
+ ksft_exit_fail_msg("Still AnonHugePages not split\n");
+
+ ksft_test_result_pass("Split huge pages to order %d successful\n", order);
+ free(one_page);
+}
+
+void split_pte_mapped_thp(void)
+{
+ char *one_page, *pte_mapped, *pte_mapped2;
+ size_t len = 4 * pmd_pagesize;
+ uint64_t thp_size;
+ size_t i;
+ const char *pagemap_template = "/proc/%d/pagemap";
+ const char *kpageflags_proc = "/proc/kpageflags";
+ char pagemap_proc[255];
+ int pagemap_fd;
+ int kpageflags_fd;
+
+ if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0)
+ ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno));
+
+ pagemap_fd = open(pagemap_proc, O_RDONLY);
+ if (pagemap_fd == -1)
+ ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno));
+
+ kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+ if (kpageflags_fd == -1)
+ ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno));
+
+ one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (one_page == MAP_FAILED)
+ ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
+
+ madvise(one_page, len, MADV_HUGEPAGE);
+
+ for (i = 0; i < len; i++)
+ one_page[i] = (char)i;
+
+ if (!check_huge_anon(one_page, 4, pmd_pagesize))
+ ksft_exit_fail_msg("No THP is allocated\n");
+
+ /* remap the first pagesize of first THP */
+ pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
+
+ /* remap the Nth pagesize of Nth THP */
+ for (i = 1; i < 4; i++) {
+ pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
+ pagesize, pagesize,
+ MREMAP_MAYMOVE|MREMAP_FIXED,
+ pte_mapped + pagesize * i);
+ if (pte_mapped2 == MAP_FAILED)
+ ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno));
+ }
+
+ /* smap does not show THPs after mremap, use kpageflags instead */
+ thp_size = 0;
+ for (i = 0; i < pagesize * 4; i++)
+ if (i % pagesize == 0 &&
+ is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+ thp_size++;
+
+ if (thp_size != 4)
+ ksft_exit_fail_msg("Some THPs are missing during mremap\n");
+
+ /* split all remapped THPs */
+ write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
+ (uint64_t)pte_mapped + pagesize * 4, 0);
+
+ /* smap does not show THPs after mremap, use kpageflags instead */
+ thp_size = 0;
+ for (i = 0; i < pagesize * 4; i++) {
+ if (pte_mapped[i] != (char)i)
+ ksft_exit_fail_msg("%ld byte corrupted\n", i);
+
+ if (i % pagesize == 0 &&
+ is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+ thp_size++;
+ }
+
+ if (thp_size)
+ ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size);
+
+ ksft_test_result_pass("Split PTE-mapped huge pages successful\n");
+ munmap(one_page, len);
+ close(pagemap_fd);
+ close(kpageflags_fd);
+}
+
+void split_file_backed_thp(int order)
+{
+ int status;
+ int fd;
+ char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
+ const char *tmpfs_loc = mkdtemp(tmpfs_template);
+ char testfile[INPUT_MAX];
+ ssize_t num_written, num_read;
+ char *file_buf1, *file_buf2;
+ uint64_t pgoff_start = 0, pgoff_end = 1024;
+ int i;
+
+ ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n");
+
+ file_buf1 = (char *)malloc(pmd_pagesize);
+ file_buf2 = (char *)malloc(pmd_pagesize);
+
+ if (!file_buf1 || !file_buf2) {
+ ksft_print_msg("cannot allocate file buffers\n");
+ goto out;
+ }
+
+ for (i = 0; i < pmd_pagesize; i++)
+ file_buf1[i] = (char)i;
+ memset(file_buf2, 0, pmd_pagesize);
+
+ status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
+
+ if (status)
+ ksft_exit_fail_msg("Unable to create a tmpfs for testing\n");
+
+ status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
+ if (status >= INPUT_MAX) {
+ ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n");
+ goto cleanup;
+ }
+
+ fd = open(testfile, O_CREAT|O_RDWR, 0664);
+ if (fd == -1) {
+ ksft_perror("Cannot open testing file");
+ goto cleanup;
+ }
+
+ /* write pmd size data to the file, so a file-backed THP can be allocated */
+ num_written = write(fd, file_buf1, pmd_pagesize);
+
+ if (num_written == -1 || num_written != pmd_pagesize) {
+ ksft_perror("Failed to write data to testing file");
+ goto close_file;
+ }
+
+ /* split the file-backed THP */
+ write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, order);
+
+ /* check file content after split */
+ status = lseek(fd, 0, SEEK_SET);
+ if (status == -1) {
+ ksft_perror("Cannot lseek file");
+ goto close_file;
+ }
+
+ num_read = read(fd, file_buf2, num_written);
+ if (num_read == -1 || num_read != num_written) {
+ ksft_perror("Cannot read file content back");
+ goto close_file;
+ }
+
+ if (strncmp(file_buf1, file_buf2, pmd_pagesize) != 0) {
+ ksft_print_msg("File content changed\n");
+ goto close_file;
+ }
+
+ close(fd);
+ status = unlink(testfile);
+ if (status) {
+ ksft_perror("Cannot remove testing file");
+ goto cleanup;
+ }
+
+ status = umount(tmpfs_loc);
+ if (status) {
+ rmdir(tmpfs_loc);
+ ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc);
+ }
+
+ status = rmdir(tmpfs_loc);
+ if (status)
+ ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno));
+
+ ksft_print_msg("Please check dmesg for more information\n");
+ ksft_test_result_pass("File-backed THP split to order %d test done\n", order);
+ return;
+
+close_file:
+ close(fd);
+cleanup:
+ umount(tmpfs_loc);
+ rmdir(tmpfs_loc);
+out:
+ ksft_exit_fail_msg("Error occurred\n");
+}
+
+bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
+ const char **thp_fs_loc)
+{
+ if (xfs_path) {
+ *thp_fs_loc = xfs_path;
+ return false;
+ }
+
+ *thp_fs_loc = mkdtemp(thp_fs_template);
+
+ if (!*thp_fs_loc)
+ ksft_exit_fail_msg("cannot create temp folder\n");
+
+ return true;
+}
+
+void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
+{
+ int status;
+
+ if (!created_tmp)
+ return;
+
+ status = rmdir(thp_fs_loc);
+ if (status)
+ ksft_exit_fail_msg("cannot remove tmp dir: %s\n",
+ strerror(errno));
+}
+
+int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
+ char **addr)
+{
+ size_t i;
+ int dummy = 0;
+ unsigned char buf[1024];
+
+ srand(time(NULL));
+
+ *fd = open(testfile, O_CREAT | O_RDWR, 0664);
+ if (*fd == -1)
+ ksft_exit_fail_msg("Failed to create a file at %s\n", testfile);
+
+ assert(fd_size % sizeof(buf) == 0);
+ for (i = 0; i < sizeof(buf); i++)
+ buf[i] = (unsigned char)i;
+ for (i = 0; i < fd_size; i += sizeof(buf))
+ write(*fd, buf, sizeof(buf));
+
+ close(*fd);
+ sync();
+ *fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
+ if (*fd == -1) {
+ ksft_perror("open drop_caches");
+ goto err_out_unlink;
+ }
+ if (write(*fd, "3", 1) != 1) {
+ ksft_perror("write to drop_caches");
+ goto err_out_unlink;
+ }
+ close(*fd);
+
+ *fd = open(testfile, O_RDWR);
+ if (*fd == -1) {
+ ksft_perror("Failed to open testfile\n");
+ goto err_out_unlink;
+ }
+
+ *addr = mmap(NULL, fd_size, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
+ if (*addr == (char *)-1) {
+ ksft_perror("cannot mmap");
+ goto err_out_close;
+ }
+ madvise(*addr, fd_size, MADV_HUGEPAGE);
+
+ for (size_t i = 0; i < fd_size; i++)
+ dummy += *(*addr + i);
+ asm volatile("" : "+r" (dummy));
+
+ if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) {
+ ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n");
+ munmap(*addr, fd_size);
+ close(*fd);
+ unlink(testfile);
+ ksft_test_result_skip("Pagecache folio split skipped\n");
+ return -2;
+ }
+ return 0;
+err_out_close:
+ close(*fd);
+err_out_unlink:
+ unlink(testfile);
+ ksft_exit_fail_msg("Failed to create large pagecache folios\n");
+ return -1;
+}
+
+void split_thp_in_pagecache_to_order_at(size_t fd_size, const char *fs_loc,
+ int order, int offset)
+{
+ int fd;
+ char *addr;
+ size_t i;
+ char testfile[INPUT_MAX];
+ int err = 0;
+
+ err = snprintf(testfile, INPUT_MAX, "%s/test", fs_loc);
+
+ if (err < 0)
+ ksft_exit_fail_msg("cannot generate right test file name\n");
+
+ err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr);
+ if (err)
+ return;
+ err = 0;
+
+ if (offset == -1)
+ write_debugfs(PID_FMT, getpid(), (uint64_t)addr,
+ (uint64_t)addr + fd_size, order);
+ else
+ write_debugfs(PID_FMT, getpid(), (uint64_t)addr,
+ (uint64_t)addr + fd_size, order, offset);
+
+ for (i = 0; i < fd_size; i++)
+ if (*(addr + i) != (char)i) {
+ ksft_print_msg("%lu byte corrupted in the file\n", i);
+ err = EXIT_FAILURE;
+ goto out;
+ }
+
+ if (!check_huge_file(addr, 0, pmd_pagesize)) {
+ ksft_print_msg("Still FilePmdMapped not split\n");
+ err = EXIT_FAILURE;
+ goto out;
+ }
+
+out:
+ munmap(addr, fd_size);
+ close(fd);
+ unlink(testfile);
+ if (offset == -1) {
+ if (err)
+ ksft_exit_fail_msg("Split PMD-mapped pagecache folio to order %d failed\n", order);
+ ksft_test_result_pass("Split PMD-mapped pagecache folio to order %d passed\n", order);
+ } else {
+ if (err)
+ ksft_exit_fail_msg("Split PMD-mapped pagecache folio to order %d at in-folio offset %d failed\n", order, offset);
+ ksft_test_result_pass("Split PMD-mapped pagecache folio to order %d at in-folio offset %d passed\n", order, offset);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ int i;
+ size_t fd_size;
+ char *optional_xfs_path = NULL;
+ char fs_loc_template[] = "/tmp/thp_fs_XXXXXX";
+ const char *fs_loc;
+ bool created_tmp;
+ int offset;
+
+ ksft_print_header();
+
+ if (geteuid() != 0) {
+ ksft_print_msg("Please run the benchmark as root\n");
+ ksft_finished();
+ }
+
+ if (argc > 1)
+ optional_xfs_path = argv[1];
+
+ ksft_set_plan(1+8+1+9+9+8*4+2);
+
+ pagesize = getpagesize();
+ pageshift = ffs(pagesize) - 1;
+ pmd_pagesize = read_pmd_pagesize();
+ if (!pmd_pagesize)
+ ksft_exit_fail_msg("Reading PMD pagesize failed\n");
+
+ fd_size = 2 * pmd_pagesize;
+
+ split_pmd_zero_pages();
+
+ for (i = 0; i < 9; i++)
+ if (i != 1)
+ split_pmd_thp_to_order(i);
+
+ split_pte_mapped_thp();
+ for (i = 0; i < 9; i++)
+ split_file_backed_thp(i);
+
+ created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template,
+ &fs_loc);
+ for (i = 8; i >= 0; i--)
+ split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, -1);
+
+ for (i = 0; i < 9; i++)
+ for (offset = 0;
+ offset < pmd_pagesize / pagesize;
+ offset += MAX(pmd_pagesize / pagesize / 4, 1 << i))
+ split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, offset);
+ cleanup_thp_fs(fs_loc, created_tmp);
+
+ ksft_finished();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh
new file mode 100755
index 000000000000..46e19b5d648d
--- /dev/null
+++ b/tools/testing/selftests/mm/test_hmm.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+# a) analyse performance of vmalloc allocations;
+# b) stressing and stability check of vmalloc subsystem.
+
+TEST_NAME="test_hmm"
+DRIVER="test_hmm"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_test_requirements()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo "$0: Must be run as root"
+ exit $ksft_skip
+ fi
+
+ if ! which modprobe > /dev/null 2>&1; then
+ echo "$0: You need modprobe installed"
+ exit $ksft_skip
+ fi
+
+ if ! modinfo $DRIVER > /dev/null 2>&1; then
+ echo "$0: You must have the following enabled in your kernel:"
+ echo "CONFIG_TEST_HMM=m"
+ exit $ksft_skip
+ fi
+}
+
+load_driver()
+{
+ if [ $# -eq 0 ]; then
+ modprobe $DRIVER > /dev/null 2>&1
+ else
+ if [ $# -eq 2 ]; then
+ modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
+ > /dev/null 2>&1
+ else
+ echo "Missing module parameters. Make sure pass"\
+ "spm_addr_dev0 and spm_addr_dev1"
+ usage
+ fi
+ fi
+}
+
+unload_driver()
+{
+ modprobe -r $DRIVER > /dev/null 2>&1
+}
+
+run_smoke()
+{
+ echo "Running smoke test. Note, this test provides basic coverage."
+
+ load_driver $1 $2
+ $(dirname "${BASH_SOURCE[0]}")/hmm-tests
+ unload_driver
+}
+
+usage()
+{
+ echo -n "Usage: $0"
+ echo
+ echo "Example usage:"
+ echo
+ echo "# Shows help message"
+ echo "./${TEST_NAME}.sh"
+ echo
+ echo "# Smoke testing"
+ echo "./${TEST_NAME}.sh smoke"
+ echo
+ echo "# Smoke testing with SPM enabled"
+ echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
+ echo
+ exit 0
+}
+
+function run_test()
+{
+ if [ $# -eq 0 ]; then
+ usage
+ else
+ if [ "$1" = "smoke" ]; then
+ run_smoke $2 $3
+ else
+ usage
+ fi
+ fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/mm/test_page_frag.sh b/tools/testing/selftests/mm/test_page_frag.sh
new file mode 100755
index 000000000000..f55b105084cf
--- /dev/null
+++ b/tools/testing/selftests/mm/test_page_frag.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2024 Yunsheng Lin <linyunsheng@huawei.com>
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to test the
+# correctness and performance of page_frag's implementation.
+# Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+# a) analyse performance of page fragment allocations;
+# b) stressing and stability check of page_frag subsystem.
+
+DRIVER="./page_frag/page_frag_test.ko"
+CPU_LIST=$(grep -m 2 processor /proc/cpuinfo | cut -d ' ' -f 2)
+TEST_CPU_0=$(echo $CPU_LIST | awk '{print $1}')
+
+if [ $(echo $CPU_LIST | wc -w) -gt 1 ]; then
+ TEST_CPU_1=$(echo $CPU_LIST | awk '{print $2}')
+ NR_TEST=100000000
+else
+ TEST_CPU_1=$TEST_CPU_0
+ NR_TEST=1000000
+fi
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_test_failed_prefix() {
+ if dmesg | grep -q 'page_frag_test failed:';then
+ echo "page_frag_test failed, please check dmesg"
+ exit $exitcode
+ fi
+}
+
+#
+# Static templates for testing of page_frag APIs.
+# Also it is possible to pass any supported parameters manually.
+#
+SMOKE_PARAM="test_push_cpu=$TEST_CPU_0 test_pop_cpu=$TEST_CPU_1"
+NONALIGNED_PARAM="$SMOKE_PARAM test_alloc_len=75 nr_test=$NR_TEST"
+ALIGNED_PARAM="$NONALIGNED_PARAM test_align=1"
+
+check_test_requirements()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo "$0: Must be run as root"
+ exit $ksft_skip
+ fi
+
+ if ! which insmod > /dev/null 2>&1; then
+ echo "$0: You need insmod installed"
+ exit $ksft_skip
+ fi
+
+ if [ ! -f $DRIVER ]; then
+ echo "$0: You need to compile page_frag_test module"
+ exit $ksft_skip
+ fi
+}
+
+run_nonaligned_check()
+{
+ echo "Run performance tests to evaluate how fast nonaligned alloc API is."
+
+ insmod $DRIVER $NONALIGNED_PARAM > /dev/null 2>&1
+}
+
+run_aligned_check()
+{
+ echo "Run performance tests to evaluate how fast aligned alloc API is."
+
+ insmod $DRIVER $ALIGNED_PARAM > /dev/null 2>&1
+}
+
+run_smoke_check()
+{
+ echo "Run smoke test."
+
+ insmod $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+}
+
+usage()
+{
+ echo -n "Usage: $0 [ aligned ] | [ nonaligned ] | | [ smoke ] | "
+ echo "manual parameters"
+ echo
+ echo "Valid tests and parameters:"
+ echo
+ modinfo $DRIVER
+ echo
+ echo "Example usage:"
+ echo
+ echo "# Shows help message"
+ echo "$0"
+ echo
+ echo "# Smoke testing"
+ echo "$0 smoke"
+ echo
+ echo "# Performance testing for nonaligned alloc API"
+ echo "$0 nonaligned"
+ echo
+ echo "# Performance testing for aligned alloc API"
+ echo "$0 aligned"
+ echo
+ exit 0
+}
+
+function validate_passed_args()
+{
+ VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+ #
+ # Something has been passed, check it.
+ #
+ for passed_arg in $@; do
+ key=${passed_arg//=*/}
+ valid=0
+
+ for valid_arg in $VALID_ARGS; do
+ if [[ $key = $valid_arg ]]; then
+ valid=1
+ break
+ fi
+ done
+
+ if [[ $valid -ne 1 ]]; then
+ echo "Error: key is not correct: ${key}"
+ exit $exitcode
+ fi
+ done
+}
+
+function run_manual_check()
+{
+ #
+ # Validate passed parameters. If there is wrong one,
+ # the script exists and does not execute further.
+ #
+ validate_passed_args $@
+
+ echo "Run the test with following parameters: $@"
+ insmod $DRIVER $@ > /dev/null 2>&1
+}
+
+function run_test()
+{
+ if [ $# -eq 0 ]; then
+ usage
+ else
+ if [[ "$1" = "smoke" ]]; then
+ run_smoke_check
+ elif [[ "$1" = "nonaligned" ]]; then
+ run_nonaligned_check
+ elif [[ "$1" = "aligned" ]]; then
+ run_aligned_check
+ else
+ run_manual_check $@
+ fi
+ fi
+
+ check_test_failed_prefix
+
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
new file mode 100755
index 000000000000..d73b846736f1
--- /dev/null
+++ b/tools/testing/selftests/mm/test_vmalloc.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+# a) analyse performance of vmalloc allocations;
+# b) stressing and stability check of vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stressing and smoke tests.
+# Also it is possible to pass any supported parameters manualy.
+#
+PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
+
+check_test_requirements()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo "$0: Must be run as root"
+ exit $ksft_skip
+ fi
+
+ if ! which modprobe > /dev/null 2>&1; then
+ echo "$0: You need modprobe installed"
+ exit $ksft_skip
+ fi
+
+ if ! modinfo $DRIVER > /dev/null 2>&1; then
+ echo "$0: You must have the following enabled in your kernel:"
+ echo "CONFIG_TEST_VMALLOC=m"
+ exit $ksft_skip
+ fi
+}
+
+run_perfformance_check()
+{
+ echo "Run performance tests to evaluate how fast vmalloc allocation is."
+ echo "It runs all test cases on one single CPU with sequential order."
+
+ modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Ccheck the kernel message buffer to see the summary."
+}
+
+run_stability_check()
+{
+ echo "Run stability tests. In order to stress vmalloc subsystem all"
+ echo "available test cases are run by NUM_CPUS workers simultaneously."
+ echo "It will take time, so be patient."
+
+ modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+run_smoke_check()
+{
+ echo "Run smoke test. Note, this test provides basic coverage."
+ echo "Please check $0 output how it can be used"
+ echo "for deep performance analysis as well as stress testing."
+
+ modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+ echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
+ echo "manual parameters"
+ echo
+ echo "Valid tests and parameters:"
+ echo
+ modinfo $DRIVER
+ echo
+ echo "Example usage:"
+ echo
+ echo "# Shows help message"
+ echo "./${DRIVER}.sh"
+ echo
+ echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers"
+ echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
+ echo
+ echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
+ echo "sequential order"
+ echo -n "./${DRIVER}.sh sequential_test_order=1 "
+ echo "run_test_mask=23"
+ echo
+ echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats "
+ echo "20 times"
+ echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
+ echo
+ echo "# Performance analysis"
+ echo "./${DRIVER}.sh performance"
+ echo
+ echo "# Stress testing"
+ echo "./${DRIVER}.sh stress"
+ echo
+ exit 0
+}
+
+function validate_passed_args()
+{
+ VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+ #
+ # Something has been passed, check it.
+ #
+ for passed_arg in $@; do
+ key=${passed_arg//=*/}
+ val="${passed_arg:$((${#key}+1))}"
+ valid=0
+
+ for valid_arg in $VALID_ARGS; do
+ if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
+ valid=1
+ break
+ fi
+ done
+
+ if [[ $valid -ne 1 ]]; then
+ echo "Error: key or value is not correct: ${key} $val"
+ exit $exitcode
+ fi
+ done
+}
+
+function run_manual_check()
+{
+ #
+ # Validate passed parameters. If there is wrong one,
+ # the script exists and does not execute further.
+ #
+ validate_passed_args $@
+
+ echo "Run the test with following parameters: $@"
+ modprobe $DRIVER $@ > /dev/null 2>&1
+ echo "Done."
+ echo "Check the kernel ring buffer to see the summary."
+}
+
+function run_test()
+{
+ if [ $# -eq 0 ]; then
+ usage
+ else
+ if [[ "$1" = "performance" ]]; then
+ run_perfformance_check
+ elif [[ "$1" = "stress" ]]; then
+ run_stability_check
+ elif [[ "$1" = "smoke" ]]; then
+ run_smoke_check
+ else
+ run_manual_check $@
+ fi
+ fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c
new file mode 100644
index 000000000000..ad872af1c81a
--- /dev/null
+++ b/tools/testing/selftests/mm/thp_settings.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "thp_settings.h"
+
+#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
+#define MAX_SETTINGS_DEPTH 4
+static struct thp_settings settings_stack[MAX_SETTINGS_DEPTH];
+static int settings_index;
+static struct thp_settings saved_settings;
+static char dev_queue_read_ahead_path[PATH_MAX];
+
+static const char * const thp_enabled_strings[] = {
+ "never",
+ "always",
+ "inherit",
+ "madvise",
+ NULL
+};
+
+static const char * const thp_defrag_strings[] = {
+ "always",
+ "defer",
+ "defer+madvise",
+ "madvise",
+ "never",
+ NULL
+};
+
+static const char * const shmem_enabled_strings[] = {
+ "never",
+ "always",
+ "within_size",
+ "advise",
+ "inherit",
+ "deny",
+ "force",
+ NULL
+};
+
+int read_file(const char *path, char *buf, size_t buflen)
+{
+ int fd;
+ ssize_t numread;
+
+ fd = open(path, O_RDONLY);
+ if (fd == -1)
+ return 0;
+
+ numread = read(fd, buf, buflen - 1);
+ if (numread < 1) {
+ close(fd);
+ return 0;
+ }
+
+ buf[numread] = '\0';
+ close(fd);
+
+ return (unsigned int) numread;
+}
+
+int write_file(const char *path, const char *buf, size_t buflen)
+{
+ int fd;
+ ssize_t numwritten;
+
+ fd = open(path, O_WRONLY);
+ if (fd == -1) {
+ printf("open(%s)\n", path);
+ exit(EXIT_FAILURE);
+ return 0;
+ }
+
+ numwritten = write(fd, buf, buflen - 1);
+ close(fd);
+ if (numwritten < 1) {
+ printf("write(%s)\n", buf);
+ exit(EXIT_FAILURE);
+ return 0;
+ }
+
+ return (unsigned int) numwritten;
+}
+
+unsigned long read_num(const char *path)
+{
+ char buf[21];
+
+ if (read_file(path, buf, sizeof(buf)) < 0) {
+ perror("read_file()");
+ exit(EXIT_FAILURE);
+ }
+
+ return strtoul(buf, NULL, 10);
+}
+
+void write_num(const char *path, unsigned long num)
+{
+ char buf[21];
+
+ sprintf(buf, "%ld", num);
+ if (!write_file(path, buf, strlen(buf) + 1)) {
+ perror(path);
+ exit(EXIT_FAILURE);
+ }
+}
+
+int thp_read_string(const char *name, const char * const strings[])
+{
+ char path[PATH_MAX];
+ char buf[256];
+ char *c;
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!read_file(path, buf, sizeof(buf))) {
+ perror(path);
+ exit(EXIT_FAILURE);
+ }
+
+ c = strchr(buf, '[');
+ if (!c) {
+ printf("%s: Parse failure\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ c++;
+ memmove(buf, c, sizeof(buf) - (c - buf));
+
+ c = strchr(buf, ']');
+ if (!c) {
+ printf("%s: Parse failure\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ *c = '\0';
+
+ ret = 0;
+ while (strings[ret]) {
+ if (!strcmp(strings[ret], buf))
+ return ret;
+ ret++;
+ }
+
+ printf("Failed to parse %s\n", name);
+ exit(EXIT_FAILURE);
+}
+
+void thp_write_string(const char *name, const char *val)
+{
+ char path[PATH_MAX];
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!write_file(path, val, strlen(val) + 1)) {
+ perror(path);
+ exit(EXIT_FAILURE);
+ }
+}
+
+unsigned long thp_read_num(const char *name)
+{
+ char path[PATH_MAX];
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ return read_num(path);
+}
+
+void thp_write_num(const char *name, unsigned long num)
+{
+ char path[PATH_MAX];
+ int ret;
+
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+ write_num(path, num);
+}
+
+void thp_read_settings(struct thp_settings *settings)
+{
+ unsigned long orders = thp_supported_orders();
+ unsigned long shmem_orders = thp_shmem_supported_orders();
+ char path[PATH_MAX];
+ int i;
+
+ *settings = (struct thp_settings) {
+ .thp_enabled = thp_read_string("enabled", thp_enabled_strings),
+ .thp_defrag = thp_read_string("defrag", thp_defrag_strings),
+ .shmem_enabled =
+ thp_read_string("shmem_enabled", shmem_enabled_strings),
+ .use_zero_page = thp_read_num("use_zero_page"),
+ };
+ settings->khugepaged = (struct khugepaged_settings) {
+ .defrag = thp_read_num("khugepaged/defrag"),
+ .alloc_sleep_millisecs =
+ thp_read_num("khugepaged/alloc_sleep_millisecs"),
+ .scan_sleep_millisecs =
+ thp_read_num("khugepaged/scan_sleep_millisecs"),
+ .max_ptes_none = thp_read_num("khugepaged/max_ptes_none"),
+ .max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"),
+ .max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"),
+ .pages_to_scan = thp_read_num("khugepaged/pages_to_scan"),
+ };
+ if (dev_queue_read_ahead_path[0])
+ settings->read_ahead_kb = read_num(dev_queue_read_ahead_path);
+
+ for (i = 0; i < NR_ORDERS; i++) {
+ if (!((1 << i) & orders)) {
+ settings->hugepages[i].enabled = THP_NEVER;
+ continue;
+ }
+ snprintf(path, PATH_MAX, "hugepages-%ukB/enabled",
+ (getpagesize() >> 10) << i);
+ settings->hugepages[i].enabled =
+ thp_read_string(path, thp_enabled_strings);
+ }
+
+ for (i = 0; i < NR_ORDERS; i++) {
+ if (!((1 << i) & shmem_orders)) {
+ settings->shmem_hugepages[i].enabled = SHMEM_NEVER;
+ continue;
+ }
+ snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled",
+ (getpagesize() >> 10) << i);
+ settings->shmem_hugepages[i].enabled =
+ thp_read_string(path, shmem_enabled_strings);
+ }
+}
+
+void thp_write_settings(struct thp_settings *settings)
+{
+ struct khugepaged_settings *khugepaged = &settings->khugepaged;
+ unsigned long orders = thp_supported_orders();
+ unsigned long shmem_orders = thp_shmem_supported_orders();
+ char path[PATH_MAX];
+ int enabled;
+ int i;
+
+ thp_write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
+ thp_write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
+ thp_write_string("shmem_enabled",
+ shmem_enabled_strings[settings->shmem_enabled]);
+ thp_write_num("use_zero_page", settings->use_zero_page);
+
+ thp_write_num("khugepaged/defrag", khugepaged->defrag);
+ thp_write_num("khugepaged/alloc_sleep_millisecs",
+ khugepaged->alloc_sleep_millisecs);
+ thp_write_num("khugepaged/scan_sleep_millisecs",
+ khugepaged->scan_sleep_millisecs);
+ thp_write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
+ thp_write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+ thp_write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
+ thp_write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
+
+ if (dev_queue_read_ahead_path[0])
+ write_num(dev_queue_read_ahead_path, settings->read_ahead_kb);
+
+ for (i = 0; i < NR_ORDERS; i++) {
+ if (!((1 << i) & orders))
+ continue;
+ snprintf(path, PATH_MAX, "hugepages-%ukB/enabled",
+ (getpagesize() >> 10) << i);
+ enabled = settings->hugepages[i].enabled;
+ thp_write_string(path, thp_enabled_strings[enabled]);
+ }
+
+ for (i = 0; i < NR_ORDERS; i++) {
+ if (!((1 << i) & shmem_orders))
+ continue;
+ snprintf(path, PATH_MAX, "hugepages-%ukB/shmem_enabled",
+ (getpagesize() >> 10) << i);
+ enabled = settings->shmem_hugepages[i].enabled;
+ thp_write_string(path, shmem_enabled_strings[enabled]);
+ }
+}
+
+struct thp_settings *thp_current_settings(void)
+{
+ if (!settings_index) {
+ printf("Fail: No settings set");
+ exit(EXIT_FAILURE);
+ }
+ return settings_stack + settings_index - 1;
+}
+
+void thp_push_settings(struct thp_settings *settings)
+{
+ if (settings_index >= MAX_SETTINGS_DEPTH) {
+ printf("Fail: Settings stack exceeded");
+ exit(EXIT_FAILURE);
+ }
+ settings_stack[settings_index++] = *settings;
+ thp_write_settings(thp_current_settings());
+}
+
+void thp_pop_settings(void)
+{
+ if (settings_index <= 0) {
+ printf("Fail: Settings stack empty");
+ exit(EXIT_FAILURE);
+ }
+ --settings_index;
+ thp_write_settings(thp_current_settings());
+}
+
+void thp_restore_settings(void)
+{
+ thp_write_settings(&saved_settings);
+}
+
+void thp_save_settings(void)
+{
+ thp_read_settings(&saved_settings);
+}
+
+void thp_set_read_ahead_path(char *path)
+{
+ if (!path) {
+ dev_queue_read_ahead_path[0] = '\0';
+ return;
+ }
+
+ strncpy(dev_queue_read_ahead_path, path,
+ sizeof(dev_queue_read_ahead_path));
+ dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0';
+}
+
+static unsigned long __thp_supported_orders(bool is_shmem)
+{
+ unsigned long orders = 0;
+ char path[PATH_MAX];
+ char buf[256];
+ int ret, i;
+ char anon_dir[] = "enabled";
+ char shmem_dir[] = "shmem_enabled";
+
+ for (i = 0; i < NR_ORDERS; i++) {
+ ret = snprintf(path, PATH_MAX, THP_SYSFS "hugepages-%ukB/%s",
+ (getpagesize() >> 10) << i, is_shmem ? shmem_dir : anon_dir);
+ if (ret >= PATH_MAX) {
+ printf("%s: Pathname is too long\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ ret = read_file(path, buf, sizeof(buf));
+ if (ret)
+ orders |= 1UL << i;
+ }
+
+ return orders;
+}
+
+unsigned long thp_supported_orders(void)
+{
+ return __thp_supported_orders(false);
+}
+
+unsigned long thp_shmem_supported_orders(void)
+{
+ return __thp_supported_orders(true);
+}
diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h
new file mode 100644
index 000000000000..fc131d23d593
--- /dev/null
+++ b/tools/testing/selftests/mm/thp_settings.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __THP_SETTINGS_H__
+#define __THP_SETTINGS_H__
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+enum thp_enabled {
+ THP_NEVER,
+ THP_ALWAYS,
+ THP_INHERIT,
+ THP_MADVISE,
+};
+
+enum thp_defrag {
+ THP_DEFRAG_ALWAYS,
+ THP_DEFRAG_DEFER,
+ THP_DEFRAG_DEFER_MADVISE,
+ THP_DEFRAG_MADVISE,
+ THP_DEFRAG_NEVER,
+};
+
+enum shmem_enabled {
+ SHMEM_NEVER,
+ SHMEM_ALWAYS,
+ SHMEM_WITHIN_SIZE,
+ SHMEM_ADVISE,
+ SHMEM_INHERIT,
+ SHMEM_DENY,
+ SHMEM_FORCE,
+};
+
+#define NR_ORDERS 20
+
+struct hugepages_settings {
+ enum thp_enabled enabled;
+};
+
+struct khugepaged_settings {
+ bool defrag;
+ unsigned int alloc_sleep_millisecs;
+ unsigned int scan_sleep_millisecs;
+ unsigned int max_ptes_none;
+ unsigned int max_ptes_swap;
+ unsigned int max_ptes_shared;
+ unsigned long pages_to_scan;
+};
+
+struct shmem_hugepages_settings {
+ enum shmem_enabled enabled;
+};
+
+struct thp_settings {
+ enum thp_enabled thp_enabled;
+ enum thp_defrag thp_defrag;
+ enum shmem_enabled shmem_enabled;
+ bool use_zero_page;
+ struct khugepaged_settings khugepaged;
+ unsigned long read_ahead_kb;
+ struct hugepages_settings hugepages[NR_ORDERS];
+ struct shmem_hugepages_settings shmem_hugepages[NR_ORDERS];
+};
+
+int read_file(const char *path, char *buf, size_t buflen);
+int write_file(const char *path, const char *buf, size_t buflen);
+unsigned long read_num(const char *path);
+void write_num(const char *path, unsigned long num);
+
+int thp_read_string(const char *name, const char * const strings[]);
+void thp_write_string(const char *name, const char *val);
+unsigned long thp_read_num(const char *name);
+void thp_write_num(const char *name, unsigned long num);
+
+void thp_write_settings(struct thp_settings *settings);
+void thp_read_settings(struct thp_settings *settings);
+struct thp_settings *thp_current_settings(void);
+void thp_push_settings(struct thp_settings *settings);
+void thp_pop_settings(void);
+void thp_restore_settings(void);
+void thp_save_settings(void);
+
+void thp_set_read_ahead_path(char *path);
+unsigned long thp_supported_orders(void);
+unsigned long thp_shmem_supported_orders(void);
+
+#endif /* __THP_SETTINGS_H__ */
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
new file mode 100644
index 000000000000..95b6f043a3cb
--- /dev/null
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test selecting other page sizes for mmap/shmget.
+
+ Before running this huge pages for each huge page size must have been
+ reserved.
+ For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must
+ be used. 1GB wouldn't be tested if it isn't available.
+ Also shmmax must be increased.
+ And you need to run as root to work around some weird permissions in shm.
+ And nothing using huge pages should run in parallel.
+ When the program aborts you may need to clean up the shm segments with
+ ipcrm -m by hand, like this
+ sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
+ (warning this will remove all if someone else uses them) */
+
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB 0x40000
+#endif
+
+#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
+#ifndef SHM_HUGE_SHIFT
+#define SHM_HUGE_SHIFT 26
+#endif
+#ifndef SHM_HUGE_MASK
+#define SHM_HUGE_MASK 0x3f
+#endif
+#ifndef SHM_HUGE_2MB
+#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
+#endif
+#ifndef SHM_HUGE_1GB
+#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
+#endif
+
+#define NUM_PAGESIZES 5
+#define NUM_PAGES 4
+
+unsigned long page_sizes[NUM_PAGESIZES];
+int num_page_sizes;
+
+int ilog2(unsigned long v)
+{
+ int l = 0;
+ while ((1UL << l) < v)
+ l++;
+ return l;
+}
+
+void show(unsigned long ps)
+{
+ char buf[100];
+
+ if (ps == getpagesize())
+ return;
+
+ ksft_print_msg("%luMB: ", ps >> 20);
+
+ fflush(stdout);
+ snprintf(buf, sizeof buf,
+ "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+ ps >> 10);
+ system(buf);
+}
+
+unsigned long thuge_read_sysfs(int warn, char *fmt, ...)
+{
+ char *line = NULL;
+ size_t linelen = 0;
+ char buf[100];
+ FILE *f;
+ va_list ap;
+ unsigned long val = 0;
+
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof buf, fmt, ap);
+ va_end(ap);
+
+ f = fopen(buf, "r");
+ if (!f) {
+ if (warn)
+ ksft_print_msg("missing %s\n", buf);
+ return 0;
+ }
+ if (getline(&line, &linelen, f) > 0) {
+ sscanf(line, "%lu", &val);
+ }
+ fclose(f);
+ free(line);
+ return val;
+}
+
+unsigned long read_free(unsigned long ps)
+{
+ return thuge_read_sysfs(ps != getpagesize(),
+ "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+ ps >> 10);
+}
+
+void test_mmap(unsigned long size, unsigned flags)
+{
+ char *map;
+ unsigned long before, after;
+
+ before = read_free(size);
+ map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("mmap: %s\n", strerror(errno));
+
+ memset(map, 0xff, size*NUM_PAGES);
+ after = read_free(size);
+
+ show(size);
+ ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
+ "%s mmap %lu %x\n", __func__, size, flags);
+
+ if (munmap(map, size * NUM_PAGES))
+ ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno));
+}
+
+void test_shmget(unsigned long size, unsigned flags)
+{
+ int id;
+ unsigned long before, after;
+ struct shm_info i;
+ char *map;
+
+ before = read_free(size);
+ id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
+ if (id < 0) {
+ if (errno == EPERM) {
+ ksft_test_result_skip("shmget requires root privileges: %s\n",
+ strerror(errno));
+ return;
+ }
+ ksft_exit_fail_msg("shmget: %s\n", strerror(errno));
+ }
+
+ if (shmctl(id, SHM_INFO, (void *)&i) < 0)
+ ksft_exit_fail_msg("shmctl: %s\n", strerror(errno));
+
+ map = shmat(id, NULL, 0600);
+ if (map == MAP_FAILED)
+ ksft_exit_fail_msg("shmat: %s\n", strerror(errno));
+
+ shmctl(id, IPC_RMID, NULL);
+
+ memset(map, 0xff, size*NUM_PAGES);
+ after = read_free(size);
+
+ show(size);
+ ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
+ "%s: mmap %lu %x\n", __func__, size, flags);
+ if (shmdt(map))
+ ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno));
+}
+
+void find_pagesizes(void)
+{
+ unsigned long largest = getpagesize();
+ int i;
+ glob_t g;
+
+ glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+ assert(g.gl_pathc <= NUM_PAGESIZES);
+ for (i = 0; (i < g.gl_pathc) && (num_page_sizes < NUM_PAGESIZES); i++) {
+ sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+ &page_sizes[num_page_sizes]);
+ page_sizes[num_page_sizes] <<= 10;
+ ksft_print_msg("Found %luMB\n", page_sizes[i] >> 20);
+
+ if (page_sizes[num_page_sizes] > largest)
+ largest = page_sizes[i];
+
+ if (read_free(page_sizes[num_page_sizes]) >= NUM_PAGES)
+ num_page_sizes++;
+ else
+ ksft_print_msg("SKIP for size %lu MB as not enough huge pages, need %u\n",
+ page_sizes[num_page_sizes] >> 20, NUM_PAGES);
+ }
+ globfree(&g);
+
+ if (thuge_read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest)
+ ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax",
+ largest * NUM_PAGES);
+
+#if defined(__x86_64__)
+ if (largest != 1U<<30) {
+ ksft_exit_fail_msg("No GB pages available on x86-64\n"
+ "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
+ }
+#endif
+}
+
+int main(void)
+{
+ unsigned default_hps = default_huge_page_size();
+ int i;
+
+ ksft_print_header();
+
+ find_pagesizes();
+
+ if (!num_page_sizes)
+ ksft_finished();
+
+ ksft_set_plan(2 * num_page_sizes + 3);
+
+ for (i = 0; i < num_page_sizes; i++) {
+ unsigned long ps = page_sizes[i];
+ int arg = ilog2(ps) << MAP_HUGE_SHIFT;
+
+ ksft_print_msg("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+ test_mmap(ps, MAP_HUGETLB | arg);
+ }
+
+ ksft_print_msg("Testing default huge mmap\n");
+ test_mmap(default_hps, MAP_HUGETLB);
+
+ ksft_print_msg("Testing non-huge shmget\n");
+ test_shmget(getpagesize(), 0);
+
+ for (i = 0; i < num_page_sizes; i++) {
+ unsigned long ps = page_sizes[i];
+ int arg = ilog2(ps) << SHM_HUGE_SHIFT;
+ ksft_print_msg("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+ test_shmget(ps, SHM_HUGETLB | arg);
+ }
+
+ ksft_print_msg("default huge shmget\n");
+ test_shmget(default_hps, SHM_HUGETLB);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
new file mode 100644
index 000000000000..68201192e37c
--- /dev/null
+++ b/tools/testing/selftests/mm/transhuge-stress.c
@@ -0,0 +1,138 @@
+/*
+ * Stress test for transparent huge pages, memory compaction and migration.
+ *
+ * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+int backing_fd = -1;
+int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
+#define PROT_RW (PROT_READ | PROT_WRITE)
+
+int main(int argc, char **argv)
+{
+ size_t ram, len;
+ void *ptr, *p;
+ struct timespec start, a, b;
+ int i = 0;
+ char *name = NULL;
+ double s;
+ uint8_t *map;
+ size_t map_len;
+ int pagemap_fd;
+ int duration = 0;
+
+ ksft_print_header();
+
+ ram = sysconf(_SC_PHYS_PAGES);
+ if (ram > SIZE_MAX / psize() / 4)
+ ram = SIZE_MAX / 4;
+ else
+ ram *= psize();
+ len = ram;
+
+ while (++i < argc) {
+ if (!strcmp(argv[i], "-h"))
+ ksft_exit_fail_msg("usage: %s [-f <filename>] [-d <duration>] [size in MiB]\n",
+ argv[0]);
+ else if (!strcmp(argv[i], "-f"))
+ name = argv[++i];
+ else if (!strcmp(argv[i], "-d"))
+ duration = atoi(argv[++i]);
+ else
+ len = atoll(argv[i]) << 20;
+ }
+
+ ksft_set_plan(1);
+
+ if (name) {
+ backing_fd = open(name, O_RDWR);
+ if (backing_fd == -1)
+ ksft_exit_fail_msg("open %s\n", name);
+ mmap_flags = MAP_SHARED;
+ }
+
+ warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
+ " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
+ ram >> (20 + HPAGE_SHIFT - pshift() - 1));
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("open pagemap\n");
+
+ len -= len % HPAGE_SIZE;
+ ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
+ if (ptr == MAP_FAILED)
+ ksft_exit_fail_msg("initial mmap");
+ ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
+
+ if (madvise(ptr, len, MADV_HUGEPAGE))
+ ksft_exit_fail_msg("MADV_HUGEPAGE");
+
+ map_len = ram >> (HPAGE_SHIFT - 1);
+ map = malloc(map_len);
+ if (!map)
+ ksft_exit_fail_msg("map malloc\n");
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+
+ while (1) {
+ int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
+
+ memset(map, 0, map_len);
+
+ clock_gettime(CLOCK_MONOTONIC, &a);
+ for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
+ int64_t pfn;
+
+ pfn = allocate_transhuge(p, pagemap_fd);
+
+ if (pfn < 0) {
+ nr_failed++;
+ } else {
+ size_t idx = pfn >> (HPAGE_SHIFT - pshift());
+
+ nr_succeed++;
+ if (idx >= map_len) {
+ map = realloc(map, idx + 1);
+ if (!map)
+ ksft_exit_fail_msg("map realloc\n");
+ memset(map + map_len, 0, idx + 1 - map_len);
+ map_len = idx + 1;
+ }
+ if (!map[idx])
+ nr_pages++;
+ map[idx] = 1;
+ }
+
+ /* split transhuge page, keep last page */
+ if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED))
+ ksft_exit_fail_msg("MADV_DONTNEED");
+ }
+ clock_gettime(CLOCK_MONOTONIC, &b);
+ s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
+
+ ksft_print_msg("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+ "%4d succeed, %4d failed, %4d different pages\n",
+ s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+ nr_succeed, nr_failed, nr_pages);
+
+ if (duration > 0 && b.tv_sec - start.tv_sec >= duration) {
+ ksft_test_result_pass("Completed\n");
+ ksft_finished();
+ }
+ }
+}
diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c
new file mode 100644
index 000000000000..a37088a23ffe
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-common.c
@@ -0,0 +1,718 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Userfaultfd tests util functions
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+
+#include "uffd-common.h"
+
+#define BASE_PMD_ADDR ((void *)(1UL << 30))
+
+volatile bool test_uffdio_copy_eexist = true;
+unsigned long nr_parallel, nr_pages, nr_pages_per_cpu, page_size;
+char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+int uffd = -1, uffd_flags, finished, *pipefd, test_type;
+bool map_shared;
+bool test_uffdio_wp = true;
+unsigned long long *count_verify;
+uffd_test_ops_t *uffd_test_ops;
+uffd_test_case_ops_t *uffd_test_case_ops;
+atomic_bool ready_for_fork;
+
+static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
+{
+ unsigned int memfd_flags = 0;
+ int mem_fd;
+
+ if (hugetlb)
+ memfd_flags = MFD_HUGETLB;
+ mem_fd = memfd_create("uffd-test", memfd_flags);
+ if (mem_fd < 0)
+ err("memfd_create");
+ if (ftruncate(mem_fd, mem_size))
+ err("ftruncate");
+ if (fallocate(mem_fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+ mem_size))
+ err("fallocate");
+
+ return mem_fd;
+}
+
+static void anon_release_pages(char *rel_area)
+{
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+}
+
+static int anon_allocate_area(void **alloc_area, bool is_src)
+{
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (*alloc_area == MAP_FAILED) {
+ *alloc_area = NULL;
+ return -errno;
+ }
+ return 0;
+}
+
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
+
+static void hugetlb_release_pages(char *rel_area)
+{
+ if (!map_shared) {
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+ } else {
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+ err("madvise(MADV_REMOVE) failed");
+ }
+}
+
+static int hugetlb_allocate_area(void **alloc_area, bool is_src)
+{
+ off_t size = nr_pages * page_size;
+ off_t offset = is_src ? 0 : size;
+ void *area_alias = NULL;
+ char **alloc_area_alias;
+ int mem_fd = uffd_mem_fd_create(size * 2, true);
+
+ *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+ (is_src ? 0 : MAP_NORESERVE),
+ mem_fd, offset);
+ if (*alloc_area == MAP_FAILED) {
+ *alloc_area = NULL;
+ return -errno;
+ }
+
+ if (map_shared) {
+ area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, mem_fd, offset);
+ if (area_alias == MAP_FAILED)
+ return -errno;
+ }
+
+ if (is_src) {
+ alloc_area_alias = &area_src_alias;
+ } else {
+ alloc_area_alias = &area_dst_alias;
+ }
+ if (area_alias)
+ *alloc_area_alias = area_alias;
+
+ close(mem_fd);
+ return 0;
+}
+
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ if (!map_shared)
+ return;
+
+ *start = (unsigned long) area_dst_alias + offset;
+}
+
+static void shmem_release_pages(char *rel_area)
+{
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+ err("madvise(MADV_REMOVE) failed");
+}
+
+static int shmem_allocate_area(void **alloc_area, bool is_src)
+{
+ void *area_alias = NULL;
+ size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
+ unsigned long offset = is_src ? 0 : bytes;
+ char *p = NULL, *p_alias = NULL;
+ int mem_fd = uffd_mem_fd_create(bytes * 2, false);
+
+ /* TODO: clean this up. Use a static addr is ugly */
+ p = BASE_PMD_ADDR;
+ if (!is_src)
+ /* src map + alias + interleaved hpages */
+ p += 2 * (bytes + hpage_size);
+ p_alias = p;
+ p_alias += bytes;
+ p_alias += hpage_size; /* Prevent src/dst VMA merge */
+
+ *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_fd, offset);
+ if (*alloc_area == MAP_FAILED) {
+ *alloc_area = NULL;
+ return -errno;
+ }
+ if (*alloc_area != p)
+ err("mmap of memfd failed at %p", p);
+
+ area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_fd, offset);
+ if (area_alias == MAP_FAILED) {
+ munmap(*alloc_area, bytes);
+ *alloc_area = NULL;
+ return -errno;
+ }
+ if (area_alias != p_alias)
+ err("mmap of anonymous memory failed at %p", p_alias);
+
+ if (is_src)
+ area_src_alias = area_alias;
+ else
+ area_dst_alias = area_alias;
+
+ close(mem_fd);
+ return 0;
+}
+
+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ *start = (unsigned long)area_dst_alias + offset;
+}
+
+static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
+{
+ if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
+ read_pmd_pagesize()))
+ err("Did not find expected %d number of hugepages",
+ expect_nr_hpages);
+}
+
+struct uffd_test_ops anon_uffd_test_ops = {
+ .allocate_area = anon_allocate_area,
+ .release_pages = anon_release_pages,
+ .alias_mapping = noop_alias_mapping,
+ .check_pmd_mapping = NULL,
+};
+
+struct uffd_test_ops shmem_uffd_test_ops = {
+ .allocate_area = shmem_allocate_area,
+ .release_pages = shmem_release_pages,
+ .alias_mapping = shmem_alias_mapping,
+ .check_pmd_mapping = shmem_check_pmd_mapping,
+};
+
+struct uffd_test_ops hugetlb_uffd_test_ops = {
+ .allocate_area = hugetlb_allocate_area,
+ .release_pages = hugetlb_release_pages,
+ .alias_mapping = hugetlb_alias_mapping,
+ .check_pmd_mapping = NULL,
+};
+
+void uffd_stats_report(struct uffd_args *args, int n_cpus)
+{
+ int i;
+ unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
+
+ for (i = 0; i < n_cpus; i++) {
+ miss_total += args[i].missing_faults;
+ wp_total += args[i].wp_faults;
+ minor_total += args[i].minor_faults;
+ }
+
+ printf("userfaults: ");
+ if (miss_total) {
+ printf("%llu missing (", miss_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", args[i].missing_faults);
+ printf("\b) ");
+ }
+ if (wp_total) {
+ printf("%llu wp (", wp_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", args[i].wp_faults);
+ printf("\b) ");
+ }
+ if (minor_total) {
+ printf("%llu minor (", minor_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", args[i].minor_faults);
+ printf("\b)");
+ }
+ printf("\n");
+}
+
+int userfaultfd_open(uint64_t *features)
+{
+ struct uffdio_api uffdio_api;
+
+ uffd = uffd_open(UFFD_FLAGS);
+ if (uffd < 0)
+ return -1;
+ uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = *features;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api))
+ /* Probably lack of CAP_PTRACE? */
+ return -1;
+ if (uffdio_api.api != UFFD_API)
+ err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
+
+ *features = uffdio_api.features;
+ return 0;
+}
+
+static inline void munmap_area(void **area)
+{
+ if (*area)
+ if (munmap(*area, nr_pages * page_size))
+ err("munmap");
+
+ *area = NULL;
+}
+
+void uffd_test_ctx_clear(void)
+{
+ size_t i;
+
+ if (pipefd) {
+ for (i = 0; i < nr_parallel * 2; ++i) {
+ if (close(pipefd[i]))
+ err("close pipefd");
+ }
+ free(pipefd);
+ pipefd = NULL;
+ }
+
+ if (count_verify) {
+ free(count_verify);
+ count_verify = NULL;
+ }
+
+ if (uffd != -1) {
+ if (close(uffd))
+ err("close uffd");
+ uffd = -1;
+ }
+
+ munmap_area((void **)&area_src);
+ munmap_area((void **)&area_src_alias);
+ munmap_area((void **)&area_dst);
+ munmap_area((void **)&area_dst_alias);
+ munmap_area((void **)&area_remap);
+}
+
+int uffd_test_ctx_init(uint64_t features, const char **errmsg)
+{
+ unsigned long nr, cpu;
+ int ret;
+
+ if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
+ ret = uffd_test_case_ops->pre_alloc(errmsg);
+ if (ret)
+ return ret;
+ }
+
+ ret = uffd_test_ops->allocate_area((void **)&area_src, true);
+ ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
+ if (ret) {
+ if (errmsg)
+ *errmsg = "memory allocation failed";
+ return ret;
+ }
+
+ if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
+ ret = uffd_test_case_ops->post_alloc(errmsg);
+ if (ret)
+ return ret;
+ }
+
+ ret = userfaultfd_open(&features);
+ if (ret) {
+ if (errmsg)
+ *errmsg = "possible lack of privilege";
+ return ret;
+ }
+
+ count_verify = malloc(nr_pages * sizeof(unsigned long long));
+ if (!count_verify)
+ err("count_verify");
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ *area_mutex(area_src, nr) =
+ (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+ count_verify[nr] = *area_count(area_src, nr) = 1;
+ /*
+ * In the transition between 255 to 256, powerpc will
+ * read out of order in my_bcmp and see both bytes as
+ * zero, so leave a placeholder below always non-zero
+ * after the count, to avoid my_bcmp to trigger false
+ * positives.
+ */
+ *(area_count(area_src, nr) + 1) = 1;
+ }
+
+ /*
+ * After initialization of area_src, we must explicitly release pages
+ * for area_dst to make sure it's fully empty. Otherwise we could have
+ * some area_dst pages be erroneously initialized with zero pages,
+ * hence we could hit memory corruption later in the test.
+ *
+ * One example is when THP is globally enabled, above allocate_area()
+ * calls could have the two areas merged into a single VMA (as they
+ * will have the same VMA flags so they're mergeable). When we
+ * initialize the area_src above, it's possible that some part of
+ * area_dst could have been faulted in via one huge THP that will be
+ * shared between area_src and area_dst. It could cause some of the
+ * area_dst won't be trapped by missing userfaults.
+ *
+ * This release_pages() will guarantee even if that happened, we'll
+ * proactively split the thp and drop any accidentally initialized
+ * pages within area_dst.
+ */
+ uffd_test_ops->release_pages(area_dst);
+
+ pipefd = malloc(sizeof(int) * nr_parallel * 2);
+ if (!pipefd)
+ err("pipefd");
+ for (cpu = 0; cpu < nr_parallel; cpu++)
+ if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
+ err("pipe");
+
+ return 0;
+}
+
+void wp_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+ struct uffdio_writeprotect prms;
+
+ /* Write protection page faults */
+ prms.range.start = start;
+ prms.range.len = len;
+ /* Undo write-protect, do wakeup after that */
+ prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
+
+ if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
+ err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
+}
+
+static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+ struct uffdio_continue req;
+ int ret;
+
+ req.range.start = start;
+ req.range.len = len;
+ req.mode = 0;
+ if (wp)
+ req.mode |= UFFDIO_CONTINUE_MODE_WP;
+
+ if (ioctl(ufd, UFFDIO_CONTINUE, &req))
+ err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
+ (uint64_t)start);
+
+ /*
+ * Error handling within the kernel for continue is subtly different
+ * from copy or zeropage, so it may be a source of bugs. Trigger an
+ * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
+ */
+ req.mapped = 0;
+ ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
+ if (ret >= 0 || req.mapped != -EEXIST)
+ err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
+ ret, (int64_t) req.mapped);
+}
+
+int uffd_read_msg(int ufd, struct uffd_msg *msg)
+{
+ int ret = read(uffd, msg, sizeof(*msg));
+
+ if (ret != sizeof(*msg)) {
+ if (ret < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ return 1;
+ err("blocking read error");
+ } else {
+ err("short read");
+ }
+ }
+
+ return 0;
+}
+
+void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
+{
+ unsigned long offset;
+
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ err("unexpected msg event %u", msg->event);
+
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+ /* Write protect page faults */
+ wp_range(uffd, msg->arg.pagefault.address, page_size, false);
+ args->wp_faults++;
+ } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
+ uint8_t *area;
+ int b;
+
+ /*
+ * Minor page faults
+ *
+ * To prove we can modify the original range for testing
+ * purposes, we're going to bit flip this range before
+ * continuing.
+ *
+ * Note that this requires all minor page fault tests operate on
+ * area_dst (non-UFFD-registered) and area_dst_alias
+ * (UFFD-registered).
+ */
+
+ area = (uint8_t *)(area_dst +
+ ((char *)msg->arg.pagefault.address -
+ area_dst_alias));
+ for (b = 0; b < page_size; ++b)
+ area[b] = ~area[b];
+ continue_range(uffd, msg->arg.pagefault.address, page_size,
+ args->apply_wp);
+ args->minor_faults++;
+ } else {
+ /*
+ * Missing page faults.
+ *
+ * Here we force a write check for each of the missing mode
+ * faults. It's guaranteed because the only threads that
+ * will trigger uffd faults are the locking threads, and
+ * their first instruction to touch the missing page will
+ * always be pthread_mutex_lock().
+ *
+ * Note that here we relied on an NPTL glibc impl detail to
+ * always read the lock type at the entry of the lock op
+ * (pthread_mutex_t.__data.__type, offset 0x10) before
+ * doing any locking operations to guarantee that. It's
+ * actually not good to rely on this impl detail because
+ * logically a pthread-compatible lib can implement the
+ * locks without types and we can fail when linking with
+ * them. However since we used to find bugs with this
+ * strict check we still keep it around. Hopefully this
+ * could be a good hint when it fails again. If one day
+ * it'll break on some other impl of glibc we'll revisit.
+ */
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ err("unexpected write fault");
+
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+
+ if (copy_page(uffd, offset, args->apply_wp))
+ args->missing_faults++;
+ }
+}
+
+void *uffd_poll_thread(void *arg)
+{
+ struct uffd_args *args = (struct uffd_args *)arg;
+ unsigned long cpu = args->cpu;
+ struct pollfd pollfd[2];
+ struct uffd_msg msg;
+ struct uffdio_register uffd_reg;
+ int ret;
+ char tmp_chr;
+
+ if (!args->handle_fault)
+ args->handle_fault = uffd_handle_page_fault;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd[cpu*2];
+ pollfd[1].events = POLLIN;
+
+ ready_for_fork = true;
+
+ for (;;) {
+ ret = poll(pollfd, 2, -1);
+ if (ret <= 0) {
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ err("poll error: %d", ret);
+ }
+ if (pollfd[1].revents) {
+ if (!(pollfd[1].revents & POLLIN))
+ err("pollfd[1].revents %d", pollfd[1].revents);
+ if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+ err("read pipefd error");
+ break;
+ }
+ if (!(pollfd[0].revents & POLLIN))
+ err("pollfd[0].revents %d", pollfd[0].revents);
+ if (uffd_read_msg(uffd, &msg))
+ continue;
+ switch (msg.event) {
+ default:
+ err("unexpected msg event %u\n", msg.event);
+ break;
+ case UFFD_EVENT_PAGEFAULT:
+ args->handle_fault(&msg, args);
+ break;
+ case UFFD_EVENT_FORK:
+ close(uffd);
+ uffd = msg.arg.fork.ufd;
+ pollfd[0].fd = uffd;
+ break;
+ case UFFD_EVENT_REMOVE:
+ uffd_reg.range.start = msg.arg.remove.start;
+ uffd_reg.range.len = msg.arg.remove.end -
+ msg.arg.remove.start;
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
+ err("remove failure");
+ break;
+ case UFFD_EVENT_REMAP:
+ area_remap = area_dst; /* save for later unmap */
+ area_dst = (char *)(unsigned long)msg.arg.remap.to;
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+ unsigned long offset)
+{
+ uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+ uffdio_copy->len,
+ offset);
+ if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy->copy != -EEXIST)
+ err("UFFDIO_COPY retry error: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ } else {
+ err("UFFDIO_COPY retry unexpected: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ }
+}
+
+static void wake_range(int ufd, unsigned long addr, unsigned long len)
+{
+ struct uffdio_range uffdio_wake;
+
+ uffdio_wake.start = addr;
+ uffdio_wake.len = len;
+
+ if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
+ fprintf(stderr, "error waking %lu\n",
+ addr), exit(1);
+}
+
+int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
+{
+ struct uffdio_copy uffdio_copy;
+
+ if (offset >= nr_pages * page_size)
+ err("unexpected offset %lu\n", offset);
+ uffdio_copy.dst = (unsigned long) area_dst + offset;
+ uffdio_copy.src = (unsigned long) area_src + offset;
+ uffdio_copy.len = page_size;
+ if (wp)
+ uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
+ else
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy.copy != -EEXIST)
+ err("UFFDIO_COPY error: %"PRId64,
+ (int64_t)uffdio_copy.copy);
+ wake_range(ufd, uffdio_copy.dst, page_size);
+ } else if (uffdio_copy.copy != page_size) {
+ err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
+ } else {
+ if (test_uffdio_copy_eexist && retry) {
+ test_uffdio_copy_eexist = false;
+ retry_copy_page(ufd, &uffdio_copy, offset);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int copy_page(int ufd, unsigned long offset, bool wp)
+{
+ return __copy_page(ufd, offset, false, wp);
+}
+
+int move_page(int ufd, unsigned long offset, unsigned long len)
+{
+ struct uffdio_move uffdio_move;
+
+ if (offset + len > nr_pages * page_size)
+ err("unexpected offset %lu and length %lu\n", offset, len);
+ uffdio_move.dst = (unsigned long) area_dst + offset;
+ uffdio_move.src = (unsigned long) area_src + offset;
+ uffdio_move.len = len;
+ uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
+ uffdio_move.move = 0;
+ if (ioctl(ufd, UFFDIO_MOVE, &uffdio_move)) {
+ /* real retval in uffdio_move.move */
+ if (uffdio_move.move != -EEXIST)
+ err("UFFDIO_MOVE error: %"PRId64,
+ (int64_t)uffdio_move.move);
+ wake_range(ufd, uffdio_move.dst, len);
+ } else if (uffdio_move.move != len) {
+ err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);
+ } else
+ return 1;
+ return 0;
+}
+
+int uffd_open_dev(unsigned int flags)
+{
+ int fd, uffd;
+
+ fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+ uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
+ close(fd);
+
+ return uffd;
+}
+
+int uffd_open_sys(unsigned int flags)
+{
+#ifdef __NR_userfaultfd
+ return syscall(__NR_userfaultfd, flags);
+#else
+ return -1;
+#endif
+}
+
+int uffd_open(unsigned int flags)
+{
+ int uffd = uffd_open_sys(flags);
+
+ if (uffd < 0)
+ uffd = uffd_open_dev(flags);
+
+ return uffd;
+}
+
+int uffd_get_features(uint64_t *features)
+{
+ struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
+ /*
+ * This should by default work in most kernels; the feature list
+ * will be the same no matter what we pass in here.
+ */
+ int fd = uffd_open(UFFD_USER_MODE_ONLY);
+
+ if (fd < 0)
+ /* Maybe the kernel is older than user-only mode? */
+ fd = uffd_open(0);
+
+ if (fd < 0)
+ return fd;
+
+ if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
+ close(fd);
+ return -errno;
+ }
+
+ *features = uffdio_api.features;
+ close(fd);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h
new file mode 100644
index 000000000000..7700cbfa3975
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-common.h
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Userfaultfd tests common header
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+#ifndef __UFFD_COMMON_H__
+#define __UFFD_COMMON_H__
+
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__ // Use ll64
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <sys/random.h>
+#include <stdatomic.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
+
+#define _err(fmt, ...) \
+ do { \
+ int ret = errno; \
+ fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
+ fprintf(stderr, " (errno=%d, @%s:%d)\n", \
+ ret, __FILE__, __LINE__); \
+ } while (0)
+
+#define errexit(exitcode, fmt, ...) \
+ do { \
+ _err(fmt, ##__VA_ARGS__); \
+ exit(exitcode); \
+ } while (0)
+
+#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
+
+/* Userfaultfd test statistics */
+struct uffd_args {
+ int cpu;
+ /* Whether apply wr-protects when installing pages */
+ bool apply_wp;
+ unsigned long missing_faults;
+ unsigned long wp_faults;
+ unsigned long minor_faults;
+
+ /* A custom fault handler; defaults to uffd_handle_page_fault. */
+ void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args);
+};
+
+struct uffd_test_ops {
+ int (*allocate_area)(void **alloc_area, bool is_src);
+ void (*release_pages)(char *rel_area);
+ void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
+ void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
+};
+typedef struct uffd_test_ops uffd_test_ops_t;
+
+struct uffd_test_case_ops {
+ int (*pre_alloc)(const char **errmsg);
+ int (*post_alloc)(const char **errmsg);
+};
+typedef struct uffd_test_case_ops uffd_test_case_ops_t;
+
+extern unsigned long nr_parallel, nr_pages, nr_pages_per_cpu, page_size;
+extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+extern int uffd, uffd_flags, finished, *pipefd, test_type;
+extern bool map_shared;
+extern bool test_uffdio_wp;
+extern unsigned long long *count_verify;
+extern volatile bool test_uffdio_copy_eexist;
+extern atomic_bool ready_for_fork;
+
+extern uffd_test_ops_t anon_uffd_test_ops;
+extern uffd_test_ops_t shmem_uffd_test_ops;
+extern uffd_test_ops_t hugetlb_uffd_test_ops;
+extern uffd_test_ops_t *uffd_test_ops;
+extern uffd_test_case_ops_t *uffd_test_case_ops;
+
+void uffd_stats_report(struct uffd_args *args, int n_cpus);
+int uffd_test_ctx_init(uint64_t features, const char **errmsg);
+void uffd_test_ctx_clear(void);
+int userfaultfd_open(uint64_t *features);
+int uffd_read_msg(int ufd, struct uffd_msg *msg);
+void wp_range(int ufd, __u64 start, __u64 len, bool wp);
+void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args);
+int __copy_page(int ufd, unsigned long offset, bool retry, bool wp);
+int copy_page(int ufd, unsigned long offset, bool wp);
+int move_page(int ufd, unsigned long offset, unsigned long len);
+void *uffd_poll_thread(void *arg);
+
+int uffd_open_dev(unsigned int flags);
+int uffd_open_sys(unsigned int flags);
+int uffd_open(unsigned int flags);
+int uffd_get_features(uint64_t *features);
+
+#define TEST_ANON 1
+#define TEST_HUGETLB 2
+#define TEST_SHMEM 3
+
+#endif
diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
new file mode 100644
index 000000000000..40af7f67c407
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stress userfaultfd syscall.
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ * page of the area_dst (while the physical page may still be in
+ * area_src), and increments a per-page counter in the same page,
+ * and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ * thread 1 above. userfaultfd blocking reads or poll() modes are
+ * exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ * at maximum bandwidth (if not already transferred by thread
+ * 2). Each cpu thread takes cares of transferring a portion of the
+ * area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ */
+
+#include "uffd-common.h"
+
+uint64_t features;
+#ifdef __NR_userfaultfd
+
+#define BOUNCE_RANDOM (1<<0)
+#define BOUNCE_RACINGFAULTS (1<<1)
+#define BOUNCE_VERIFY (1<<2)
+#define BOUNCE_POLL (1<<3)
+static int bounces;
+
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static char *zeropage;
+pthread_attr_t attr;
+
+#define swap(a, b) \
+ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+const char *examples =
+ "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+ "./uffd-stress anon 100 99999\n\n"
+ "# Run share memory test on 1GiB region with 99 bounces:\n"
+ "./uffd-stress shmem 1000 99\n\n"
+ "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
+ "./uffd-stress hugetlb 256 50\n\n"
+ "# Run the same hugetlb test but using private file:\n"
+ "./uffd-stress hugetlb-private 256 50\n\n"
+ "# 10MiB-~6GiB 999 bounces anonymous test, "
+ "continue forever unless an error triggers\n"
+ "while ./uffd-stress anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+
+static void usage(void)
+{
+ fprintf(stderr, "\nUsage: ./uffd-stress <test type> <MiB> <bounces>\n\n");
+ fprintf(stderr, "Supported <test type>: anon, hugetlb, "
+ "hugetlb-private, shmem, shmem-private\n\n");
+ fprintf(stderr, "Examples:\n\n");
+ fprintf(stderr, "%s", examples);
+ exit(1);
+}
+
+static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus)
+{
+ int i;
+
+ for (i = 0; i < n_cpus; i++) {
+ args[i].cpu = i;
+ args[i].apply_wp = test_uffdio_wp;
+ args[i].missing_faults = 0;
+ args[i].wp_faults = 0;
+ args[i].minor_faults = 0;
+ }
+}
+
+static void *locking_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ unsigned long page_nr;
+ unsigned long long count;
+
+ if (!(bounces & BOUNCE_RANDOM)) {
+ page_nr = -bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ page_nr += cpu * nr_pages_per_cpu;
+ }
+
+ while (!finished) {
+ if (bounces & BOUNCE_RANDOM) {
+ if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
+ err("getrandom failed");
+ } else
+ page_nr += 1;
+ page_nr %= nr_pages;
+ pthread_mutex_lock(area_mutex(area_dst, page_nr));
+ count = *area_count(area_dst, page_nr);
+ if (count != count_verify[page_nr])
+ err("page_nr %lu memory corruption %llu %llu",
+ page_nr, count, count_verify[page_nr]);
+ count++;
+ *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+ pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+ }
+
+ return NULL;
+}
+
+static int copy_page_retry(int ufd, unsigned long offset)
+{
+ return __copy_page(ufd, offset, true, test_uffdio_wp);
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void *uffd_read_thread(void *arg)
+{
+ struct uffd_args *args = (struct uffd_args *)arg;
+ struct uffd_msg msg;
+
+ pthread_mutex_unlock(&uffd_read_mutex);
+ /* from here cancellation is ok */
+
+ for (;;) {
+ if (uffd_read_msg(uffd, &msg))
+ continue;
+ uffd_handle_page_fault(&msg, args);
+ }
+
+ return NULL;
+}
+
+static void *background_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ unsigned long page_nr, start_nr, mid_nr, end_nr;
+
+ start_nr = cpu * nr_pages_per_cpu;
+ end_nr = (cpu+1) * nr_pages_per_cpu;
+ mid_nr = (start_nr + end_nr) / 2;
+
+ /* Copy the first half of the pages */
+ for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
+ copy_page_retry(uffd, page_nr * page_size);
+
+ /*
+ * If we need to test uffd-wp, set it up now. Then we'll have
+ * at least the first half of the pages mapped already which
+ * can be write-protected for testing
+ */
+ if (test_uffdio_wp)
+ wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
+ nr_pages_per_cpu * page_size, true);
+
+ /*
+ * Continue the 2nd half of the page copying, handling write
+ * protection faults if any
+ */
+ for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
+ copy_page_retry(uffd, page_nr * page_size);
+
+ return NULL;
+}
+
+static int stress(struct uffd_args *args)
+{
+ unsigned long cpu;
+ pthread_t locking_threads[nr_parallel];
+ pthread_t uffd_threads[nr_parallel];
+ pthread_t background_threads[nr_parallel];
+
+ finished = 0;
+ for (cpu = 0; cpu < nr_parallel; cpu++) {
+ if (pthread_create(&locking_threads[cpu], &attr,
+ locking_thread, (void *)cpu))
+ return 1;
+ if (bounces & BOUNCE_POLL) {
+ if (pthread_create(&uffd_threads[cpu], &attr, uffd_poll_thread, &args[cpu]))
+ err("uffd_poll_thread create");
+ } else {
+ if (pthread_create(&uffd_threads[cpu], &attr,
+ uffd_read_thread,
+ (void *)&args[cpu]))
+ return 1;
+ pthread_mutex_lock(&uffd_read_mutex);
+ }
+ if (pthread_create(&background_threads[cpu], &attr,
+ background_thread, (void *)cpu))
+ return 1;
+ }
+ for (cpu = 0; cpu < nr_parallel; cpu++)
+ if (pthread_join(background_threads[cpu], NULL))
+ return 1;
+
+ /*
+ * Be strict and immediately zap area_src, the whole area has
+ * been transferred already by the background treads. The
+ * area_src could then be faulted in a racy way by still
+ * running uffdio_threads reading zeropages after we zapped
+ * area_src (but they're guaranteed to get -EEXIST from
+ * UFFDIO_COPY without writing zero pages into area_dst
+ * because the background threads already completed).
+ */
+ uffd_test_ops->release_pages(area_src);
+
+ finished = 1;
+ for (cpu = 0; cpu < nr_parallel; cpu++)
+ if (pthread_join(locking_threads[cpu], NULL))
+ return 1;
+
+ for (cpu = 0; cpu < nr_parallel; cpu++) {
+ char c;
+ if (bounces & BOUNCE_POLL) {
+ if (write(pipefd[cpu*2+1], &c, 1) != 1)
+ err("pipefd write error");
+ if (pthread_join(uffd_threads[cpu],
+ (void *)&args[cpu]))
+ return 1;
+ } else {
+ if (pthread_cancel(uffd_threads[cpu]))
+ return 1;
+ if (pthread_join(uffd_threads[cpu], NULL))
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int userfaultfd_stress(void)
+{
+ void *area;
+ unsigned long nr;
+ struct uffd_args args[nr_parallel];
+ uint64_t mem_size = nr_pages * page_size;
+ int flags = 0;
+
+ memset(args, 0, sizeof(struct uffd_args) * nr_parallel);
+
+ if (features & UFFD_FEATURE_WP_UNPOPULATED && test_type == TEST_ANON)
+ flags = UFFD_FEATURE_WP_UNPOPULATED;
+
+ if (uffd_test_ctx_init(flags, NULL))
+ err("context init failed");
+
+ if (posix_memalign(&area, page_size, page_size))
+ err("out of memory");
+ zeropage = area;
+ bzero(zeropage, page_size);
+
+ pthread_mutex_lock(&uffd_read_mutex);
+
+ pthread_attr_init(&attr);
+ pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+ while (bounces--) {
+ printf("bounces: %d, mode:", bounces);
+ if (bounces & BOUNCE_RANDOM)
+ printf(" rnd");
+ if (bounces & BOUNCE_RACINGFAULTS)
+ printf(" racing");
+ if (bounces & BOUNCE_VERIFY)
+ printf(" ver");
+ if (bounces & BOUNCE_POLL)
+ printf(" poll");
+ else
+ printf(" read");
+ printf(", ");
+ fflush(stdout);
+
+ if (bounces & BOUNCE_POLL)
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+ else
+ fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+ /* register */
+ if (uffd_register(uffd, area_dst, mem_size,
+ true, test_uffdio_wp, false))
+ err("register failure");
+
+ if (area_dst_alias) {
+ if (uffd_register(uffd, area_dst_alias, mem_size,
+ true, test_uffdio_wp, false))
+ err("register failure alias");
+ }
+
+ /*
+ * The madvise done previously isn't enough: some
+ * uffd_thread could have read userfaults (one of
+ * those already resolved by the background thread)
+ * and it may be in the process of calling
+ * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+ * area_src and it would map a zero page in it (of
+ * course such a UFFDIO_COPY is perfectly safe as it'd
+ * return -EEXIST). The problem comes at the next
+ * bounce though: that racing UFFDIO_COPY would
+ * generate zeropages in the area_src, so invalidating
+ * the previous MADV_DONTNEED. Without this additional
+ * MADV_DONTNEED those zeropages leftovers in the
+ * area_src would lead to -EEXIST failure during the
+ * next bounce, effectively leaving a zeropage in the
+ * area_dst.
+ *
+ * Try to comment this out madvise to see the memory
+ * corruption being caught pretty quick.
+ *
+ * khugepaged is also inhibited to collapse THP after
+ * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+ * required to MADV_DONTNEED here.
+ */
+ uffd_test_ops->release_pages(area_dst);
+
+ uffd_stats_reset(args, nr_parallel);
+
+ /* bounce pass */
+ if (stress(args)) {
+ uffd_test_ctx_clear();
+ return 1;
+ }
+
+ /* Clear all the write protections if there is any */
+ if (test_uffdio_wp)
+ wp_range(uffd, (unsigned long)area_dst,
+ nr_pages * page_size, false);
+
+ /* unregister */
+ if (uffd_unregister(uffd, area_dst, mem_size))
+ err("unregister failure");
+ if (area_dst_alias) {
+ if (uffd_unregister(uffd, area_dst_alias, mem_size))
+ err("unregister failure alias");
+ }
+
+ /* verification */
+ if (bounces & BOUNCE_VERIFY)
+ for (nr = 0; nr < nr_pages; nr++)
+ if (*area_count(area_dst, nr) != count_verify[nr])
+ err("error area_count %llu %llu %lu\n",
+ *area_count(area_src, nr),
+ count_verify[nr], nr);
+
+ /* prepare next bounce */
+ swap(area_src, area_dst);
+
+ swap(area_src_alias, area_dst_alias);
+
+ uffd_stats_report(args, nr_parallel);
+ }
+ uffd_test_ctx_clear();
+
+ return 0;
+}
+
+static void set_test_type(const char *type)
+{
+ if (!strcmp(type, "anon")) {
+ test_type = TEST_ANON;
+ uffd_test_ops = &anon_uffd_test_ops;
+ } else if (!strcmp(type, "hugetlb")) {
+ test_type = TEST_HUGETLB;
+ uffd_test_ops = &hugetlb_uffd_test_ops;
+ map_shared = true;
+ } else if (!strcmp(type, "hugetlb-private")) {
+ test_type = TEST_HUGETLB;
+ uffd_test_ops = &hugetlb_uffd_test_ops;
+ } else if (!strcmp(type, "shmem")) {
+ map_shared = true;
+ test_type = TEST_SHMEM;
+ uffd_test_ops = &shmem_uffd_test_ops;
+ } else if (!strcmp(type, "shmem-private")) {
+ test_type = TEST_SHMEM;
+ uffd_test_ops = &shmem_uffd_test_ops;
+ }
+}
+
+static void parse_test_type_arg(const char *raw_type)
+{
+ set_test_type(raw_type);
+
+ if (!test_type)
+ err("failed to parse test type argument: '%s'", raw_type);
+
+ if (test_type == TEST_HUGETLB)
+ page_size = default_huge_page_size();
+ else
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ if (!page_size)
+ err("Unable to determine page size");
+ if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+ > page_size)
+ err("Impossible to run this test");
+
+ /*
+ * Whether we can test certain features depends not just on test type,
+ * but also on whether or not this particular kernel supports the
+ * feature.
+ */
+
+ if (uffd_get_features(&features) && errno == ENOENT)
+ ksft_exit_skip("failed to get available features (%d)\n", errno);
+
+ test_uffdio_wp = test_uffdio_wp &&
+ (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+
+ if (test_type != TEST_ANON && !(features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM))
+ test_uffdio_wp = false;
+
+ close(uffd);
+ uffd = -1;
+}
+
+static void sigalrm(int sig)
+{
+ if (sig != SIGALRM)
+ abort();
+ test_uffdio_copy_eexist = true;
+ alarm(ALARM_INTERVAL_SECS);
+}
+
+int main(int argc, char **argv)
+{
+ unsigned long nr_cpus;
+ size_t bytes;
+
+ if (argc < 4)
+ usage();
+
+ if (signal(SIGALRM, sigalrm) == SIG_ERR)
+ err("failed to arm SIGALRM");
+ alarm(ALARM_INTERVAL_SECS);
+
+ parse_test_type_arg(argv[1]);
+ bytes = atol(argv[2]) * 1024 * 1024;
+
+ if (test_type == TEST_HUGETLB &&
+ get_free_hugepages() < bytes / page_size) {
+ printf("skip: Skipping userfaultfd... not enough hugepages\n");
+ return KSFT_SKIP;
+ }
+
+ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ if (nr_cpus > 32) {
+ /* Don't let calculation below go to zero. */
+ ksft_print_msg("_SC_NPROCESSORS_ONLN (%lu) too large, capping nr_threads to 32\n",
+ nr_cpus);
+ nr_parallel = 32;
+ } else {
+ nr_parallel = nr_cpus;
+ }
+
+ nr_pages_per_cpu = bytes / page_size / nr_parallel;
+ if (!nr_pages_per_cpu) {
+ _err("pages_per_cpu = 0, cannot test (%lu / %lu / %lu)",
+ bytes, page_size, nr_parallel);
+ usage();
+ }
+
+ bounces = atoi(argv[3]);
+ if (bounces <= 0) {
+ _err("invalid bounces");
+ usage();
+ }
+ nr_pages = nr_pages_per_cpu * nr_parallel;
+
+ printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+ nr_pages, nr_pages_per_cpu);
+ return userfaultfd_stress();
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+ printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
new file mode 100644
index 000000000000..c73fd5d455c8
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -0,0 +1,1777 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Userfaultfd unit tests.
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+
+#include "uffd-common.h"
+
+#include "../../../../mm/gup_test.h"
+
+#ifdef __NR_userfaultfd
+
+/* The unit test doesn't need a large or random size, make it 32MB for now */
+#define UFFD_TEST_MEM_SIZE (32UL << 20)
+
+#define MEM_ANON BIT_ULL(0)
+#define MEM_SHMEM BIT_ULL(1)
+#define MEM_SHMEM_PRIVATE BIT_ULL(2)
+#define MEM_HUGETLB BIT_ULL(3)
+#define MEM_HUGETLB_PRIVATE BIT_ULL(4)
+
+#define MEM_ALL (MEM_ANON | MEM_SHMEM | MEM_SHMEM_PRIVATE | \
+ MEM_HUGETLB | MEM_HUGETLB_PRIVATE)
+
+#define ALIGN_UP(x, align_to) \
+ ((__typeof__(x))((((unsigned long)(x)) + ((align_to)-1)) & ~((align_to)-1)))
+
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+struct mem_type {
+ const char *name;
+ unsigned int mem_flag;
+ uffd_test_ops_t *mem_ops;
+ bool shared;
+};
+typedef struct mem_type mem_type_t;
+
+mem_type_t mem_types[] = {
+ {
+ .name = "anon",
+ .mem_flag = MEM_ANON,
+ .mem_ops = &anon_uffd_test_ops,
+ .shared = false,
+ },
+ {
+ .name = "shmem",
+ .mem_flag = MEM_SHMEM,
+ .mem_ops = &shmem_uffd_test_ops,
+ .shared = true,
+ },
+ {
+ .name = "shmem-private",
+ .mem_flag = MEM_SHMEM_PRIVATE,
+ .mem_ops = &shmem_uffd_test_ops,
+ .shared = false,
+ },
+ {
+ .name = "hugetlb",
+ .mem_flag = MEM_HUGETLB,
+ .mem_ops = &hugetlb_uffd_test_ops,
+ .shared = true,
+ },
+ {
+ .name = "hugetlb-private",
+ .mem_flag = MEM_HUGETLB_PRIVATE,
+ .mem_ops = &hugetlb_uffd_test_ops,
+ .shared = false,
+ },
+};
+
+/* Arguments to be passed over to each uffd unit test */
+struct uffd_test_args {
+ mem_type_t *mem_type;
+};
+typedef struct uffd_test_args uffd_test_args_t;
+
+/* Returns: UFFD_TEST_* */
+typedef void (*uffd_test_fn)(uffd_test_args_t *);
+
+typedef struct {
+ const char *name;
+ uffd_test_fn uffd_fn;
+ unsigned int mem_targets;
+ uint64_t uffd_feature_required;
+ uffd_test_case_ops_t *test_case_ops;
+} uffd_test_case_t;
+
+static void uffd_test_report(void)
+{
+ printf("Userfaults unit tests: pass=%u, skip=%u, fail=%u (total=%u)\n",
+ ksft_get_pass_cnt(),
+ ksft_get_xskip_cnt(),
+ ksft_get_fail_cnt(),
+ ksft_test_num());
+}
+
+static void uffd_test_pass(void)
+{
+ printf("done\n");
+ ksft_inc_pass_cnt();
+}
+
+#define uffd_test_start(...) do { \
+ printf("Testing "); \
+ printf(__VA_ARGS__); \
+ printf("... "); \
+ fflush(stdout); \
+ } while (0)
+
+#define uffd_test_fail(...) do { \
+ printf("failed [reason: "); \
+ printf(__VA_ARGS__); \
+ printf("]\n"); \
+ ksft_inc_fail_cnt(); \
+ } while (0)
+
+static void uffd_test_skip(const char *message)
+{
+ printf("skipped [reason: %s]\n", message);
+ ksft_inc_xskip_cnt();
+}
+
+/*
+ * Returns 1 if specific userfaultfd supported, 0 otherwise. Note, we'll
+ * return 1 even if some test failed as long as uffd supported, because in
+ * that case we still want to proceed with the rest uffd unit tests.
+ */
+static int test_uffd_api(bool use_dev)
+{
+ struct uffdio_api uffdio_api;
+ int uffd;
+
+ uffd_test_start("UFFDIO_API (with %s)",
+ use_dev ? "/dev/userfaultfd" : "syscall");
+
+ if (use_dev)
+ uffd = uffd_open_dev(UFFD_FLAGS);
+ else
+ uffd = uffd_open_sys(UFFD_FLAGS);
+ if (uffd < 0) {
+ uffd_test_skip("cannot open userfaultfd handle");
+ return 0;
+ }
+
+ /* Test wrong UFFD_API */
+ uffdio_api.api = 0xab;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
+ uffd_test_fail("UFFDIO_API should fail with wrong api but didn't");
+ goto out;
+ }
+
+ /* Test wrong feature bit */
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = BIT_ULL(63);
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
+ uffd_test_fail("UFFDIO_API should fail with wrong feature but didn't");
+ goto out;
+ }
+
+ /* Test normal UFFDIO_API */
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+ uffd_test_fail("UFFDIO_API should succeed but failed");
+ goto out;
+ }
+
+ /* Test double requests of UFFDIO_API with a random feature set */
+ uffdio_api.features = BIT_ULL(0);
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
+ uffd_test_fail("UFFDIO_API should reject initialized uffd");
+ goto out;
+ }
+
+ uffd_test_pass();
+out:
+ close(uffd);
+ /* We have a valid uffd handle */
+ return 1;
+}
+
+/*
+ * This function initializes the global variables. TODO: remove global
+ * vars and then remove this.
+ */
+static int
+uffd_setup_environment(uffd_test_args_t *args, uffd_test_case_t *test,
+ mem_type_t *mem_type, const char **errmsg)
+{
+ map_shared = mem_type->shared;
+ uffd_test_ops = mem_type->mem_ops;
+ uffd_test_case_ops = test->test_case_ops;
+
+ if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB))
+ page_size = default_huge_page_size();
+ else
+ page_size = psize();
+
+ /* Ensure we have at least 2 pages */
+ nr_pages = MAX(UFFD_TEST_MEM_SIZE, page_size * 2) / page_size;
+ /* TODO: remove this global var.. it's so ugly */
+ nr_parallel = 1;
+
+ /* Initialize test arguments */
+ args->mem_type = mem_type;
+
+ return uffd_test_ctx_init(test->uffd_feature_required, errmsg);
+}
+
+static bool uffd_feature_supported(uffd_test_case_t *test)
+{
+ uint64_t features;
+
+ if (uffd_get_features(&features))
+ return false;
+
+ return (features & test->uffd_feature_required) ==
+ test->uffd_feature_required;
+}
+
+static int pagemap_open(void)
+{
+ int fd = open("/proc/self/pagemap", O_RDONLY);
+
+ if (fd < 0)
+ err("open pagemap");
+
+ return fd;
+}
+
+/* This macro let __LINE__ works in err() */
+#define pagemap_check_wp(value, wp) do { \
+ if (!!(value & PM_UFFD_WP) != wp) \
+ err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
+ } while (0)
+
+typedef struct {
+ int parent_uffd, child_uffd;
+} fork_event_args;
+
+static void *fork_event_consumer(void *data)
+{
+ fork_event_args *args = data;
+ struct uffd_msg msg = { 0 };
+
+ ready_for_fork = true;
+
+ /* Read until a full msg received */
+ while (uffd_read_msg(args->parent_uffd, &msg));
+
+ if (msg.event != UFFD_EVENT_FORK)
+ err("wrong message: %u\n", msg.event);
+
+ /* Just to be properly freed later */
+ args->child_uffd = msg.arg.fork.ufd;
+ return NULL;
+}
+
+typedef struct {
+ int gup_fd;
+ bool pinned;
+} pin_args;
+
+/*
+ * Returns 0 if succeed, <0 for errors. pin_pages() needs to be paired
+ * with unpin_pages(). Currently it needs to be RO longterm pin to satisfy
+ * all needs of the test cases (e.g., trigger unshare, trigger fork() early
+ * CoW, etc.).
+ */
+static int pin_pages(pin_args *args, void *buffer, size_t size)
+{
+ struct pin_longterm_test test = {
+ .addr = (uintptr_t)buffer,
+ .size = size,
+ /* Read-only pins */
+ .flags = 0,
+ };
+
+ if (args->pinned)
+ err("already pinned");
+
+ args->gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ if (args->gup_fd < 0)
+ return -errno;
+
+ if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_START, &test)) {
+ /* Even if gup_test existed, can be an old gup_test / kernel */
+ close(args->gup_fd);
+ return -errno;
+ }
+ args->pinned = true;
+ return 0;
+}
+
+static void unpin_pages(pin_args *args)
+{
+ if (!args->pinned)
+ err("unpin without pin first");
+ if (ioctl(args->gup_fd, PIN_LONGTERM_TEST_STOP))
+ err("PIN_LONGTERM_TEST_STOP");
+ close(args->gup_fd);
+ args->pinned = false;
+}
+
+static int pagemap_test_fork(int uffd, bool with_event, bool test_pin)
+{
+ fork_event_args args = { .parent_uffd = uffd, .child_uffd = -1 };
+ pthread_t thread;
+ pid_t child;
+ uint64_t value;
+ int fd, result;
+
+ /* Prepare a thread to resolve EVENT_FORK */
+ if (with_event) {
+ ready_for_fork = false;
+ if (pthread_create(&thread, NULL, fork_event_consumer, &args))
+ err("pthread_create()");
+ while (!ready_for_fork)
+ ; /* Wait for the poll_thread to start executing before forking */
+ }
+
+ child = fork();
+ if (!child) {
+ /* Open the pagemap fd of the child itself */
+ pin_args args = {};
+
+ fd = pagemap_open();
+
+ if (test_pin && pin_pages(&args, area_dst, page_size))
+ /*
+ * Normally when reach here we have pinned in
+ * previous tests, so shouldn't fail anymore
+ */
+ err("pin page failed in child");
+
+ value = pagemap_get_entry(fd, area_dst);
+ /*
+ * After fork(), we should handle uffd-wp bit differently:
+ *
+ * (1) when with EVENT_FORK, it should persist
+ * (2) when without EVENT_FORK, it should be dropped
+ */
+ pagemap_check_wp(value, with_event);
+ if (test_pin)
+ unpin_pages(&args);
+ /* Succeed */
+ exit(0);
+ }
+ waitpid(child, &result, 0);
+
+ if (with_event) {
+ if (pthread_join(thread, NULL))
+ err("pthread_join()");
+ if (args.child_uffd < 0)
+ err("Didn't receive child uffd");
+ close(args.child_uffd);
+ }
+
+ return result;
+}
+
+static void uffd_wp_unpopulated_test(uffd_test_args_t *args)
+{
+ uint64_t value;
+ int pagemap_fd;
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ false, true, false))
+ err("register failed");
+
+ pagemap_fd = pagemap_open();
+
+ /* Test applying pte marker to anon unpopulated */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+
+ /* Test unprotect on anon pte marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, false);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Test zap on anon marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Test fault in after marker removed */
+ *area_dst = 1;
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+ /* Drop it to make pte none again */
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+
+ /* Test read-zero-page upon pte marker */
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ *(volatile char *)area_dst;
+ /* Drop it to make pte none again */
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+
+ uffd_test_pass();
+}
+
+static void uffd_wp_fork_test_common(uffd_test_args_t *args,
+ bool with_event)
+{
+ int pagemap_fd;
+ uint64_t value;
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ false, true, false))
+ err("register failed");
+
+ pagemap_fd = pagemap_open();
+
+ /* Touch the page */
+ *area_dst = 1;
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+ if (pagemap_test_fork(uffd, with_event, false)) {
+ uffd_test_fail("Detected %s uffd-wp bit in child in present pte",
+ with_event ? "missing" : "stall");
+ goto out;
+ }
+
+ /*
+ * This is an attempt for zapping the pgtable so as to test the
+ * markers.
+ *
+ * For private mappings, PAGEOUT will only work on exclusive ptes
+ * (PM_MMAP_EXCLUSIVE) which we should satisfy.
+ *
+ * For shared, PAGEOUT may not work. Use DONTNEED instead which
+ * plays a similar role of zapping (rather than freeing the page)
+ * to expose pte markers.
+ */
+ if (args->mem_type->shared) {
+ if (madvise(area_dst, page_size, MADV_DONTNEED))
+ err("MADV_DONTNEED");
+ } else {
+ /*
+ * NOTE: ignore retval because private-hugetlb doesn't yet
+ * support swapping, so it could fail.
+ */
+ madvise(area_dst, page_size, MADV_PAGEOUT);
+ }
+
+ /* Uffd-wp should persist even swapped out */
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, true);
+ if (pagemap_test_fork(uffd, with_event, false)) {
+ uffd_test_fail("Detected %s uffd-wp bit in child in zapped pte",
+ with_event ? "missing" : "stall");
+ goto out;
+ }
+
+ /* Unprotect; this tests swap pte modifications */
+ wp_range(uffd, (uint64_t)area_dst, page_size, false);
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+
+ /* Fault in the page from disk */
+ *area_dst = 2;
+ value = pagemap_get_entry(pagemap_fd, area_dst);
+ pagemap_check_wp(value, false);
+ uffd_test_pass();
+out:
+ if (uffd_unregister(uffd, area_dst, nr_pages * page_size))
+ err("unregister failed");
+ close(pagemap_fd);
+}
+
+static void uffd_wp_fork_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_test_common(args, false);
+}
+
+static void uffd_wp_fork_with_event_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_test_common(args, true);
+}
+
+static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args,
+ bool with_event)
+{
+ int pagemap_fd;
+ pin_args pin_args = {};
+
+ if (uffd_register(uffd, area_dst, page_size, false, true, false))
+ err("register failed");
+
+ pagemap_fd = pagemap_open();
+
+ /* Touch the page */
+ *area_dst = 1;
+ wp_range(uffd, (uint64_t)area_dst, page_size, true);
+
+ /*
+ * 1. First pin, then fork(). This tests fork() special path when
+ * doing early CoW if the page is private.
+ */
+ if (pin_pages(&pin_args, area_dst, page_size)) {
+ uffd_test_skip("Possibly CONFIG_GUP_TEST missing "
+ "or unprivileged");
+ close(pagemap_fd);
+ uffd_unregister(uffd, area_dst, page_size);
+ return;
+ }
+
+ if (pagemap_test_fork(uffd, with_event, false)) {
+ uffd_test_fail("Detected %s uffd-wp bit in early CoW of fork()",
+ with_event ? "missing" : "stall");
+ unpin_pages(&pin_args);
+ goto out;
+ }
+
+ unpin_pages(&pin_args);
+
+ /*
+ * 2. First fork(), then pin (in the child, where test_pin==true).
+ * This tests COR, aka, page unsharing on private memories.
+ */
+ if (pagemap_test_fork(uffd, with_event, true)) {
+ uffd_test_fail("Detected %s uffd-wp bit when RO pin",
+ with_event ? "missing" : "stall");
+ goto out;
+ }
+ uffd_test_pass();
+out:
+ if (uffd_unregister(uffd, area_dst, page_size))
+ err("register failed");
+ close(pagemap_fd);
+}
+
+static void uffd_wp_fork_pin_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_pin_test_common(args, false);
+}
+
+static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args)
+{
+ uffd_wp_fork_pin_test_common(args, true);
+}
+
+static void check_memory_contents(char *p)
+{
+ unsigned long i, j;
+ uint8_t expected_byte;
+
+ for (i = 0; i < nr_pages; ++i) {
+ expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
+ for (j = 0; j < page_size; j++) {
+ uint8_t v = *(uint8_t *)(p + (i * page_size) + j);
+ if (v != expected_byte)
+ err("unexpected page contents");
+ }
+ }
+}
+
+static void uffd_minor_test_common(bool test_collapse, bool test_wp)
+{
+ unsigned long p;
+ pthread_t uffd_mon;
+ char c;
+ struct uffd_args args = { 0 };
+
+ /*
+ * NOTE: MADV_COLLAPSE is not yet compatible with WP, so testing
+ * both do not make much sense.
+ */
+ assert(!(test_collapse && test_wp));
+
+ if (uffd_register(uffd, area_dst_alias, nr_pages * page_size,
+ /* NOTE! MADV_COLLAPSE may not work with uffd-wp */
+ false, test_wp, true))
+ err("register failure");
+
+ /*
+ * After registering with UFFD, populate the non-UFFD-registered side of
+ * the shared mapping. This should *not* trigger any UFFD minor faults.
+ */
+ for (p = 0; p < nr_pages; ++p)
+ memset(area_dst + (p * page_size), p % ((uint8_t)-1),
+ page_size);
+
+ args.apply_wp = test_wp;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ /*
+ * Read each of the pages back using the UFFD-registered mapping. We
+ * expect that the first time we touch a page, it will result in a minor
+ * fault. uffd_poll_thread will resolve the fault by bit-flipping the
+ * page's contents, and then issuing a CONTINUE ioctl.
+ */
+ check_memory_contents(area_dst_alias);
+
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("join() failed");
+
+ if (test_collapse) {
+ if (madvise(area_dst_alias, nr_pages * page_size,
+ MADV_COLLAPSE)) {
+ /* It's fine to fail for this one... */
+ uffd_test_skip("MADV_COLLAPSE failed");
+ return;
+ }
+
+ uffd_test_ops->check_pmd_mapping(area_dst,
+ nr_pages * page_size /
+ read_pmd_pagesize());
+ /*
+ * This won't cause uffd-fault - it purely just makes sure there
+ * was no corruption.
+ */
+ check_memory_contents(area_dst_alias);
+ }
+
+ if (args.missing_faults != 0 || args.minor_faults != nr_pages)
+ uffd_test_fail("stats check error");
+ else
+ uffd_test_pass();
+}
+
+void uffd_minor_test(uffd_test_args_t *args)
+{
+ uffd_minor_test_common(false, false);
+}
+
+void uffd_minor_wp_test(uffd_test_args_t *args)
+{
+ uffd_minor_test_common(false, true);
+}
+
+void uffd_minor_collapse_test(uffd_test_args_t *args)
+{
+ uffd_minor_test_common(true, false);
+}
+
+static sigjmp_buf jbuf, *sigbuf;
+
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+ if (sig == SIGBUS) {
+ if (sigbuf)
+ siglongjmp(*sigbuf, 1);
+ abort();
+ }
+}
+
+/*
+ * For non-cooperative userfaultfd test we fork() a process that will
+ * generate pagefaults, will mremap the area monitored by the
+ * userfaultfd and at last this process will release the monitored
+ * area.
+ * For the anonymous and shared memory the area is divided into two
+ * parts, the first part is accessed before mremap, and the second
+ * part is accessed after mremap. Since hugetlbfs does not support
+ * mremap, the entire monitored area is accessed in a single pass for
+ * HUGETLB_TEST.
+ * The release of the pages currently generates event for shmem and
+ * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
+ * for hugetlb.
+ * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
+ * monitored area, generate pagefaults and test that signal is delivered.
+ * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
+ * test robustness use case - we release monitored area, fork a process
+ * that will generate pagefaults and verify signal is generated.
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
+ * feature. Using monitor thread, verify no userfault events are generated.
+ */
+static int faulting_process(int signal_test, bool wp)
+{
+ unsigned long nr, i;
+ unsigned long long count;
+ unsigned long split_nr_pages;
+ unsigned long lastnr;
+ struct sigaction act;
+ volatile unsigned long signalled = 0;
+
+ split_nr_pages = (nr_pages + 1) / 2;
+
+ if (signal_test) {
+ sigbuf = &jbuf;
+ memset(&act, 0, sizeof(act));
+ act.sa_sigaction = sighndl;
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGBUS, &act, 0))
+ err("sigaction");
+ lastnr = (unsigned long)-1;
+ }
+
+ for (nr = 0; nr < split_nr_pages; nr++) {
+ volatile int steps = 1;
+ unsigned long offset = nr * page_size;
+
+ if (signal_test) {
+ if (sigsetjmp(*sigbuf, 1) != 0) {
+ if (steps == 1 && nr == lastnr)
+ err("Signal repeated");
+
+ lastnr = nr;
+ if (signal_test == 1) {
+ if (steps == 1) {
+ /* This is a MISSING request */
+ steps++;
+ if (copy_page(uffd, offset, wp))
+ signalled++;
+ } else {
+ /* This is a WP request */
+ assert(steps == 2);
+ wp_range(uffd,
+ (__u64)area_dst +
+ offset,
+ page_size, false);
+ }
+ } else {
+ signalled++;
+ continue;
+ }
+ }
+ }
+
+ count = *area_count(area_dst, nr);
+ if (count != count_verify[nr])
+ err("nr %lu memory corruption %llu %llu\n",
+ nr, count, count_verify[nr]);
+ /*
+ * Trigger write protection if there is by writing
+ * the same value back.
+ */
+ *area_count(area_dst, nr) = count;
+ }
+
+ if (signal_test)
+ return signalled != split_nr_pages;
+
+ area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
+ if (area_dst == MAP_FAILED)
+ err("mremap");
+ /* Reset area_src since we just clobbered it */
+ area_src = NULL;
+
+ for (; nr < nr_pages; nr++) {
+ count = *area_count(area_dst, nr);
+ if (count != count_verify[nr]) {
+ err("nr %lu memory corruption %llu %llu\n",
+ nr, count, count_verify[nr]);
+ }
+ /*
+ * Trigger write protection if there is by writing
+ * the same value back.
+ */
+ *area_count(area_dst, nr) = count;
+ }
+
+ uffd_test_ops->release_pages(area_dst);
+
+ for (nr = 0; nr < nr_pages; nr++)
+ for (i = 0; i < page_size; i++)
+ if (*(area_dst + nr * page_size + i) != 0)
+ err("page %lu offset %lu is not zero", nr, i);
+
+ return 0;
+}
+
+static void uffd_sigbus_test_common(bool wp)
+{
+ unsigned long userfaults;
+ pthread_t uffd_mon;
+ pid_t pid;
+ int err;
+ char c;
+ struct uffd_args args = { 0 };
+
+ ready_for_fork = false;
+
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ true, wp, false))
+ err("register failure");
+
+ if (faulting_process(1, wp))
+ err("faulting process failed");
+
+ uffd_test_ops->release_pages(area_dst);
+
+ args.apply_wp = wp;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ while (!ready_for_fork)
+ ; /* Wait for the poll_thread to start executing before forking */
+
+ pid = fork();
+ if (pid < 0)
+ err("fork");
+
+ if (!pid)
+ exit(faulting_process(2, wp));
+
+ waitpid(pid, &err, 0);
+ if (err)
+ err("faulting process failed");
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, (void **)&userfaults))
+ err("pthread_join()");
+
+ if (userfaults)
+ uffd_test_fail("Signal test failed, userfaults: %ld", userfaults);
+ else
+ uffd_test_pass();
+}
+
+static void uffd_sigbus_test(uffd_test_args_t *args)
+{
+ uffd_sigbus_test_common(false);
+}
+
+static void uffd_sigbus_wp_test(uffd_test_args_t *args)
+{
+ uffd_sigbus_test_common(true);
+}
+
+static void uffd_events_test_common(bool wp)
+{
+ pthread_t uffd_mon;
+ pid_t pid;
+ int err;
+ char c;
+ struct uffd_args args = { 0 };
+
+ ready_for_fork = false;
+
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ true, wp, false))
+ err("register failure");
+
+ args.apply_wp = wp;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ while (!ready_for_fork)
+ ; /* Wait for the poll_thread to start executing before forking */
+
+ pid = fork();
+ if (pid < 0)
+ err("fork");
+
+ if (!pid)
+ exit(faulting_process(0, wp));
+
+ waitpid(pid, &err, 0);
+ if (err)
+ err("faulting process failed");
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("pthread_join()");
+
+ if (args.missing_faults != nr_pages)
+ uffd_test_fail("Fault counts wrong");
+ else
+ uffd_test_pass();
+}
+
+static void uffd_events_test(uffd_test_args_t *args)
+{
+ uffd_events_test_common(false);
+}
+
+static void uffd_events_wp_test(uffd_test_args_t *args)
+{
+ uffd_events_test_common(true);
+}
+
+static void retry_uffdio_zeropage(int ufd,
+ struct uffdio_zeropage *uffdio_zeropage)
+{
+ uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
+ uffdio_zeropage->range.len,
+ 0);
+ if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
+ if (uffdio_zeropage->zeropage != -EEXIST)
+ err("UFFDIO_ZEROPAGE error: %"PRId64,
+ (int64_t)uffdio_zeropage->zeropage);
+ } else {
+ err("UFFDIO_ZEROPAGE error: %"PRId64,
+ (int64_t)uffdio_zeropage->zeropage);
+ }
+}
+
+static bool do_uffdio_zeropage(int ufd, bool has_zeropage)
+{
+ struct uffdio_zeropage uffdio_zeropage = { 0 };
+ int ret;
+ __s64 res;
+
+ uffdio_zeropage.range.start = (unsigned long) area_dst;
+ uffdio_zeropage.range.len = page_size;
+ uffdio_zeropage.mode = 0;
+ ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
+ res = uffdio_zeropage.zeropage;
+ if (ret) {
+ /* real retval in ufdio_zeropage.zeropage */
+ if (has_zeropage)
+ err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
+ else if (res != -EINVAL)
+ err("UFFDIO_ZEROPAGE not -EINVAL");
+ } else if (has_zeropage) {
+ if (res != page_size)
+ err("UFFDIO_ZEROPAGE unexpected size");
+ else
+ retry_uffdio_zeropage(ufd, &uffdio_zeropage);
+ return true;
+ } else
+ err("UFFDIO_ZEROPAGE succeeded");
+
+ return false;
+}
+
+/*
+ * Registers a range with MISSING mode only for zeropage test. Return true
+ * if UFFDIO_ZEROPAGE supported, false otherwise. Can't use uffd_register()
+ * because we want to detect .ioctls along the way.
+ */
+static bool
+uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len)
+{
+ uint64_t ioctls = 0;
+
+ if (uffd_register_with_ioctls(uffd, addr, len, true,
+ false, false, &ioctls))
+ err("zeropage register fail");
+
+ return ioctls & (1 << _UFFDIO_ZEROPAGE);
+}
+
+/* exercise UFFDIO_ZEROPAGE */
+static void uffd_zeropage_test(uffd_test_args_t *args)
+{
+ bool has_zeropage;
+ int i;
+
+ has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size);
+ if (area_dst_alias)
+ /* Ignore the retval; we already have it */
+ uffd_register_detect_zeropage(uffd, area_dst_alias, page_size);
+
+ if (do_uffdio_zeropage(uffd, has_zeropage))
+ for (i = 0; i < page_size; i++)
+ if (area_dst[i] != 0)
+ err("data non-zero at offset %d\n", i);
+
+ if (uffd_unregister(uffd, area_dst, page_size))
+ err("unregister");
+
+ if (area_dst_alias && uffd_unregister(uffd, area_dst_alias, page_size))
+ err("unregister");
+
+ uffd_test_pass();
+}
+
+static void uffd_register_poison(int uffd, void *addr, uint64_t len)
+{
+ uint64_t ioctls = 0;
+ uint64_t expected = (1 << _UFFDIO_COPY) | (1 << _UFFDIO_POISON);
+
+ if (uffd_register_with_ioctls(uffd, addr, len, true,
+ false, false, &ioctls))
+ err("poison register fail");
+
+ if ((ioctls & expected) != expected)
+ err("registered area doesn't support COPY and POISON ioctls");
+}
+
+static void do_uffdio_poison(int uffd, unsigned long offset)
+{
+ struct uffdio_poison uffdio_poison = { 0 };
+ int ret;
+ __s64 res;
+
+ uffdio_poison.range.start = (unsigned long) area_dst + offset;
+ uffdio_poison.range.len = page_size;
+ uffdio_poison.mode = 0;
+ ret = ioctl(uffd, UFFDIO_POISON, &uffdio_poison);
+ res = uffdio_poison.updated;
+
+ if (ret)
+ err("UFFDIO_POISON error: %"PRId64, (int64_t)res);
+ else if (res != page_size)
+ err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res);
+}
+
+static void uffd_poison_handle_fault(
+ struct uffd_msg *msg, struct uffd_args *args)
+{
+ unsigned long offset;
+
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ err("unexpected msg event %u", msg->event);
+
+ if (msg->arg.pagefault.flags &
+ (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR))
+ err("unexpected fault type %llu", msg->arg.pagefault.flags);
+
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+
+ /* Odd pages -> copy zeroed page; even pages -> poison. */
+ if (offset & page_size)
+ copy_page(uffd, offset, false);
+ else
+ do_uffdio_poison(uffd, offset);
+}
+
+static void uffd_poison_test(uffd_test_args_t *targs)
+{
+ pthread_t uffd_mon;
+ char c;
+ struct uffd_args args = { 0 };
+ struct sigaction act = { 0 };
+ unsigned long nr_sigbus = 0;
+ unsigned long nr;
+
+ fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+ uffd_register_poison(uffd, area_dst, nr_pages * page_size);
+ memset(area_src, 0, nr_pages * page_size);
+
+ args.handle_fault = uffd_poison_handle_fault;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ sigbuf = &jbuf;
+ act.sa_sigaction = sighndl;
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGBUS, &act, 0))
+ err("sigaction");
+
+ for (nr = 0; nr < nr_pages; ++nr) {
+ unsigned long offset = nr * page_size;
+ const char *bytes = (const char *) area_dst + offset;
+ const char *i;
+
+ if (sigsetjmp(*sigbuf, 1)) {
+ /*
+ * Access below triggered a SIGBUS, which was caught by
+ * sighndl, which then jumped here. Count this SIGBUS,
+ * and move on to next page.
+ */
+ ++nr_sigbus;
+ continue;
+ }
+
+ for (i = bytes; i < bytes + page_size; ++i) {
+ if (*i)
+ err("nonzero byte in area_dst (%p) at %p: %u",
+ area_dst, i, *i);
+ }
+ }
+
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("pthread_join()");
+
+ if (nr_sigbus != nr_pages / 2)
+ err("expected to receive %lu SIGBUS, actually received %lu",
+ nr_pages / 2, nr_sigbus);
+
+ uffd_test_pass();
+}
+
+static void
+uffd_move_handle_fault_common(struct uffd_msg *msg, struct uffd_args *args,
+ unsigned long len)
+{
+ unsigned long offset;
+
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ err("unexpected msg event %u", msg->event);
+
+ if (msg->arg.pagefault.flags &
+ (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR | UFFD_PAGEFAULT_FLAG_WRITE))
+ err("unexpected fault type %llu", msg->arg.pagefault.flags);
+
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(len-1);
+
+ if (move_page(uffd, offset, len))
+ args->missing_faults++;
+}
+
+static void uffd_move_handle_fault(struct uffd_msg *msg,
+ struct uffd_args *args)
+{
+ uffd_move_handle_fault_common(msg, args, page_size);
+}
+
+static void uffd_move_pmd_handle_fault(struct uffd_msg *msg,
+ struct uffd_args *args)
+{
+ uffd_move_handle_fault_common(msg, args, read_pmd_pagesize());
+}
+
+static void
+uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size,
+ void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args))
+{
+ unsigned long nr;
+ pthread_t uffd_mon;
+ char c;
+ unsigned long long count;
+ struct uffd_args args = { 0 };
+ char *orig_area_src = NULL, *orig_area_dst = NULL;
+ unsigned long step_size, step_count;
+ unsigned long src_offs = 0;
+ unsigned long dst_offs = 0;
+
+ /* Prevent source pages from being mapped more than once */
+ if (madvise(area_src, nr_pages * page_size, MADV_DONTFORK))
+ err("madvise(MADV_DONTFORK) failure");
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ true, false, false))
+ err("register failure");
+
+ args.handle_fault = handle_fault;
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
+ err("uffd_poll_thread create");
+
+ step_size = chunk_size / page_size;
+ step_count = nr_pages / step_size;
+
+ if (chunk_size > page_size) {
+ char *aligned_src = ALIGN_UP(area_src, chunk_size);
+ char *aligned_dst = ALIGN_UP(area_dst, chunk_size);
+
+ if (aligned_src != area_src || aligned_dst != area_dst) {
+ src_offs = (aligned_src - area_src) / page_size;
+ dst_offs = (aligned_dst - area_dst) / page_size;
+ step_count--;
+ }
+ orig_area_src = area_src;
+ orig_area_dst = area_dst;
+ area_src = aligned_src;
+ area_dst = aligned_dst;
+ }
+
+ /*
+ * Read each of the pages back using the UFFD-registered mapping. We
+ * expect that the first time we touch a page, it will result in a missing
+ * fault. uffd_poll_thread will resolve the fault by moving source
+ * page to destination.
+ */
+ for (nr = 0; nr < step_count * step_size; nr += step_size) {
+ unsigned long i;
+
+ /* Check area_src content */
+ for (i = 0; i < step_size; i++) {
+ count = *area_count(area_src, nr + i);
+ if (count != count_verify[src_offs + nr + i])
+ err("nr %lu source memory invalid %llu %llu\n",
+ nr + i, count, count_verify[src_offs + nr + i]);
+ }
+
+ /* Faulting into area_dst should move the page or the huge page */
+ for (i = 0; i < step_size; i++) {
+ count = *area_count(area_dst, nr + i);
+ if (count != count_verify[dst_offs + nr + i])
+ err("nr %lu memory corruption %llu %llu\n",
+ nr, count, count_verify[dst_offs + nr + i]);
+ }
+
+ /* Re-check area_src content which should be empty */
+ for (i = 0; i < step_size; i++) {
+ count = *area_count(area_src, nr + i);
+ if (count != 0)
+ err("nr %lu move failed %llu %llu\n",
+ nr, count, count_verify[src_offs + nr + i]);
+ }
+ }
+ if (chunk_size > page_size) {
+ area_src = orig_area_src;
+ area_dst = orig_area_dst;
+ }
+
+ if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("join() failed");
+
+ if (args.missing_faults != step_count || args.minor_faults != 0)
+ uffd_test_fail("stats check error");
+ else
+ uffd_test_pass();
+}
+
+static void uffd_move_test(uffd_test_args_t *targs)
+{
+ uffd_move_test_common(targs, page_size, uffd_move_handle_fault);
+}
+
+static void uffd_move_pmd_test(uffd_test_args_t *targs)
+{
+ if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
+ err("madvise(MADV_HUGEPAGE) failure");
+ uffd_move_test_common(targs, read_pmd_pagesize(),
+ uffd_move_pmd_handle_fault);
+}
+
+static void uffd_move_pmd_split_test(uffd_test_args_t *targs)
+{
+ if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
+ err("madvise(MADV_NOHUGEPAGE) failure");
+ uffd_move_test_common(targs, read_pmd_pagesize(),
+ uffd_move_pmd_handle_fault);
+}
+
+static bool
+uffdio_verify_results(const char *name, int ret, int error, long result)
+{
+ /*
+ * Should always return -1 with errno=EAGAIN, with corresponding
+ * result field updated in ioctl() args to be -EAGAIN too
+ * (e.g. copy.copy field for UFFDIO_COPY).
+ */
+ if (ret != -1) {
+ uffd_test_fail("%s should have returned -1", name);
+ return false;
+ }
+
+ if (error != EAGAIN) {
+ uffd_test_fail("%s should have errno==EAGAIN", name);
+ return false;
+ }
+
+ if (result != -EAGAIN) {
+ uffd_test_fail("%s should have been updated for -EAGAIN",
+ name);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * This defines a function to test one ioctl. Note that here "field" can
+ * be 1 or anything not -EAGAIN. With that initial value set, we can
+ * verify later that it should be updated by kernel (when -EAGAIN
+ * returned), by checking whether it is also updated to -EAGAIN.
+ */
+#define DEFINE_MMAP_CHANGING_TEST(name, ioctl_name, field) \
+ static bool uffdio_mmap_changing_test_##name(int fd) \
+ { \
+ int ret; \
+ struct uffdio_##name args = { \
+ .field = 1, \
+ }; \
+ ret = ioctl(fd, ioctl_name, &args); \
+ return uffdio_verify_results(#ioctl_name, ret, errno, args.field); \
+ }
+
+DEFINE_MMAP_CHANGING_TEST(zeropage, UFFDIO_ZEROPAGE, zeropage)
+DEFINE_MMAP_CHANGING_TEST(copy, UFFDIO_COPY, copy)
+DEFINE_MMAP_CHANGING_TEST(move, UFFDIO_MOVE, move)
+DEFINE_MMAP_CHANGING_TEST(poison, UFFDIO_POISON, updated)
+DEFINE_MMAP_CHANGING_TEST(continue, UFFDIO_CONTINUE, mapped)
+
+typedef enum {
+ /* We actually do not care about any state except UNINTERRUPTIBLE.. */
+ THR_STATE_UNKNOWN = 0,
+ THR_STATE_UNINTERRUPTIBLE,
+} thread_state;
+
+static void sleep_short(void)
+{
+ usleep(1000);
+}
+
+static thread_state thread_state_get(pid_t tid)
+{
+ const char *header = "State:\t";
+ char tmp[256], *p, c;
+ FILE *fp;
+
+ snprintf(tmp, sizeof(tmp), "/proc/%d/status", tid);
+ fp = fopen(tmp, "r");
+
+ if (!fp)
+ return THR_STATE_UNKNOWN;
+
+ while (fgets(tmp, sizeof(tmp), fp)) {
+ p = strstr(tmp, header);
+ if (p) {
+ /* For example, "State:\tD (disk sleep)" */
+ c = *(p + sizeof(header) - 1);
+ return c == 'D' ?
+ THR_STATE_UNINTERRUPTIBLE : THR_STATE_UNKNOWN;
+ }
+ }
+
+ return THR_STATE_UNKNOWN;
+}
+
+static void thread_state_until(pid_t tid, thread_state state)
+{
+ thread_state s;
+
+ do {
+ s = thread_state_get(tid);
+ sleep_short();
+ } while (s != state);
+}
+
+static void *uffd_mmap_changing_thread(void *opaque)
+{
+ volatile pid_t *pid = opaque;
+ int ret;
+
+ /* Unfortunately, it's only fetch-able from the thread itself.. */
+ assert(*pid == 0);
+ *pid = syscall(SYS_gettid);
+
+ /* Inject an event, this will hang solid until the event read */
+ ret = madvise(area_dst, page_size, MADV_REMOVE);
+ if (ret)
+ err("madvise(MADV_REMOVE) failed");
+
+ return NULL;
+}
+
+static void uffd_consume_message(int fd)
+{
+ struct uffd_msg msg = { 0 };
+
+ while (uffd_read_msg(fd, &msg));
+}
+
+static void uffd_mmap_changing_test(uffd_test_args_t *targs)
+{
+ /*
+ * This stores the real PID (which can be different from how tid is
+ * defined..) for the child thread, 0 means not initialized.
+ */
+ pid_t pid = 0;
+ pthread_t tid;
+ int ret;
+
+ if (uffd_register(uffd, area_dst, nr_pages * page_size,
+ true, false, false))
+ err("uffd_register() failed");
+
+ /* Create a thread to generate the racy event */
+ ret = pthread_create(&tid, NULL, uffd_mmap_changing_thread, &pid);
+ if (ret)
+ err("pthread_create() failed");
+
+ /*
+ * Wait until the thread setup the pid. Use volatile to make sure
+ * it reads from RAM not regs.
+ */
+ while (!(volatile pid_t)pid)
+ sleep_short();
+
+ /* Wait until the thread hangs at REMOVE event */
+ thread_state_until(pid, THR_STATE_UNINTERRUPTIBLE);
+
+ if (!uffdio_mmap_changing_test_copy(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_zeropage(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_move(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_poison(uffd))
+ return;
+
+ if (!uffdio_mmap_changing_test_continue(uffd))
+ return;
+
+ /*
+ * All succeeded above! Recycle everything. Start by reading the
+ * event so as to kick the thread roll again..
+ */
+ uffd_consume_message(uffd);
+
+ ret = pthread_join(tid, NULL);
+ assert(ret == 0);
+
+ uffd_test_pass();
+}
+
+static int prevent_hugepages(const char **errmsg)
+{
+ /* This should be done before source area is populated */
+ if (madvise(area_src, nr_pages * page_size, MADV_NOHUGEPAGE)) {
+ /* Ignore only if CONFIG_TRANSPARENT_HUGEPAGE=n */
+ if (errno != EINVAL) {
+ if (errmsg)
+ *errmsg = "madvise(MADV_NOHUGEPAGE) failed";
+ return -errno;
+ }
+ }
+ return 0;
+}
+
+static int request_hugepages(const char **errmsg)
+{
+ /* This should be done before source area is populated */
+ if (madvise(area_src, nr_pages * page_size, MADV_HUGEPAGE)) {
+ if (errmsg) {
+ *errmsg = (errno == EINVAL) ?
+ "CONFIG_TRANSPARENT_HUGEPAGE is not set" :
+ "madvise(MADV_HUGEPAGE) failed";
+ }
+ return -errno;
+ }
+ return 0;
+}
+
+struct uffd_test_case_ops uffd_move_test_case_ops = {
+ .post_alloc = prevent_hugepages,
+};
+
+struct uffd_test_case_ops uffd_move_test_pmd_case_ops = {
+ .post_alloc = request_hugepages,
+};
+
+/*
+ * Test the returned uffdio_register.ioctls with different register modes.
+ * Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test.
+ */
+static void
+do_register_ioctls_test(uffd_test_args_t *args, bool miss, bool wp, bool minor)
+{
+ uint64_t ioctls = 0, expected = BIT_ULL(_UFFDIO_WAKE);
+ mem_type_t *mem_type = args->mem_type;
+ int ret;
+
+ ret = uffd_register_with_ioctls(uffd, area_dst, page_size,
+ miss, wp, minor, &ioctls);
+
+ /*
+ * Handle special cases of UFFDIO_REGISTER here where it should
+ * just fail with -EINVAL first..
+ *
+ * Case 1: register MINOR on anon
+ * Case 2: register with no mode selected
+ */
+ if ((minor && (mem_type->mem_flag == MEM_ANON)) ||
+ (!miss && !wp && !minor)) {
+ if (ret != -EINVAL)
+ err("register (miss=%d, wp=%d, minor=%d) failed "
+ "with wrong errno=%d", miss, wp, minor, ret);
+ return;
+ }
+
+ /* UFFDIO_REGISTER should succeed, then check ioctls returned */
+ if (miss)
+ expected |= BIT_ULL(_UFFDIO_COPY);
+ if (wp)
+ expected |= BIT_ULL(_UFFDIO_WRITEPROTECT);
+ if (minor)
+ expected |= BIT_ULL(_UFFDIO_CONTINUE);
+
+ if ((ioctls & expected) != expected)
+ err("unexpected uffdio_register.ioctls "
+ "(miss=%d, wp=%d, minor=%d): expected=0x%"PRIx64", "
+ "returned=0x%"PRIx64, miss, wp, minor, expected, ioctls);
+
+ if (uffd_unregister(uffd, area_dst, page_size))
+ err("unregister");
+}
+
+static void uffd_register_ioctls_test(uffd_test_args_t *args)
+{
+ int miss, wp, minor;
+
+ for (miss = 0; miss <= 1; miss++)
+ for (wp = 0; wp <= 1; wp++)
+ for (minor = 0; minor <= 1; minor++)
+ do_register_ioctls_test(args, miss, wp, minor);
+
+ uffd_test_pass();
+}
+
+uffd_test_case_t uffd_tests[] = {
+ {
+ /* Test returned uffdio_register.ioctls. */
+ .name = "register-ioctls",
+ .uffd_fn = uffd_register_ioctls_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_MISSING_HUGETLBFS |
+ UFFD_FEATURE_MISSING_SHMEM |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
+ UFFD_FEATURE_MINOR_HUGETLBFS |
+ UFFD_FEATURE_MINOR_SHMEM,
+ },
+ {
+ .name = "zeropage",
+ .uffd_fn = uffd_zeropage_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = 0,
+ },
+ {
+ .name = "move",
+ .uffd_fn = uffd_move_test,
+ .mem_targets = MEM_ANON,
+ .uffd_feature_required = UFFD_FEATURE_MOVE,
+ .test_case_ops = &uffd_move_test_case_ops,
+ },
+ {
+ .name = "move-pmd",
+ .uffd_fn = uffd_move_pmd_test,
+ .mem_targets = MEM_ANON,
+ .uffd_feature_required = UFFD_FEATURE_MOVE,
+ .test_case_ops = &uffd_move_test_pmd_case_ops,
+ },
+ {
+ .name = "move-pmd-split",
+ .uffd_fn = uffd_move_pmd_split_test,
+ .mem_targets = MEM_ANON,
+ .uffd_feature_required = UFFD_FEATURE_MOVE,
+ .test_case_ops = &uffd_move_test_pmd_case_ops,
+ },
+ {
+ .name = "wp-fork",
+ .uffd_fn = uffd_wp_fork_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
+ {
+ .name = "wp-fork-with-event",
+ .uffd_fn = uffd_wp_fork_with_event_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
+ /* when set, child process should inherit uffd-wp bits */
+ UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "wp-fork-pin",
+ .uffd_fn = uffd_wp_fork_pin_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
+ {
+ .name = "wp-fork-pin-with-event",
+ .uffd_fn = uffd_wp_fork_pin_with_event_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
+ /* when set, child process should inherit uffd-wp bits */
+ UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "wp-unpopulated",
+ .uffd_fn = uffd_wp_unpopulated_test,
+ .mem_targets = MEM_ANON,
+ .uffd_feature_required =
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_WP_UNPOPULATED,
+ },
+ {
+ .name = "minor",
+ .uffd_fn = uffd_minor_test,
+ .mem_targets = MEM_SHMEM | MEM_HUGETLB,
+ .uffd_feature_required =
+ UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM,
+ },
+ {
+ .name = "minor-wp",
+ .uffd_fn = uffd_minor_wp_test,
+ .mem_targets = MEM_SHMEM | MEM_HUGETLB,
+ .uffd_feature_required =
+ UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ /*
+ * HACK: here we leveraged WP_UNPOPULATED to detect whether
+ * minor mode supports wr-protect. There's no feature flag
+ * for it so this is the best we can test against.
+ */
+ UFFD_FEATURE_WP_UNPOPULATED,
+ },
+ {
+ .name = "minor-collapse",
+ .uffd_fn = uffd_minor_collapse_test,
+ /* MADV_COLLAPSE only works with shmem */
+ .mem_targets = MEM_SHMEM,
+ /* We can't test MADV_COLLAPSE, so try our luck */
+ .uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM,
+ },
+ {
+ .name = "sigbus",
+ .uffd_fn = uffd_sigbus_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_SIGBUS |
+ UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "sigbus-wp",
+ .uffd_fn = uffd_sigbus_wp_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_SIGBUS |
+ UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
+ {
+ .name = "events",
+ .uffd_fn = uffd_events_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_EVENT_FORK |
+ UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE,
+ },
+ {
+ .name = "events-wp",
+ .uffd_fn = uffd_events_wp_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_EVENT_FORK |
+ UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
+ {
+ .name = "poison",
+ .uffd_fn = uffd_poison_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_POISON,
+ },
+ {
+ .name = "mmap-changing",
+ .uffd_fn = uffd_mmap_changing_test,
+ /*
+ * There's no point running this test over all mem types as
+ * they share the same code paths.
+ *
+ * Choose shmem for simplicity, because (1) shmem supports
+ * MINOR mode to cover UFFDIO_CONTINUE, and (2) shmem is
+ * almost always available (unlike hugetlb). Here we
+ * abused SHMEM for UFFDIO_MOVE, but the test we want to
+ * cover doesn't yet need the correct memory type..
+ */
+ .mem_targets = MEM_SHMEM,
+ /*
+ * Any UFFD_FEATURE_EVENT_* should work to trigger the
+ * race logically, but choose the simplest (REMOVE).
+ *
+ * Meanwhile, since we'll cover quite a few new ioctl()s
+ * (CONTINUE, POISON, MOVE), skip this test for old kernels
+ * by choosing all of them.
+ */
+ .uffd_feature_required = UFFD_FEATURE_EVENT_REMOVE |
+ UFFD_FEATURE_MOVE | UFFD_FEATURE_POISON |
+ UFFD_FEATURE_MINOR_SHMEM,
+ },
+};
+
+static void usage(const char *prog)
+{
+ printf("usage: %s [-f TESTNAME]\n", prog);
+ puts("");
+ puts(" -f: test name to filter (e.g., event)");
+ puts(" -h: show the help msg");
+ puts(" -l: list tests only");
+ puts("");
+ exit(KSFT_FAIL);
+}
+
+int main(int argc, char *argv[])
+{
+ int n_tests = sizeof(uffd_tests) / sizeof(uffd_test_case_t);
+ int n_mems = sizeof(mem_types) / sizeof(mem_type_t);
+ const char *test_filter = NULL;
+ bool list_only = false;
+ uffd_test_case_t *test;
+ mem_type_t *mem_type;
+ uffd_test_args_t args;
+ const char *errmsg;
+ int has_uffd, opt;
+ int i, j;
+
+ while ((opt = getopt(argc, argv, "f:hl")) != -1) {
+ switch (opt) {
+ case 'f':
+ test_filter = optarg;
+ break;
+ case 'l':
+ list_only = true;
+ break;
+ case 'h':
+ default:
+ /* Unknown */
+ usage(argv[0]);
+ break;
+ }
+ }
+
+ if (!test_filter && !list_only) {
+ has_uffd = test_uffd_api(false);
+ has_uffd |= test_uffd_api(true);
+
+ if (!has_uffd) {
+ printf("Userfaultfd not supported or unprivileged, skip all tests\n");
+ exit(KSFT_SKIP);
+ }
+ }
+
+ for (i = 0; i < n_tests; i++) {
+ test = &uffd_tests[i];
+ if (test_filter && !strstr(test->name, test_filter))
+ continue;
+ if (list_only) {
+ printf("%s\n", test->name);
+ continue;
+ }
+ for (j = 0; j < n_mems; j++) {
+ mem_type = &mem_types[j];
+ if (!(test->mem_targets & mem_type->mem_flag))
+ continue;
+
+ uffd_test_start("%s on %s", test->name, mem_type->name);
+ if ((mem_type->mem_flag == MEM_HUGETLB ||
+ mem_type->mem_flag == MEM_HUGETLB_PRIVATE) &&
+ (default_huge_page_size() == 0)) {
+ uffd_test_skip("huge page size is 0, feature missing?");
+ continue;
+ }
+ if (!uffd_feature_supported(test)) {
+ uffd_test_skip("feature missing");
+ continue;
+ }
+ if (uffd_setup_environment(&args, test, mem_type,
+ &errmsg)) {
+ uffd_test_skip(errmsg);
+ continue;
+ }
+ test->uffd_fn(&args);
+ uffd_test_ctx_clear();
+ }
+ }
+
+ if (!list_only)
+ uffd_test_report();
+
+ return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS;
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+ printf("Skipping %s (missing __NR_userfaultfd)\n", __file__);
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c
new file mode 100644
index 000000000000..c2ba7d46c7b4
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-wp-mremap.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include <stdbool.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include "../kselftest.h"
+#include "thp_settings.h"
+#include "uffd-common.h"
+
+static int pagemap_fd;
+static size_t pagesize;
+static int nr_pagesizes = 1;
+static int nr_thpsizes;
+static size_t thpsizes[20];
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+
+static int sz2ord(size_t size)
+{
+ return __builtin_ctzll(size / pagesize);
+}
+
+static int detect_thp_sizes(size_t sizes[], int max)
+{
+ int count = 0;
+ unsigned long orders;
+ size_t kb;
+ int i;
+
+ /* thp not supported at all. */
+ if (!read_pmd_pagesize())
+ return 0;
+
+ orders = thp_supported_orders();
+
+ for (i = 0; orders && count < max; i++) {
+ if (!(orders & (1UL << i)))
+ continue;
+ orders &= ~(1UL << i);
+ kb = (pagesize >> 10) << i;
+ sizes[count++] = kb * 1024;
+ ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
+ }
+
+ return count;
+}
+
+static void *mmap_aligned(size_t size, int prot, int flags)
+{
+ size_t mmap_size = size * 2;
+ char *mmap_mem, *mem;
+
+ mmap_mem = mmap(NULL, mmap_size, prot, flags, -1, 0);
+ if (mmap_mem == MAP_FAILED)
+ return mmap_mem;
+
+ mem = (char *)(((uintptr_t)mmap_mem + size - 1) & ~(size - 1));
+ munmap(mmap_mem, mem - mmap_mem);
+ munmap(mem + size, mmap_mem + mmap_size - mem - size);
+
+ return mem;
+}
+
+static void *alloc_one_folio(size_t size, bool private, bool hugetlb)
+{
+ bool thp = !hugetlb && size > pagesize;
+ int flags = MAP_ANONYMOUS;
+ int prot = PROT_READ | PROT_WRITE;
+ char *mem, *addr;
+
+ assert((size & (size - 1)) == 0);
+
+ if (private)
+ flags |= MAP_PRIVATE;
+ else
+ flags |= MAP_SHARED;
+
+ /*
+ * For THP, we must explicitly enable the THP size, allocate twice the
+ * required space then manually align.
+ */
+ if (thp) {
+ struct thp_settings settings = *thp_current_settings();
+
+ if (private)
+ settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
+ else
+ settings.shmem_hugepages[sz2ord(size)].enabled = SHMEM_ALWAYS;
+
+ thp_push_settings(&settings);
+
+ mem = mmap_aligned(size, prot, flags);
+ } else {
+ if (hugetlb) {
+ flags |= MAP_HUGETLB;
+ flags |= __builtin_ctzll(size) << MAP_HUGE_SHIFT;
+ }
+
+ mem = mmap(NULL, size, prot, flags, -1, 0);
+ }
+
+ if (mem == MAP_FAILED) {
+ mem = NULL;
+ goto out;
+ }
+
+ assert(((uintptr_t)mem & (size - 1)) == 0);
+
+ /*
+ * Populate the folio by writing the first byte and check that all pages
+ * are populated. Finally set the whole thing to non-zero data to avoid
+ * kernel from mapping it back to the zero page.
+ */
+ mem[0] = 1;
+ for (addr = mem; addr < mem + size; addr += pagesize) {
+ if (!pagemap_is_populated(pagemap_fd, addr)) {
+ munmap(mem, size);
+ mem = NULL;
+ goto out;
+ }
+ }
+ memset(mem, 1, size);
+out:
+ if (thp)
+ thp_pop_settings();
+
+ return mem;
+}
+
+static bool check_uffd_wp_state(void *mem, size_t size, bool expect)
+{
+ uint64_t pte;
+ void *addr;
+
+ for (addr = mem; addr < mem + size; addr += pagesize) {
+ pte = pagemap_get_entry(pagemap_fd, addr);
+ if (!!(pte & PM_UFFD_WP) != expect) {
+ ksft_test_result_fail("uffd-wp not %s for pte %lu!\n",
+ expect ? "set" : "clear",
+ (addr - mem) / pagesize);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool range_is_swapped(void *addr, size_t size)
+{
+ for (; size; addr += pagesize, size -= pagesize)
+ if (!pagemap_is_swapped(pagemap_fd, addr))
+ return false;
+ return true;
+}
+
+static void test_one_folio(size_t size, bool private, bool swapout, bool hugetlb)
+{
+ struct uffdio_writeprotect wp_prms;
+ uint64_t features = 0;
+ void *addr = NULL;
+ void *mem = NULL;
+
+ assert(!(hugetlb && swapout));
+
+ ksft_print_msg("[RUN] %s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n",
+ __func__,
+ size,
+ private ? "true" : "false",
+ swapout ? "true" : "false",
+ hugetlb ? "true" : "false");
+
+ /* Allocate a folio of required size and type. */
+ mem = alloc_one_folio(size, private, hugetlb);
+ if (!mem) {
+ ksft_test_result_fail("alloc_one_folio() failed\n");
+ goto out;
+ }
+
+ /* Register range for uffd-wp. */
+ if (userfaultfd_open(&features)) {
+ if (errno == ENOENT)
+ ksft_test_result_skip("userfaultfd not available\n");
+ else
+ ksft_test_result_fail("userfaultfd_open() failed\n");
+ goto out;
+ }
+ if (uffd_register(uffd, mem, size, false, true, false)) {
+ ksft_test_result_fail("uffd_register() failed\n");
+ goto out;
+ }
+ wp_prms.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+ wp_prms.range.start = (uintptr_t)mem;
+ wp_prms.range.len = size;
+ if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp_prms)) {
+ ksft_test_result_fail("ioctl(UFFDIO_WRITEPROTECT) failed\n");
+ goto out;
+ }
+
+ if (swapout) {
+ madvise(mem, size, MADV_PAGEOUT);
+ if (!range_is_swapped(mem, size)) {
+ ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+ goto out;
+ }
+ }
+
+ /* Check that uffd-wp is set for all PTEs in range. */
+ if (!check_uffd_wp_state(mem, size, true))
+ goto out;
+
+ /*
+ * Move the mapping to a new, aligned location. Since
+ * UFFD_FEATURE_EVENT_REMAP is not set, we expect the uffd-wp bit for
+ * each PTE to be cleared in the new mapping.
+ */
+ addr = mmap_aligned(size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS);
+ if (addr == MAP_FAILED) {
+ ksft_test_result_fail("mmap_aligned() failed\n");
+ goto out;
+ }
+ if (mremap(mem, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, addr) == MAP_FAILED) {
+ ksft_test_result_fail("mremap() failed\n");
+ munmap(addr, size);
+ goto out;
+ }
+ mem = addr;
+
+ /* Check that uffd-wp is cleared for all PTEs in range. */
+ if (!check_uffd_wp_state(mem, size, false))
+ goto out;
+
+ ksft_test_result_pass("%s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n",
+ __func__,
+ size,
+ private ? "true" : "false",
+ swapout ? "true" : "false",
+ hugetlb ? "true" : "false");
+out:
+ if (mem)
+ munmap(mem, size);
+ if (uffd >= 0) {
+ close(uffd);
+ uffd = -1;
+ }
+}
+
+struct testcase {
+ size_t *sizes;
+ int *nr_sizes;
+ bool private;
+ bool swapout;
+ bool hugetlb;
+};
+
+static const struct testcase testcases[] = {
+ /* base pages. */
+ {
+ .sizes = &pagesize,
+ .nr_sizes = &nr_pagesizes,
+ .private = false,
+ .swapout = false,
+ .hugetlb = false,
+ },
+ {
+ .sizes = &pagesize,
+ .nr_sizes = &nr_pagesizes,
+ .private = true,
+ .swapout = false,
+ .hugetlb = false,
+ },
+ {
+ .sizes = &pagesize,
+ .nr_sizes = &nr_pagesizes,
+ .private = false,
+ .swapout = true,
+ .hugetlb = false,
+ },
+ {
+ .sizes = &pagesize,
+ .nr_sizes = &nr_pagesizes,
+ .private = true,
+ .swapout = true,
+ .hugetlb = false,
+ },
+
+ /* thp. */
+ {
+ .sizes = thpsizes,
+ .nr_sizes = &nr_thpsizes,
+ .private = false,
+ .swapout = false,
+ .hugetlb = false,
+ },
+ {
+ .sizes = thpsizes,
+ .nr_sizes = &nr_thpsizes,
+ .private = true,
+ .swapout = false,
+ .hugetlb = false,
+ },
+ {
+ .sizes = thpsizes,
+ .nr_sizes = &nr_thpsizes,
+ .private = false,
+ .swapout = true,
+ .hugetlb = false,
+ },
+ {
+ .sizes = thpsizes,
+ .nr_sizes = &nr_thpsizes,
+ .private = true,
+ .swapout = true,
+ .hugetlb = false,
+ },
+
+ /* hugetlb. */
+ {
+ .sizes = hugetlbsizes,
+ .nr_sizes = &nr_hugetlbsizes,
+ .private = false,
+ .swapout = false,
+ .hugetlb = true,
+ },
+ {
+ .sizes = hugetlbsizes,
+ .nr_sizes = &nr_hugetlbsizes,
+ .private = true,
+ .swapout = false,
+ .hugetlb = true,
+ },
+};
+
+int main(int argc, char **argv)
+{
+ struct thp_settings settings;
+ int i, j, plan = 0;
+
+ pagesize = getpagesize();
+ nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
+ nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
+ ARRAY_SIZE(hugetlbsizes));
+
+ /* If THP is supported, save THP settings and initially disable THP. */
+ if (nr_thpsizes) {
+ thp_save_settings();
+ thp_read_settings(&settings);
+ for (i = 0; i < NR_ORDERS; i++) {
+ settings.hugepages[i].enabled = THP_NEVER;
+ settings.shmem_hugepages[i].enabled = SHMEM_NEVER;
+ }
+ thp_push_settings(&settings);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(testcases); i++)
+ plan += *testcases[i].nr_sizes;
+ ksft_set_plan(plan);
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+
+ for (i = 0; i < ARRAY_SIZE(testcases); i++) {
+ const struct testcase *tc = &testcases[i];
+
+ for (j = 0; j < *tc->nr_sizes; j++)
+ test_one_folio(tc->sizes[j], tc->private, tc->swapout,
+ tc->hugetlb);
+ }
+
+ /* If THP is supported, restore original THP settings. */
+ if (nr_thpsizes)
+ thp_restore_settings();
+
+ i = ksft_get_fail_cnt();
+ if (i)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ i, ksft_test_num());
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c
new file mode 100644
index 000000000000..896b3f73fc53
--- /dev/null
+++ b/tools/testing/selftests/mm/va_high_addr_switch.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ */
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#include "vm_util.h"
+#include "../kselftest.h"
+
+/*
+ * The hint addr value is used to allocate addresses
+ * beyond the high address switch boundary.
+ */
+
+#define ADDR_MARK_128TB (1UL << 47)
+#define ADDR_MARK_256TB (1UL << 48)
+
+#define HIGH_ADDR_128TB (1UL << 48)
+#define HIGH_ADDR_256TB (1UL << 49)
+
+struct testcase {
+ void *addr;
+ unsigned long size;
+ unsigned long flags;
+ const char *msg;
+ unsigned int low_addr_required:1;
+ unsigned int keep_mapped:1;
+};
+
+static struct testcase *testcases;
+static struct testcase *hugetlb_testcases;
+static int sz_testcases, sz_hugetlb_testcases;
+static unsigned long switch_hint;
+
+/* Initialize testcases inside a function to compute parameters at runtime */
+void testcases_init(void)
+{
+ unsigned long pagesize = getpagesize();
+ unsigned long hugepagesize = default_huge_page_size();
+ unsigned long low_addr = (1UL << 30);
+ unsigned long addr_switch_hint = ADDR_MARK_128TB;
+ unsigned long high_addr = HIGH_ADDR_128TB;
+
+#ifdef __aarch64__
+
+ /* Post LPA2, the lower userspace VA on a 16K pagesize is 47 bits. */
+ if (pagesize != (16UL << 10)) {
+ addr_switch_hint = ADDR_MARK_256TB;
+ high_addr = HIGH_ADDR_256TB;
+ }
+#endif
+
+ struct testcase t[] = {
+ {
+ /*
+ * If stack is moved, we could possibly allocate
+ * this at the requested address.
+ */
+ .addr = ((void *)(addr_switch_hint - pagesize)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, pagesize)",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * Unless MAP_FIXED is specified, allocation based on hint
+ * addr is never at requested address or above it, which is
+ * beyond high address switch boundary in this case. Instead,
+ * a suitable allocation is found in lower address space.
+ */
+ .addr = ((void *)(addr_switch_hint - pagesize)),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, (2 * pagesize))",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * Exact mapping at high address switch boundary, should
+ * be obtained even without MAP_FIXED as area is free.
+ */
+ .addr = ((void *)(addr_switch_hint)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint, pagesize)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED)",
+ },
+ {
+ .addr = NULL,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)low_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(low_addr)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(high_addr, MAP_FIXED)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1) again",
+ },
+ {
+ .addr = ((void *)(addr_switch_hint - pagesize)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, pagesize)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint - pagesize),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, 2 * pagesize)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint - pagesize / 2),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize/2 , 2 * pagesize)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = ((void *)(addr_switch_hint)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint, pagesize)",
+ },
+ {
+ .addr = (void *)(addr_switch_hint),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED)",
+ },
+ };
+
+ struct testcase ht[] = {
+ {
+ .addr = NULL,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)low_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(low_addr, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr, MAP_HUGETLB) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(high_addr, MAP_FIXED | MAP_HUGETLB)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB) again",
+ },
+ {
+ .addr = (void *)(addr_switch_hint - pagesize),
+ .size = 2 * hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, 2*hugepagesize, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint),
+ .size = 2 * hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB)",
+ },
+ };
+
+ testcases = malloc(sizeof(t));
+ hugetlb_testcases = malloc(sizeof(ht));
+
+ /* Copy into global arrays */
+ memcpy(testcases, t, sizeof(t));
+ memcpy(hugetlb_testcases, ht, sizeof(ht));
+
+ sz_testcases = ARRAY_SIZE(t);
+ sz_hugetlb_testcases = ARRAY_SIZE(ht);
+ switch_hint = addr_switch_hint;
+}
+
+static int run_test(struct testcase *test, int count)
+{
+ void *p;
+ int i, ret = KSFT_PASS;
+
+ for (i = 0; i < count; i++) {
+ struct testcase *t = test + i;
+
+ p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
+
+ printf("%s: %p - ", t->msg, p);
+
+ if (p == MAP_FAILED) {
+ printf("FAILED\n");
+ ret = KSFT_FAIL;
+ continue;
+ }
+
+ if (t->low_addr_required && p >= (void *)(switch_hint)) {
+ printf("FAILED\n");
+ ret = KSFT_FAIL;
+ } else {
+ /*
+ * Do a dereference of the address returned so that we catch
+ * bugs in page fault handling
+ */
+ memset(p, 0, t->size);
+ printf("OK\n");
+ }
+ if (!t->keep_mapped)
+ munmap(p, t->size);
+ }
+
+ return ret;
+}
+
+#ifdef __aarch64__
+/* Check if userspace VA > 48 bits */
+static int high_address_present(void)
+{
+ void *ptr = mmap((void *)(1UL << 50), 1, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (ptr == MAP_FAILED)
+ return 0;
+
+ munmap(ptr, 1);
+ return 1;
+}
+#endif
+
+static int supported_arch(void)
+{
+#if defined(__powerpc64__)
+ return 1;
+#elif defined(__x86_64__)
+ return 1;
+#elif defined(__aarch64__)
+ return high_address_present();
+#else
+ return 0;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+ int ret;
+
+ if (!supported_arch())
+ return KSFT_SKIP;
+
+ testcases_init();
+
+ ret = run_test(testcases, sz_testcases);
+ if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
+ ret = run_test(hugetlb_testcases, sz_hugetlb_testcases);
+ return ret;
+}
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
new file mode 100755
index 000000000000..325de53966b6
--- /dev/null
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2022 Adam Sindelar (Meta) <adam@wowsignal.io>
+#
+# This is a test for mmap behavior with 5-level paging. This script wraps the
+# real test to check that the kernel is configured to support at least 5
+# pagetable levels.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+skip()
+{
+ echo "$1"
+ exit $ksft_skip
+}
+
+check_supported_x86_64()
+{
+ local config="/proc/config.gz"
+ [[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
+ [[ -f "${config}" ]] || skip "Cannot find kernel config in /proc or /boot"
+
+ # gzip -dcfq automatically handles both compressed and plaintext input.
+ # See man 1 gzip under '-f'.
+ local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
+
+ local cpu_supports_pl5=$(awk '/^flags/ {if (/la57/) {print 0;}
+ else {print 1}; exit}' /proc/cpuinfo 2>/dev/null)
+
+ if [[ "${pg_table_levels}" -lt 5 ]]; then
+ skip "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
+ elif [[ "${cpu_supports_pl5}" -ne 0 ]]; then
+ skip "$0: CPU does not have the necessary la57 flag to support page table level 5"
+ fi
+}
+
+check_supported_ppc64()
+{
+ local config="/proc/config.gz"
+ [[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
+ [[ -f "${config}" ]] || skip "Cannot find kernel config in /proc or /boot"
+
+ local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
+ if [[ "${pg_table_levels}" -lt 5 ]]; then
+ skip "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
+ fi
+
+ local mmu_support=$(grep -m1 "mmu" /proc/cpuinfo | awk '{print $3}')
+ if [[ "$mmu_support" != "radix" ]]; then
+ skip "$0: System does not use Radix MMU, required for 5-level paging"
+ fi
+
+ local hugepages_total=$(awk '/HugePages_Total/ {print $2}' /proc/meminfo)
+ if [[ "${hugepages_total}" -eq 0 ]]; then
+ skip "$0: HugePages are not enabled, required for some tests"
+ fi
+}
+
+check_test_requirements()
+{
+ # The test supports x86_64 and powerpc64. We currently have no useful
+ # eligibility check for powerpc64, and the test itself will reject other
+ # architectures.
+ case `uname -m` in
+ "x86_64")
+ check_supported_x86_64
+ ;;
+ "ppc64le"|"ppc64")
+ check_supported_ppc64
+ ;;
+ *)
+ return 0
+ ;;
+ esac
+}
+
+check_test_requirements
+./va_high_addr_switch --run-hugetlb
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
new file mode 100644
index 000000000000..b380e102b22f
--- /dev/null
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017, Anshuman Khandual, IBM Corp.
+ *
+ * Works on architectures which support 128TB virtual
+ * address range and beyond.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/prctl.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <fcntl.h>
+
+#include "vm_util.h"
+#include "../kselftest.h"
+
+/*
+ * Maximum address range mapped with a single mmap()
+ * call is little bit more than 1GB. Hence 1GB is
+ * chosen as the single chunk size for address space
+ * mapping.
+ */
+
+#define SZ_1GB (1024 * 1024 * 1024UL)
+#define SZ_1TB (1024 * 1024 * 1024 * 1024UL)
+
+#define MAP_CHUNK_SIZE SZ_1GB
+
+/*
+ * Address space till 128TB is mapped without any hint
+ * and is enabled by default. Address space beyond 128TB
+ * till 512TB is obtained by passing hint address as the
+ * first argument into mmap() system call.
+ *
+ * The process heap address space is divided into two
+ * different areas one below 128TB and one above 128TB
+ * till it reaches 512TB. One with size 128TB and the
+ * other being 384TB.
+ *
+ * On Arm64 the address space is 256TB and support for
+ * high mappings up to 4PB virtual address space has
+ * been added.
+ */
+
+#define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */
+#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL)
+#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL)
+#define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL)
+
+#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */
+#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */
+
+#ifdef __aarch64__
+#define HIGH_ADDR_MARK ADDR_MARK_256TB
+#define HIGH_ADDR_SHIFT 49
+#define NR_CHUNKS_LOW NR_CHUNKS_256TB
+#define NR_CHUNKS_HIGH NR_CHUNKS_3840TB
+#else
+#define HIGH_ADDR_MARK ADDR_MARK_128TB
+#define HIGH_ADDR_SHIFT 48
+#define NR_CHUNKS_LOW NR_CHUNKS_128TB
+#define NR_CHUNKS_HIGH NR_CHUNKS_384TB
+#endif
+
+static char *hint_addr(void)
+{
+ int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
+
+ return (char *) (1UL << bits);
+}
+
+static void validate_addr(char *ptr, int high_addr)
+{
+ unsigned long addr = (unsigned long) ptr;
+
+ if (high_addr && addr < HIGH_ADDR_MARK)
+ ksft_exit_fail_msg("Bad address %lx\n", addr);
+
+ if (addr > HIGH_ADDR_MARK)
+ ksft_exit_fail_msg("Bad address %lx\n", addr);
+}
+
+static void mark_range(char *ptr, size_t size)
+{
+ if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, "virtual_address_range") == -1) {
+ if (errno == EINVAL) {
+ /* Depends on CONFIG_ANON_VMA_NAME */
+ ksft_test_result_skip("prctl(PR_SET_VMA_ANON_NAME) not supported\n");
+ ksft_finished();
+ } else {
+ ksft_exit_fail_perror("prctl(PR_SET_VMA_ANON_NAME) failed\n");
+ }
+ }
+}
+
+static int is_marked_vma(const char *vma_name)
+{
+ return vma_name && !strcmp(vma_name, "[anon:virtual_address_range]\n");
+}
+
+static int validate_lower_address_hint(void)
+{
+ char *ptr;
+
+ ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
+ PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (ptr == MAP_FAILED)
+ return 0;
+
+ return 1;
+}
+
+static int validate_complete_va_space(void)
+{
+ unsigned long start_addr, end_addr, prev_end_addr;
+ char line[400];
+ char prot[6];
+ FILE *file;
+ int fd;
+
+ fd = open("va_dump", O_CREAT | O_WRONLY, 0600);
+ unlink("va_dump");
+ if (fd < 0) {
+ ksft_test_result_skip("cannot create or open dump file\n");
+ ksft_finished();
+ }
+
+ file = fopen("/proc/self/maps", "r");
+ if (file == NULL)
+ ksft_exit_fail_msg("cannot open /proc/self/maps\n");
+
+ prev_end_addr = 0;
+ while (fgets(line, sizeof(line), file)) {
+ const char *vma_name = NULL;
+ int vma_name_start = 0;
+ unsigned long hop;
+
+ if (sscanf(line, "%lx-%lx %4s %*s %*s %*s %n",
+ &start_addr, &end_addr, prot, &vma_name_start) != 3)
+ ksft_exit_fail_msg("cannot parse /proc/self/maps\n");
+
+ if (vma_name_start)
+ vma_name = line + vma_name_start;
+
+ /* end of userspace mappings; ignore vsyscall mapping */
+ if (start_addr & (1UL << 63))
+ return 0;
+
+ /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */
+ if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE)
+ return 1;
+
+ prev_end_addr = end_addr;
+
+ if (prot[0] != 'r')
+ continue;
+
+ if (check_vmflag_io((void *)start_addr))
+ continue;
+
+ /*
+ * Confirm whether MAP_CHUNK_SIZE chunk can be found or not.
+ * If write succeeds, no need to check MAP_CHUNK_SIZE - 1
+ * addresses after that. If the address was not held by this
+ * process, write would fail with errno set to EFAULT.
+ * Anyways, if write returns anything apart from 1, exit the
+ * program since that would mean a bug in /proc/self/maps.
+ */
+ hop = 0;
+ while (start_addr + hop < end_addr) {
+ if (write(fd, (void *)(start_addr + hop), 1) != 1)
+ return 1;
+ lseek(fd, 0, SEEK_SET);
+
+ if (is_marked_vma(vma_name))
+ munmap((char *)(start_addr + hop), MAP_CHUNK_SIZE);
+
+ hop += MAP_CHUNK_SIZE;
+ }
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ char *ptr[NR_CHUNKS_LOW];
+ char **hptr;
+ char *hint;
+ unsigned long i, lchunks, hchunks;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ for (i = 0; i < NR_CHUNKS_LOW; i++) {
+ ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (ptr[i] == MAP_FAILED) {
+ if (validate_lower_address_hint())
+ ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n");
+ break;
+ }
+
+ mark_range(ptr[i], MAP_CHUNK_SIZE);
+ validate_addr(ptr[i], 0);
+ }
+ lchunks = i;
+ hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *));
+ if (hptr == NULL) {
+ ksft_test_result_skip("Memory constraint not fulfilled\n");
+ ksft_finished();
+ }
+
+ for (i = 0; i < NR_CHUNKS_HIGH; i++) {
+ hint = hint_addr();
+ hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (hptr[i] == MAP_FAILED)
+ break;
+
+ mark_range(ptr[i], MAP_CHUNK_SIZE);
+ validate_addr(hptr[i], 1);
+ }
+ hchunks = i;
+ if (validate_complete_va_space()) {
+ ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n");
+ ksft_finished();
+ }
+
+ for (i = 0; i < lchunks; i++)
+ munmap(ptr[i], MAP_CHUNK_SIZE);
+
+ for (i = 0; i < hchunks; i++)
+ munmap(hptr[i], MAP_CHUNK_SIZE);
+
+ free(hptr);
+
+ ksft_test_result_pass("Test\n");
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
new file mode 100644
index 000000000000..5492e3f784df
--- /dev/null
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+#include <linux/fs.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define SMAP_FILE_PATH "/proc/self/smaps"
+#define STATUS_FILE_PATH "/proc/self/status"
+#define MAX_LINE_LENGTH 500
+
+unsigned int __page_size;
+unsigned int __page_shift;
+
+uint64_t pagemap_get_entry(int fd, char *start)
+{
+ const unsigned long pfn = (unsigned long)start / getpagesize();
+ uint64_t entry;
+ int ret;
+
+ ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry));
+ if (ret != sizeof(entry))
+ ksft_exit_fail_msg("reading pagemap failed\n");
+ return entry;
+}
+
+static uint64_t __pagemap_scan_get_categories(int fd, char *start, struct page_region *r)
+{
+ struct pm_scan_arg arg;
+
+ arg.start = (uintptr_t)start;
+ arg.end = (uintptr_t)(start + psize());
+ arg.vec = (uintptr_t)r;
+ arg.vec_len = 1;
+ arg.flags = 0;
+ arg.size = sizeof(struct pm_scan_arg);
+ arg.max_pages = 0;
+ arg.category_inverted = 0;
+ arg.category_mask = 0;
+ arg.category_anyof_mask = PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | PAGE_IS_FILE |
+ PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY;
+ arg.return_mask = arg.category_anyof_mask;
+
+ return ioctl(fd, PAGEMAP_SCAN, &arg);
+}
+
+static uint64_t pagemap_scan_get_categories(int fd, char *start)
+{
+ struct page_region r;
+ long ret;
+
+ ret = __pagemap_scan_get_categories(fd, start, &r);
+ if (ret < 0)
+ ksft_exit_fail_msg("PAGEMAP_SCAN failed: %s\n", strerror(errno));
+ if (ret == 0)
+ return 0;
+ return r.categories;
+}
+
+/* `start` is any valid address. */
+static bool pagemap_scan_supported(int fd, char *start)
+{
+ static int supported = -1;
+ int ret;
+
+ if (supported != -1)
+ return supported;
+
+ /* Provide an invalid address in order to trigger EFAULT. */
+ ret = __pagemap_scan_get_categories(fd, start, (struct page_region *) ~0UL);
+ if (ret == 0)
+ ksft_exit_fail_msg("PAGEMAP_SCAN succeeded unexpectedly\n");
+
+ supported = errno == EFAULT;
+
+ return supported;
+}
+
+static bool page_entry_is(int fd, char *start, char *desc,
+ uint64_t pagemap_flags, uint64_t pagescan_flags)
+{
+ bool m = pagemap_get_entry(fd, start) & pagemap_flags;
+
+ if (pagemap_scan_supported(fd, start)) {
+ bool s = pagemap_scan_get_categories(fd, start) & pagescan_flags;
+
+ if (m == s)
+ return m;
+
+ ksft_exit_fail_msg(
+ "read and ioctl return unmatched results for %s: %d %d", desc, m, s);
+ }
+ return m;
+}
+
+bool pagemap_is_softdirty(int fd, char *start)
+{
+ return page_entry_is(fd, start, "soft-dirty",
+ PM_SOFT_DIRTY, PAGE_IS_SOFT_DIRTY);
+}
+
+bool pagemap_is_swapped(int fd, char *start)
+{
+ return page_entry_is(fd, start, "swap", PM_SWAP, PAGE_IS_SWAPPED);
+}
+
+bool pagemap_is_populated(int fd, char *start)
+{
+ return page_entry_is(fd, start, "populated",
+ PM_PRESENT | PM_SWAP,
+ PAGE_IS_PRESENT | PAGE_IS_SWAPPED);
+}
+
+unsigned long pagemap_get_pfn(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ /* If present (63th bit), PFN is at bit 0 -- 54. */
+ if (entry & PM_PRESENT)
+ return entry & 0x007fffffffffffffull;
+ return -1ul;
+}
+
+void clear_softdirty(void)
+{
+ int ret;
+ const char *ctrl = "4";
+ int fd = open("/proc/self/clear_refs", O_WRONLY);
+
+ if (fd < 0)
+ ksft_exit_fail_msg("opening clear_refs failed\n");
+ ret = write(fd, ctrl, strlen(ctrl));
+ close(fd);
+ if (ret != (signed int)strlen(ctrl))
+ ksft_exit_fail_msg("writing clear_refs failed\n");
+}
+
+bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len)
+{
+ while (fgets(buf, len, fp)) {
+ if (!strncmp(buf, pattern, strlen(pattern)))
+ return true;
+ }
+ return false;
+}
+
+uint64_t read_pmd_pagesize(void)
+{
+ int fd;
+ char buf[20];
+ ssize_t num_read;
+
+ fd = open(PMD_SIZE_FILE_PATH, O_RDONLY);
+ if (fd == -1)
+ return 0;
+
+ num_read = read(fd, buf, 19);
+ if (num_read < 1) {
+ close(fd);
+ return 0;
+ }
+ buf[num_read] = '\0';
+ close(fd);
+
+ return strtoul(buf, NULL, 10);
+}
+
+unsigned long rss_anon(void)
+{
+ unsigned long rss_anon = 0;
+ FILE *fp;
+ char buffer[MAX_LINE_LENGTH];
+
+ fp = fopen(STATUS_FILE_PATH, "r");
+ if (!fp)
+ ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);
+
+ if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer)))
+ goto err_out;
+
+ if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1)
+ ksft_exit_fail_msg("Reading status error\n");
+
+err_out:
+ fclose(fp);
+ return rss_anon;
+}
+
+char *__get_smap_entry(void *addr, const char *pattern, char *buf, size_t len)
+{
+ int ret;
+ FILE *fp;
+ char *entry = NULL;
+ char addr_pattern[MAX_LINE_LENGTH];
+
+ ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+ (unsigned long) addr);
+ if (ret >= MAX_LINE_LENGTH)
+ ksft_exit_fail_msg("%s: Pattern is too long\n", __func__);
+
+ fp = fopen(SMAP_FILE_PATH, "r");
+ if (!fp)
+ ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH);
+
+ if (!check_for_pattern(fp, addr_pattern, buf, len))
+ goto err_out;
+
+ /* Fetch the pattern in the same block */
+ if (!check_for_pattern(fp, pattern, buf, len))
+ goto err_out;
+
+ /* Trim trailing newline */
+ entry = strchr(buf, '\n');
+ if (entry)
+ *entry = '\0';
+
+ entry = buf + strlen(pattern);
+
+err_out:
+ fclose(fp);
+ return entry;
+}
+
+bool __check_huge(void *addr, char *pattern, int nr_hpages,
+ uint64_t hpage_size)
+{
+ char buffer[MAX_LINE_LENGTH];
+ uint64_t thp = -1;
+ char *entry;
+
+ entry = __get_smap_entry(addr, pattern, buffer, sizeof(buffer));
+ if (!entry)
+ goto err_out;
+
+ if (sscanf(entry, "%9" SCNu64 " kB", &thp) != 1)
+ ksft_exit_fail_msg("Reading smap error\n");
+
+err_out:
+ return thp == (nr_hpages * (hpage_size >> 10));
+}
+
+bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+ return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size);
+}
+
+bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+ return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size);
+}
+
+bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+ return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size);
+}
+
+int64_t allocate_transhuge(void *ptr, int pagemap_fd)
+{
+ uint64_t ent[2];
+
+ /* drop pmd */
+ if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANONYMOUS |
+ MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+ ksft_exit_fail_msg("mmap transhuge\n");
+
+ if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+ ksft_exit_fail_msg("MADV_HUGEPAGE\n");
+
+ /* allocate transparent huge page */
+ *(volatile void **)ptr = ptr;
+
+ if (pread(pagemap_fd, ent, sizeof(ent),
+ (uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent))
+ ksft_exit_fail_msg("read pagemap\n");
+
+ if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+ PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+ !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - pshift())) - 1)))
+ return PAGEMAP_PFN(ent[0]);
+
+ return -1;
+}
+
+unsigned long default_huge_page_size(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+ return hps;
+}
+
+int detect_hugetlb_page_sizes(size_t sizes[], int max)
+{
+ DIR *dir = opendir("/sys/kernel/mm/hugepages/");
+ int count = 0;
+
+ if (!dir)
+ return 0;
+
+ while (count < max) {
+ struct dirent *entry = readdir(dir);
+ size_t kb;
+
+ if (!entry)
+ break;
+ if (entry->d_type != DT_DIR)
+ continue;
+ if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
+ continue;
+ sizes[count++] = kb * 1024;
+ ksft_print_msg("[INFO] detected hugetlb page size: %zu KiB\n",
+ kb);
+ }
+ closedir(dir);
+ return count;
+}
+
+/* If `ioctls' non-NULL, the allowed ioctls will be returned into the var */
+int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor, uint64_t *ioctls)
+{
+ struct uffdio_register uffdio_register = { 0 };
+ uint64_t mode = 0;
+ int ret = 0;
+
+ if (miss)
+ mode |= UFFDIO_REGISTER_MODE_MISSING;
+ if (wp)
+ mode |= UFFDIO_REGISTER_MODE_WP;
+ if (minor)
+ mode |= UFFDIO_REGISTER_MODE_MINOR;
+
+ uffdio_register.range.start = (unsigned long)addr;
+ uffdio_register.range.len = len;
+ uffdio_register.mode = mode;
+
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
+ ret = -errno;
+ else if (ioctls)
+ *ioctls = uffdio_register.ioctls;
+
+ return ret;
+}
+
+int uffd_register(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor)
+{
+ return uffd_register_with_ioctls(uffd, addr, len,
+ miss, wp, minor, NULL);
+}
+
+int uffd_unregister(int uffd, void *addr, uint64_t len)
+{
+ struct uffdio_range range = { .start = (uintptr_t)addr, .len = len };
+ int ret = 0;
+
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &range) == -1)
+ ret = -errno;
+
+ return ret;
+}
+
+unsigned long get_free_hugepages(void)
+{
+ unsigned long fhp = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return fhp;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1)
+ break;
+ }
+
+ free(line);
+ fclose(f);
+ return fhp;
+}
+
+bool check_vmflag_io(void *addr)
+{
+ char buffer[MAX_LINE_LENGTH];
+ const char *flags;
+ size_t flaglen;
+
+ flags = __get_smap_entry(addr, "VmFlags:", buffer, sizeof(buffer));
+ if (!flags)
+ ksft_exit_fail_msg("%s: No VmFlags for %p\n", __func__, addr);
+
+ while (true) {
+ flags += strspn(flags, " ");
+
+ flaglen = strcspn(flags, " ");
+ if (!flaglen)
+ return false;
+
+ if (flaglen == strlen("io") && !memcmp(flags, "io", flaglen))
+ return true;
+
+ flags += flaglen;
+ }
+}
+
+/*
+ * Open an fd at /proc/$pid/maps and configure procmap_out ready for
+ * PROCMAP_QUERY query. Returns 0 on success, or an error code otherwise.
+ */
+int open_procmap(pid_t pid, struct procmap_fd *procmap_out)
+{
+ char path[256];
+ int ret = 0;
+
+ memset(procmap_out, '\0', sizeof(*procmap_out));
+ sprintf(path, "/proc/%d/maps", pid);
+ procmap_out->query.size = sizeof(procmap_out->query);
+ procmap_out->fd = open(path, O_RDONLY);
+ if (procmap_out->fd < 0)
+ ret = -errno;
+
+ return ret;
+}
+
+/* Perform PROCMAP_QUERY. Returns 0 on success, or an error code otherwise. */
+int query_procmap(struct procmap_fd *procmap)
+{
+ int ret = 0;
+
+ if (ioctl(procmap->fd, PROCMAP_QUERY, &procmap->query) == -1)
+ ret = -errno;
+
+ return ret;
+}
+
+/*
+ * Try to find the VMA at specified address, returns true if found, false if not
+ * found, and the test is failed if any other error occurs.
+ *
+ * On success, procmap->query is populated with the results.
+ */
+bool find_vma_procmap(struct procmap_fd *procmap, void *address)
+{
+ int err;
+
+ procmap->query.query_flags = 0;
+ procmap->query.query_addr = (unsigned long)address;
+ err = query_procmap(procmap);
+ if (!err)
+ return true;
+
+ if (err != -ENOENT)
+ ksft_exit_fail_msg("%s: Error %d on ioctl(PROCMAP_QUERY)\n",
+ __func__, err);
+ return false;
+}
+
+/*
+ * Close fd used by PROCMAP_QUERY mechanism. Returns 0 on success, or an error
+ * code otherwise.
+ */
+int close_procmap(struct procmap_fd *procmap)
+{
+ return close(procmap->fd);
+}
+
+int write_sysfs(const char *file_path, unsigned long val)
+{
+ FILE *f = fopen(file_path, "w");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fprintf(f, "%lu", val) < 0) {
+ perror("fprintf");
+ fclose(f);
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
+
+int read_sysfs(const char *file_path, unsigned long *val)
+{
+ FILE *f = fopen(file_path, "r");
+
+ if (!f) {
+ fprintf(stderr, "f %s\n", file_path);
+ perror("fopen");
+ return 1;
+ }
+ if (fscanf(f, "%lu", val) != 1) {
+ perror("fscanf");
+ fclose(f);
+ return 1;
+ }
+ fclose(f);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
new file mode 100644
index 000000000000..b8136d12a0f8
--- /dev/null
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stdarg.h>
+#include <strings.h> /* ffsl() */
+#include <unistd.h> /* _SC_PAGESIZE */
+#include "../kselftest.h"
+#include <linux/fs.h>
+
+#define BIT_ULL(nr) (1ULL << (nr))
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
+#define PM_GUARD_REGION BIT_ULL(58)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
+extern unsigned int __page_size;
+extern unsigned int __page_shift;
+
+/*
+ * Represents an open fd and PROCMAP_QUERY state for binary (via ioctl)
+ * /proc/$pid/[s]maps lookup.
+ */
+struct procmap_fd {
+ int fd;
+ struct procmap_query query;
+};
+
+static inline unsigned int psize(void)
+{
+ if (!__page_size)
+ __page_size = sysconf(_SC_PAGESIZE);
+ return __page_size;
+}
+
+static inline unsigned int pshift(void)
+{
+ if (!__page_shift)
+ __page_shift = (ffsl(psize()) - 1);
+ return __page_shift;
+}
+
+/*
+ * Plan 9 FS has bugs (at least on QEMU) where certain operations fail with
+ * ENOENT on unlinked files. See
+ * https://gitlab.com/qemu-project/qemu/-/issues/103 for some info about such
+ * bugs. There are rumours of NFS implementations with similar bugs.
+ *
+ * Ideally, tests should just detect filesystems known to have such issues and
+ * bail early. But 9pfs has the additional "feature" that it causes fstatfs to
+ * pass through the f_type field from the host filesystem. To avoid having to
+ * scrape /proc/mounts or some other hackery, tests can call this function when
+ * it seems such a bug might have been encountered.
+ */
+static inline void skip_test_dodgy_fs(const char *op_name)
+{
+ ksft_test_result_skip("%s failed with ENOENT. Filesystem might be buggy (9pfs?)\n", op_name);
+}
+
+uint64_t pagemap_get_entry(int fd, char *start);
+bool pagemap_is_softdirty(int fd, char *start);
+bool pagemap_is_swapped(int fd, char *start);
+bool pagemap_is_populated(int fd, char *start);
+unsigned long pagemap_get_pfn(int fd, char *start);
+void clear_softdirty(void);
+bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
+uint64_t read_pmd_pagesize(void);
+unsigned long rss_anon(void);
+bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
+bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
+bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
+int64_t allocate_transhuge(void *ptr, int pagemap_fd);
+unsigned long default_huge_page_size(void);
+int detect_hugetlb_page_sizes(size_t sizes[], int max);
+
+int uffd_register(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor);
+int uffd_unregister(int uffd, void *addr, uint64_t len);
+int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len,
+ bool miss, bool wp, bool minor, uint64_t *ioctls);
+unsigned long get_free_hugepages(void);
+bool check_vmflag_io(void *addr);
+int open_procmap(pid_t pid, struct procmap_fd *procmap_out);
+int query_procmap(struct procmap_fd *procmap);
+bool find_vma_procmap(struct procmap_fd *procmap, void *address);
+int close_procmap(struct procmap_fd *procmap);
+int write_sysfs(const char *file_path, unsigned long val);
+int read_sysfs(const char *file_path, unsigned long *val);
+
+static inline int open_self_procmap(struct procmap_fd *procmap_out)
+{
+ pid_t pid = getpid();
+
+ return open_procmap(pid, procmap_out);
+}
+
+/* These helpers need to be inline to match the kselftest.h idiom. */
+static char test_name[1024];
+
+static inline void log_test_start(const char *name, ...)
+{
+ va_list args;
+ va_start(args, name);
+
+ vsnprintf(test_name, sizeof(test_name), name, args);
+ ksft_print_msg("[RUN] %s\n", test_name);
+
+ va_end(args);
+}
+
+static inline void log_test_result(int result)
+{
+ ksft_test_result_report(result, "%s\n", test_name);
+}
+
+/*
+ * On ppc64 this will only work with radix 2M hugepage size
+ */
+#define HPAGE_SHIFT 21
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh
new file mode 100755
index 000000000000..3d2d2eb9d6ff
--- /dev/null
+++ b/tools/testing/selftests/mm/write_hugetlb_memory.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+size=$1
+populate=$2
+write=$3
+cgroup=$4
+path=$5
+method=$6
+private=$7
+want_sleep=$8
+reserve=$9
+
+echo "Putting task in cgroup '$cgroup'"
+echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs
+
+echo "Method is $method"
+
+set +e
+./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \
+ "$private" "$want_sleep" "$reserve"
diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c
new file mode 100644
index 000000000000..34c91f7e6128
--- /dev/null
+++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program reserves and uses hugetlb memory, supporting a bunch of
+ * scenarios needed by the charged_reserved_hugetlb.sh test.
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Global definitions. */
+enum method {
+ HUGETLBFS,
+ MMAP_MAP_HUGETLB,
+ SHM,
+ MAX_METHOD
+};
+
+
+/* Global variables. */
+static const char *self;
+static int *shmaddr;
+static int shmid;
+
+/*
+ * Show usage and exit.
+ */
+static void exit_usage(void)
+{
+ printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> "
+ "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] "
+ "[-o] [-w] [-n]\n",
+ self);
+ exit(EXIT_FAILURE);
+}
+
+void sig_handler(int signo)
+{
+ printf("Received %d.\n", signo);
+ if (signo == SIGINT) {
+ if (shmaddr) {
+ printf("Deleting the memory\n");
+ if (shmdt((const void *)shmaddr) != 0) {
+ perror("Detach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(4);
+ }
+
+ shmctl(shmid, IPC_RMID, NULL);
+ printf("Done deleting the memory\n");
+ }
+ }
+ exit(2);
+}
+
+int main(int argc, char **argv)
+{
+ int fd = 0;
+ int key = 0;
+ int *ptr = NULL;
+ int c = 0;
+ int size = 0;
+ char path[256] = "";
+ enum method method = MAX_METHOD;
+ int want_sleep = 0, private = 0;
+ int populate = 0;
+ int write = 0;
+ int reserve = 1;
+
+ if (signal(SIGINT, sig_handler) == SIG_ERR)
+ err(1, "\ncan't catch SIGINT\n");
+
+ /* Parse command-line arguments. */
+ setvbuf(stdout, NULL, _IONBF, 0);
+ self = argv[0];
+
+ while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) {
+ switch (c) {
+ case 's':
+ size = atoi(optarg);
+ break;
+ case 'p':
+ strncpy(path, optarg, sizeof(path) - 1);
+ break;
+ case 'm':
+ if (atoi(optarg) >= MAX_METHOD) {
+ errno = EINVAL;
+ perror("Invalid -m.");
+ exit_usage();
+ }
+ method = atoi(optarg);
+ break;
+ case 'o':
+ populate = 1;
+ break;
+ case 'w':
+ write = 1;
+ break;
+ case 'l':
+ want_sleep = 1;
+ break;
+ case 'r':
+ private
+ = 1;
+ break;
+ case 'n':
+ reserve = 0;
+ break;
+ default:
+ errno = EINVAL;
+ perror("Invalid arg");
+ exit_usage();
+ }
+ }
+
+ if (strncmp(path, "", sizeof(path)) != 0) {
+ printf("Writing to this path: %s\n", path);
+ } else {
+ errno = EINVAL;
+ perror("path not found");
+ exit_usage();
+ }
+
+ if (size != 0) {
+ printf("Writing this size: %d\n", size);
+ } else {
+ errno = EINVAL;
+ perror("size not found");
+ exit_usage();
+ }
+
+ if (!populate)
+ printf("Not populating.\n");
+ else
+ printf("Populating.\n");
+
+ if (!write)
+ printf("Not writing to memory.\n");
+
+ if (method == MAX_METHOD) {
+ errno = EINVAL;
+ perror("-m Invalid");
+ exit_usage();
+ } else
+ printf("Using method=%d\n", method);
+
+ if (!private)
+ printf("Shared mapping.\n");
+ else
+ printf("Private mapping.\n");
+
+ if (!reserve)
+ printf("NO_RESERVE mapping.\n");
+ else
+ printf("RESERVE mapping.\n");
+
+ switch (method) {
+ case HUGETLBFS:
+ printf("Allocating using HUGETLBFS.\n");
+ fd = open(path, O_CREAT | O_RDWR, 0777);
+ if (fd == -1)
+ err(1, "Failed to open file.");
+
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ (private ? MAP_PRIVATE : MAP_SHARED) |
+ (populate ? MAP_POPULATE : 0) |
+ (reserve ? 0 : MAP_NORESERVE),
+ fd, 0);
+
+ if (ptr == MAP_FAILED) {
+ close(fd);
+ err(1, "Error mapping the file");
+ }
+ break;
+ case MMAP_MAP_HUGETLB:
+ printf("Allocating using MAP_HUGETLB.\n");
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ (private ? (MAP_PRIVATE | MAP_ANONYMOUS) :
+ MAP_SHARED) |
+ MAP_HUGETLB | (populate ? MAP_POPULATE : 0) |
+ (reserve ? 0 : MAP_NORESERVE),
+ -1, 0);
+
+ if (ptr == MAP_FAILED)
+ err(1, "mmap");
+
+ printf("Returned address is %p\n", ptr);
+ break;
+ case SHM:
+ printf("Allocating using SHM.\n");
+ shmid = shmget(key, size,
+ SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+ if (shmid < 0) {
+ shmid = shmget(++key, size,
+ SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+ if (shmid < 0)
+ err(1, "shmget");
+ }
+ printf("shmid: 0x%x, shmget key:%d\n", shmid, key);
+
+ ptr = shmat(shmid, NULL, 0);
+ if (ptr == (int *)-1) {
+ perror("Shared memory attach failure");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(2);
+ }
+ shmaddr = ptr;
+ printf("shmaddr: %p\n", shmaddr);
+
+ break;
+ default:
+ errno = EINVAL;
+ err(1, "Invalid method.");
+ }
+
+ if (write) {
+ printf("Writing to memory.\n");
+ memset(ptr, 1, size);
+ }
+
+ if (want_sleep) {
+ /* Signal to caller that we're done. */
+ printf("DONE\n");
+
+ /* Hold memory until external kill signal is delivered. */
+ while (1)
+ sleep(100);
+ }
+
+ if (method == HUGETLBFS)
+ close(fd);
+
+ return 0;
+}