about | summary | refs | log | tree | commit | diff | stats | homepage
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rw-r--r-- tools/arch/arm64/include/asm/cputype.h 10
-rw-r--r-- tools/arch/powerpc/include/uapi/asm/kvm.h 3
-rw-r--r-- tools/arch/x86/include/asm/cpufeatures.h 803
-rw-r--r-- tools/arch/x86/include/asm/msr-index.h 11
-rw-r--r-- tools/arch/x86/include/uapi/asm/kvm.h 49
-rw-r--r-- tools/arch/x86/include/uapi/asm/svm.h 1
-rw-r--r-- tools/bpf/bpftool/prog.c 2
-rw-r--r-- tools/bpf/resolve_btfids/main.c 2
-rw-r--r-- tools/build/Makefile.feature 18
-rw-r--r-- tools/build/feature/Makefile 53
-rw-r--r-- tools/include/asm/rwonce.h 0
-rw-r--r-- tools/include/linux/bitmap.h 17
-rw-r--r-- tools/include/uapi/README 73
-rw-r--r-- tools/include/uapi/asm-generic/unistd.h 2
-rw-r--r-- tools/include/uapi/drm/i915_drm.h 27
-rw-r--r-- tools/include/uapi/linux/fs.h 552
-rw-r--r-- tools/include/uapi/linux/if_xdp.h 4
-rw-r--r-- tools/include/uapi/linux/in.h 2
-rw-r--r-- tools/include/uapi/linux/kvm.h 27
-rw-r--r-- tools/include/uapi/linux/mman.h 1
-rw-r--r-- tools/include/uapi/linux/perf_event.h 6
-rw-r--r-- tools/include/uapi/linux/prctl.h 331
-rw-r--r-- tools/include/uapi/linux/stat.h 12
-rw-r--r-- tools/lib/bitmap.c 20
-rw-r--r-- tools/lib/bpf/btf_dump.c 8
-rw-r--r-- tools/lib/list_sort.c 10
-rw-r--r-- tools/mm/Makefile 2
-rw-r--r-- tools/mm/thp_swap_allocator_test.c 234
-rw-r--r-- tools/objtool/check.c 2
-rw-r--r-- tools/perf/Documentation/Build.txt 28
-rw-r--r-- tools/perf/Makefile.config 33
-rw-r--r-- tools/perf/Makefile.perf 27
-rw-r--r-- tools/perf/arch/loongarch/Makefile 1
-rw-r--r-- tools/perf/arch/loongarch/util/Build 2
-rw-r--r-- tools/perf/arch/loongarch/util/header.c 96
-rw-r--r-- tools/perf/arch/loongarch/util/kvm-stat.c 139
-rw-r--r-- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl 6
-rw-r--r-- tools/perf/arch/riscv/Makefile 1
-rw-r--r-- tools/perf/arch/riscv/util/Build 1
-rw-r--r-- tools/perf/arch/riscv/util/kvm-stat.c 78
-rw-r--r-- tools/perf/arch/riscv/util/riscv_exception_types.h 35
-rw-r--r-- tools/perf/arch/s390/entry/syscalls/syscall.tbl 2
-rw-r--r-- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl 8
-rw-r--r-- tools/perf/builtin-daemon.c 9
-rw-r--r-- tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json 2
-rw-r--r-- tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json 2
-rw-r--r-- tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json 2
-rw-r--r-- tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json 2
-rw-r--r-- tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json 2
-rw-r--r-- tools/perf/tests/vmlinux-kallsyms.c 1
-rw-r--r-- tools/perf/trace/beauty/include/linux/socket.h 5
-rw-r--r-- tools/perf/trace/beauty/include/uapi/linux/fs.h 163
-rw-r--r-- tools/perf/trace/beauty/include/uapi/linux/mount.h 10
-rw-r--r-- tools/perf/trace/beauty/include/uapi/linux/stat.h 12
-rw-r--r-- tools/perf/trace/beauty/include/uapi/sound/asound.h 9
-rw-r--r-- tools/perf/util/callchain.c 2
-rw-r--r-- tools/perf/util/dso.c 2
-rw-r--r-- tools/perf/util/dso.h 5
-rw-r--r-- tools/perf/util/unwind-libunwind-local.c 2
-rw-r--r-- tools/power/x86/turbostat/Makefile 1
-rw-r--r-- tools/power/x86/turbostat/turbostat.8 98
-rw-r--r-- tools/power/x86/turbostat/turbostat.c 2335
-rw-r--r-- tools/testing/cxl/Kbuild 1
-rw-r--r-- tools/testing/cxl/test/mem.c 69
-rw-r--r-- tools/testing/cxl/test/mock.c 12
-rw-r--r-- tools/testing/nvdimm/test/iomap.c 1
-rw-r--r-- tools/testing/nvdimm/test/ndtest.c 1
-rw-r--r-- tools/testing/nvdimm/test/nfit.c 1
-rw-r--r-- tools/testing/radix-tree/Makefile 4
-rw-r--r-- tools/testing/radix-tree/bitmap.c 23
-rw-r--r-- tools/testing/radix-tree/idr-test.c 1
-rw-r--r-- tools/testing/radix-tree/maple.c 1
-rw-r--r-- tools/testing/radix-tree/xarray.c 1
-rw-r--r-- tools/testing/selftests/arm64/abi/ptrace.c 2
-rw-r--r-- tools/testing/selftests/bpf/DENYLIST.aarch64 1
-rw-r--r-- tools/testing/selftests/bpf/Makefile 2
-rw-r--r-- tools/testing/selftests/bpf/prog_tests/fexit_sleep.c 8
-rw-r--r-- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c 85
-rw-r--r-- tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 4
-rw-r--r-- tools/testing/selftests/bpf/prog_tests/xdp_metadata.c 3
-rw-r--r-- tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c 4
-rw-r--r-- tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c 4
-rw-r--r-- tools/testing/selftests/bpf/progs/iters.c 54
-rw-r--r-- tools/testing/selftests/cgroup/config 1
-rw-r--r-- tools/testing/selftests/core/close_range_test.c 35
-rw-r--r-- tools/testing/selftests/damon/Makefile 3
-rw-r--r-- tools/testing/selftests/damon/_damon_sysfs.py 65
-rw-r--r-- tools/testing/selftests/damon/access_memory.c 2
-rw-r--r-- tools/testing/selftests/damon/access_memory_even.c 42
-rw-r--r-- tools/testing/selftests/damon/damon_nr_regions.py 145
-rw-r--r-- tools/testing/selftests/damon/damos_tried_regions.py 65
-rw-r--r-- tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c 4
-rw-r--r-- tools/testing/selftests/drivers/dma-buf/udmabuf.c 214
-rwxr-xr-x tools/testing/selftests/drivers/net/hw/rss_ctx.py 37
-rwxr-xr-x tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh 3
-rw-r--r-- tools/testing/selftests/exec/Makefile 1
-rw-r--r-- tools/testing/selftests/filesystems/eventfd/eventfd_test.c 136
-rw-r--r-- tools/testing/selftests/futex/functional/Makefile 2
-rw-r--r-- tools/testing/selftests/hid/hid_bpf.c 26
-rw-r--r-- tools/testing/selftests/hid/progs/hid.c 2
-rw-r--r-- tools/testing/selftests/hid/progs/hid_bpf_helpers.h 2
-rw-r--r-- tools/testing/selftests/intel_pstate/Makefile 2
-rw-r--r-- tools/testing/selftests/iommu/Makefile 2
-rw-r--r-- tools/testing/selftests/kselftest/ksft.py 2
-rw-r--r-- tools/testing/selftests/kvm/Makefile 4
-rw-r--r-- tools/testing/selftests/kvm/aarch64/get-reg-list.c 4
-rw-r--r-- tools/testing/selftests/kvm/aarch64/set_id_regs.c 17
-rw-r--r-- tools/testing/selftests/kvm/include/x86_64/apic.h 8
-rw-r--r-- tools/testing/selftests/kvm/include/x86_64/processor.h 18
-rw-r--r-- tools/testing/selftests/kvm/lib/kvm_util.c 9
-rw-r--r-- tools/testing/selftests/kvm/lib/x86_64/processor.c 11
-rw-r--r-- tools/testing/selftests/kvm/memslot_modification_stress_test.c 6
-rw-r--r-- tools/testing/selftests/kvm/pre_fault_memory_test.c 146
-rw-r--r-- tools/testing/selftests/kvm/riscv/get-reg-list.c 8
-rw-r--r-- tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c 194
-rw-r--r-- tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c 22
-rw-r--r-- tools/testing/selftests/kvm/x86_64/pmu_counters_test.c 44
-rw-r--r-- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c 35
-rw-r--r-- tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c 16
-rw-r--r-- tools/testing/selftests/kvm/x86_64/xapic_state_test.c 28
-rw-r--r-- tools/testing/selftests/landlock/base_test.c 74
-rw-r--r-- tools/testing/selftests/landlock/config 1
-rw-r--r-- tools/testing/selftests/lib.mk 3
-rwxr-xr-x tools/testing/selftests/livepatch/test-livepatch.sh 138
-rwxr-xr-x tools/testing/selftests/livepatch/test-syscall.sh 5
-rwxr-xr-x tools/testing/selftests/livepatch/test-sysfs.sh 48
-rw-r--r-- tools/testing/selftests/mm/.gitignore 2
-rw-r--r-- tools/testing/selftests/mm/Makefile 8
-rw-r--r-- tools/testing/selftests/mm/compaction_test.c 5
-rw-r--r-- tools/testing/selftests/mm/droppable.c 53
-rw-r--r-- tools/testing/selftests/mm/hugepage-mremap.c 2
-rw-r--r-- tools/testing/selftests/mm/hugetlb-soft-offline.c 228
-rw-r--r-- tools/testing/selftests/mm/hugetlb_dio.c 117
-rw-r--r-- tools/testing/selftests/mm/ksm_functional_tests.c 8
-rw-r--r-- tools/testing/selftests/mm/memfd_secret.c 14
-rw-r--r-- tools/testing/selftests/mm/mkdirty.c 8
-rw-r--r-- tools/testing/selftests/mm/mlock2.h 1
-rw-r--r-- tools/testing/selftests/mm/mremap_test.c 2
-rw-r--r-- tools/testing/selftests/mm/mseal_helpers.h 41
-rw-r--r-- tools/testing/selftests/mm/mseal_test.c 143
-rw-r--r-- tools/testing/selftests/mm/pagemap_ioctl.c 6
-rw-r--r-- tools/testing/selftests/mm/protection_keys.c 2
-rwxr-xr-x tools/testing/selftests/mm/run_vmtests.sh 10
-rw-r--r-- tools/testing/selftests/mm/seal_elf.c 37
-rw-r--r-- tools/testing/selftests/mm/split_huge_page_test.c 3
-rw-r--r-- tools/testing/selftests/mm/thuge-gen.c 15
-rw-r--r-- tools/testing/selftests/mm/uffd-common.c 4
-rw-r--r-- tools/testing/selftests/mm/uffd-stress.c 31
-rw-r--r-- tools/testing/selftests/mm/uffd-unit-tests.c 14
-rw-r--r-- tools/testing/selftests/mm/va_high_addr_switch.c 468
-rwxr-xr-x tools/testing/selftests/mm/va_high_addr_switch.sh 4
-rw-r--r-- tools/testing/selftests/mqueue/mq_perf_tests.c 6
-rw-r--r-- tools/testing/selftests/net/Makefile 2
-rw-r--r-- tools/testing/selftests/net/af_unix/msg_oob.c 2
-rwxr-xr-x tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh 18
-rwxr-xr-x tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh 54
-rw-r--r-- tools/testing/selftests/net/forwarding/lib.sh 57
-rwxr-xr-x tools/testing/selftests/net/forwarding/local_termination.sh 431
-rw-r--r-- tools/testing/selftests/net/lib.sh 1
-rw-r--r-- tools/testing/selftests/net/mptcp/mptcp_connect.c 8
-rwxr-xr-x tools/testing/selftests/net/mptcp/mptcp_join.sh 224
-rw-r--r-- tools/testing/selftests/net/netfilter/Makefile 1
-rwxr-xr-x tools/testing/selftests/net/netfilter/br_netfilter_queue.sh 78
-rw-r--r-- tools/testing/selftests/net/tcp_ao/Makefile 2
-rwxr-xr-x tools/testing/selftests/net/udpgro.sh 53
-rw-r--r-- tools/testing/selftests/net/udpgso.c 25
-rw-r--r-- tools/testing/selftests/pidfd/pidfd_setns_test.c 258
-rw-r--r-- tools/testing/selftests/proc/.gitignore 2
-rw-r--r-- tools/testing/selftests/proc/Makefile 4
-rw-r--r-- tools/testing/selftests/proc/proc-2-is-kthread.c 53
-rw-r--r-- tools/testing/selftests/proc/proc-empty-vm.c 3
-rw-r--r-- tools/testing/selftests/proc/proc-pid-vm.c 86
-rw-r--r-- tools/testing/selftests/proc/proc-self-isnt-kthread.c 37
-rw-r--r-- tools/testing/selftests/resctrl/Makefile 2
-rw-r--r-- tools/testing/selftests/ring-buffer/Makefile 1
-rw-r--r-- tools/testing/selftests/riscv/mm/Makefile 2
-rw-r--r-- tools/testing/selftests/seccomp/seccomp_bpf.c 2
-rw-r--r-- tools/testing/selftests/sgx/Makefile 2
-rwxr-xr-x tools/testing/selftests/tc-testing/tdc.py 1
-rw-r--r-- tools/testing/selftests/tmpfs/Makefile 1
-rwxr-xr-x tools/testing/selftests/turbostat/added_perf_counters.py 178
-rwxr-xr-x tools/testing/selftests/turbostat/smi_aperf_mperf.py 157
-rw-r--r-- tools/testing/selftests/vDSO/.gitignore 2
-rw-r--r-- tools/testing/selftests/vDSO/Makefile 18
-rw-r--r-- tools/testing/selftests/vDSO/vdso_test_chacha.c 43
-rw-r--r-- tools/testing/selftests/vDSO/vdso_test_getrandom.c 288
-rw-r--r-- tools/tracing/latency/Makefile.config 3
-rw-r--r-- tools/tracing/rtla/Makefile.config 3
-rw-r--r-- tools/tracing/rtla/src/osnoise_top.c 11
-rw-r--r-- tools/verification/rv/Makefile.config 3
190 files changed, 8812 insertions, 1851 deletions
diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h
index 7b32b99023a2..5fd7caea4419 100644
--- a/tools/arch/arm64/include/asm/cputype.h
+++ b/tools/arch/arm64/include/asm/cputype.h
@@ -86,9 +86,14 @@
#define ARM_CPU_PART_CORTEX_X2 0xD48
#define ARM_CPU_PART_NEOVERSE_N2 0xD49
#define ARM_CPU_PART_CORTEX_A78C 0xD4B
+#define ARM_CPU_PART_CORTEX_X1C 0xD4C
+#define ARM_CPU_PART_CORTEX_X3 0xD4E
#define ARM_CPU_PART_NEOVERSE_V2 0xD4F
+#define ARM_CPU_PART_CORTEX_A720 0xD81
#define ARM_CPU_PART_CORTEX_X4 0xD82
#define ARM_CPU_PART_NEOVERSE_V3 0xD84
+#define ARM_CPU_PART_CORTEX_X925 0xD85
+#define ARM_CPU_PART_CORTEX_A725 0xD87
#define APM_CPU_PART_XGENE 0x000
#define APM_CPU_VAR_POTENZA 0x00
@@ -162,9 +167,14 @@
#define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2)
#define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2)
#define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C)
+#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C)
+#define MIDR_CORTEX_X3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X3)
#define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2)
+#define MIDR_CORTEX_A720 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A720)
#define MIDR_CORTEX_X4 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X4)
#define MIDR_NEOVERSE_V3 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V3)
+#define MIDR_CORTEX_X925 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X925)
+#define MIDR_CORTEX_A725 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A725)
#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 1691297a766a..eaeda001784e 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -645,6 +645,9 @@ struct kvm_ppc_cpu_char {
#define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3)
#define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4)
#define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5)
+#define KVM_REG_PPC_DEXCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc6)
+#define KVM_REG_PPC_HASHKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc7)
+#define KVM_REG_PPC_HASHPKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc8)
/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 3c7434329661..dd4682857c12 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -18,170 +18,170 @@
/*
* Note: If the comment begins with a quoted string, that string is used
- * in /proc/cpuinfo instead of the macro name. If the string is "",
- * this feature bit is not displayed in /proc/cpuinfo at all.
+ * in /proc/cpuinfo instead of the macro name. Otherwise, this feature
+ * bit is not displayed in /proc/cpuinfo at all.
*
* When adding new features here that depend on other features,
* please update the table in kernel/cpu/cpuid-deps.c as well.
*/
/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
-#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */
+#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */
+#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */
+#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */
+#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */
+#define X86_FEATURE_MSR ( 0*32+ 5) /* "msr" Model-Specific Registers */
+#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */
+#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */
+#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */
+#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */
+#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */
+#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */
+#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */
+#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */
+#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */
+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */
#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */
+#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */
+#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */
#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */
#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
+#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */
+#define X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */
/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */
-#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */
+#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */
+#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */
+#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */
#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */
-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */
-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */
+#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */
+#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */
+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */
+#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */
/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */
+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */
+#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */
/* Other features, Linux-defined mapping, word 3 */
/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */
-#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */
-#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */
-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
-#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */
-#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
-#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
-#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */
-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
+#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
+#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
+#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */
+#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
+#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
+#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */
+#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */
+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */
+#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */
+#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */
+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */
+#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */
+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */
+#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */
+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */
/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */
#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
-#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */
-#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
-#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
-#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */
-#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */
-#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */
+#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */
+#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */
+#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */
+#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */
+#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */
+#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */
+#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */
+#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */
+#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */
+#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context Identifiers */
+#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */
#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */
-#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */
-#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */
-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */
-#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */
-#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */
+#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */
+#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* "tsc_deadline_timer" TSC deadline timer */
+#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */
+#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */
+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */
+#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */
+#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */
+#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */
+#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */
+#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */
/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
-#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */
-#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */
-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */
-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */
-#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */
+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */
+#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */
+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */
+#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */
+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */
+#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */
+#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */
+#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */
+#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer */
+#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */
+#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */
+#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */
+#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */
+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */
+#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */
+#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */
+#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -189,93 +189,93 @@
*
* Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */
-#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
-#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
-#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
-#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
-#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */
-#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */
-#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
-#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */
-#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */
-#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
-#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
-#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
-#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
-#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
-#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */
-#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
-#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
-#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
-#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */
-#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */
-#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
-#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
-#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */
+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */
+#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */
+#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */
+#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */
+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */
+#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */
+#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */
+#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */
+#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */
+#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */
+#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" Code and Data Prioritization L2 */
+#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */
+#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
+#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */
+#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */
+#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */
+#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */
+#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */
+#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */
/* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */
-#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */
-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
-#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
-#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
-#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
-#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
-#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
-#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
-#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */
+#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */
+#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */
+#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */
+#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */
+#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */
+#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */
+#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */
+#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
-#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */
-#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */
-#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
-#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
-#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */
-#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
-#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
-#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */
-#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
-#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */
-#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
-#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
-#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
-#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* "tsc_adjust" TSC adjustment MSR 0x3B */
+#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */
+#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */
+#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */
+#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */
+#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB ( 9*32+24) /* "clwb" CLWB instruction */
+#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */
+#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */
/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */
-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
-#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */
+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */
+#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */
+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */
+#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */
+#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
@@ -283,181 +283,183 @@
*
* Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */
-#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */
-#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */
-#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */
-#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
-#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
-#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
-#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
-#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
-#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
-#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */
-#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */
-#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
-#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */
-#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */
-#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */
-#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */
-#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
-#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */
-#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */
-#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
-#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
-#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
-#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
-#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
-#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
-#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
-#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
-#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */
-#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */
-#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */
-#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */
+#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */
+#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */
+#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */
+#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */
+#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock */
+#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */
+#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */
+#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */
+#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */
+#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */
+#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */
+#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */
+#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */
+#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */
+#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */
+#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */
+#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */
+#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */
+#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */
+#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
+#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */
+#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */
+#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */
+#define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
-#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
-#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
-#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
-#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */
-#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */
-#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */
-#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
-#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */
-#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */
-#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */
-#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
-#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
-#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */
+#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */
+#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */
+#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */
+#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */
+#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */
+#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */
+#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
+#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
+#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
-#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
-#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
-#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
-#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */
-#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */
-#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
-#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
-#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */
-#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */
-#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
-#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
-#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
-#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
-#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */
-#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
-#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
+#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
+#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
+#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
+#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
+#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
+#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
+#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */
+#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */
+#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
-#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
-#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
-#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
-#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
-#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */
+#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */
+#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */
+#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */
+#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */
+#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */
+#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */
+#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */
+#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */
/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
-#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
-#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */
#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
-#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
-#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
-#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */
-#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
-#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */
-#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */
+#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */
+#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */
+#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
+#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
-#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
-#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */
-#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
-#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
-#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
-#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */
-#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
-#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
-#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
-#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
-#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */
-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
-#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
-#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */
-#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
-#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */
-#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
-#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */
-#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */
+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/
+#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */
+#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */
+#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */
+#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
+#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */
+#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */
+#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */
+#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */
+#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */
+#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */
+#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */
+#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */
/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
-#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
-#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
-#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
+#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */
+#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
-#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
-#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
-#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */
-#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
-#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */
-#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
-#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */
-#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
-#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
-#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */
-#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
-#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
-#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
-#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */
-#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */
-#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */
-#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */
-#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */
-#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
-#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
-#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
-#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
-#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */
+#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */
+#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */
+#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS mitigation MSR available */
+#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */
+#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */
+#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */
+#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */
+#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */
+#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */
+#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */
+#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */
+#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */
+#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */
+#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */
+#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */
+#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */
+#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */
+#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */
+#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */
+#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */
/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
-#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */
-#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
-#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
-#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
-#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */
-#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
-#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
-#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */
+#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */
+#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */
+#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */
+#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
+#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */
+#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */
+#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
-#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
-#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
-#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
-#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */
-#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
-#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */
+#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */
+#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
+#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
+#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
+#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
-#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */
-#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */
-#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */
+#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
+#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
+#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
@@ -465,59 +467,60 @@
*
* Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */
-#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */
-#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */
-#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */
-#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */
+#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */
+#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */
+#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
+#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */
/*
* BUG word(s)
*/
#define X86_BUG(x) (NCAPINTS*32 + (x))
-#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
-#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
-#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */
+#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */
+#define X86_BUG_COMA X86_BUG(2) /* "coma" Cyrix 6x86 coma */
#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
-#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
-#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
-#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */
#ifdef CONFIG_X86_32
/*
* 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
* to avoid confusion.
*/
-#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */
#endif
-#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
-#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
-#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
-#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
-#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
-#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
-#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
-#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
-#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
-#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
-#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
-#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
-#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
-#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
-#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
-#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
-#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
-#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */
-#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
-#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
-#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */
-#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */
+#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */
+#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */
+#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */
+#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */
+#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSDBS variant of BUG_MDS */
+#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */
+#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */
+#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */
+#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */
+#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */
+#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */
+#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */
+#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */
+#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */
+#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */
/* BUG word 2 */
-#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
-#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
-#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
-#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */
+#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* "srso" AMD SRSO bug */
+#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */
+#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */
+#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index e022e6eb766c..82c6a4d350e0 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -566,6 +566,12 @@
#define MSR_RELOAD_PMC0 0x000014c1
#define MSR_RELOAD_FIXED_CTR0 0x00001309
+/* V6 PMON MSR range */
+#define MSR_IA32_PMC_V6_GP0_CTR 0x1900
+#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901
+#define MSR_IA32_PMC_V6_FX0_CTR 0x1980
+#define MSR_IA32_PMC_V6_STEP 4
+
/* KeyID partitioning between MKTME and TDX */
#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087
@@ -660,6 +666,8 @@
#define MSR_AMD64_RMP_BASE 0xc0010132
#define MSR_AMD64_RMP_END 0xc0010133
+#define MSR_SVSM_CAA 0xc001f000
+
/* AMD Collaborative Processor Performance Control MSRs */
#define MSR_AMD_CPPC_CAP1 0xc00102b0
#define MSR_AMD_CPPC_ENABLE 0xc00102b1
@@ -781,6 +789,8 @@
#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
+#define MSR_K7_HWCR_CPB_DIS_BIT 25
+#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT)
/* K6 MSRs */
#define MSR_K6_WHCR 0xc0000082
@@ -1164,6 +1174,7 @@
#define MSR_IA32_QM_CTR 0xc8e
#define MSR_IA32_PQR_ASSOC 0xc8f
#define MSR_IA32_L3_CBM_BASE 0xc90
+#define MSR_RMID_SNC_CONFIG 0xca0
#define MSR_IA32_L2_CBM_BASE 0xd10
#define MSR_IA32_MBA_THRTL_BASE 0xd50
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 9fae1b73b529..bf57a824f722 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,7 @@ struct kvm_ioapic_state {
#define KVM_RUN_X86_SMM (1 << 0)
#define KVM_RUN_X86_BUS_LOCK (1 << 1)
+#define KVM_RUN_X86_GUEST_MODE (1 << 2)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
@@ -697,6 +698,11 @@ enum sev_cmd_id {
/* Second time is the charm; improved versions of the above ioctls. */
KVM_SEV_INIT2,
+ /* SNP-specific commands */
+ KVM_SEV_SNP_LAUNCH_START = 100,
+ KVM_SEV_SNP_LAUNCH_UPDATE,
+ KVM_SEV_SNP_LAUNCH_FINISH,
+
KVM_SEV_NR_MAX,
};
@@ -824,6 +830,48 @@ struct kvm_sev_receive_update_data {
__u32 pad2;
};
+struct kvm_sev_snp_launch_start {
+ __u64 policy;
+ __u8 gosvw[16];
+ __u16 flags;
+ __u8 pad0[6];
+ __u64 pad1[4];
+};
+
+/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
+#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
+#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
+#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5
+#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6
+
+struct kvm_sev_snp_launch_update {
+ __u64 gfn_start;
+ __u64 uaddr;
+ __u64 len;
+ __u8 type;
+ __u8 pad0;
+ __u16 flags;
+ __u32 pad1;
+ __u64 pad2[4];
+};
+
+#define KVM_SEV_SNP_ID_BLOCK_SIZE 96
+#define KVM_SEV_SNP_ID_AUTH_SIZE 4096
+#define KVM_SEV_SNP_FINISH_DATA_SIZE 32
+
+struct kvm_sev_snp_launch_finish {
+ __u64 id_block_uaddr;
+ __u64 id_auth_uaddr;
+ __u8 id_block_en;
+ __u8 auth_key_en;
+ __u8 vcek_disabled;
+ __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE];
+ __u8 pad0[3];
+ __u16 flags;
+ __u64 pad1[4];
+};
+
#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
@@ -874,5 +922,6 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_SW_PROTECTED_VM 1
#define KVM_X86_SEV_VM 2
#define KVM_X86_SEV_ES_VM 3
+#define KVM_X86_SNP_VM 4
#endif /* _ASM_X86_KVM_H */
diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h
index 80e1df482337..1814b413fd57 100644
--- a/tools/arch/x86/include/uapi/asm/svm.h
+++ b/tools/arch/x86/include/uapi/asm/svm.h
@@ -115,6 +115,7 @@
#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
+#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 40ea743d139f..2ff949ea82fa 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -2489,7 +2489,7 @@ static int do_help(int argc, char **argv)
" cgroup/connect_unix | cgroup/getpeername4 | cgroup/getpeername6 |\n"
" cgroup/getpeername_unix | cgroup/getsockname4 | cgroup/getsockname6 |\n"
" cgroup/getsockname_unix | cgroup/sendmsg4 | cgroup/sendmsg6 |\n"
- " cgroup/sendmsg°unix | cgroup/recvmsg4 | cgroup/recvmsg6 | cgroup/recvmsg_unix |\n"
+ " cgroup/sendmsg_unix | cgroup/recvmsg4 | cgroup/recvmsg6 | cgroup/recvmsg_unix |\n"
" cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n"
" struct_ops | fentry | fexit | freplace | sk_lookup }\n"
" ATTACH_TYPE := { sk_msg_verdict | sk_skb_verdict | sk_skb_stream_verdict |\n"
diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 936ef95c3d32..d54aaa0619df 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -704,7 +704,7 @@ static int sets_patch(struct object *obj)
* Make sure id is at the beginning of the pairs
* struct, otherwise the below qsort would not work.
*/
- BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id);
+ BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id);
qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id);
/*
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 1e2ab148d5db..e1900abd44f6 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -149,6 +149,24 @@ FEATURE_DISPLAY ?= \
#
FEATURE_GROUP_MEMBERS-libbfd = libbfd-liberty libbfd-liberty-z
+#
+# Declare list of feature dependency packages that provide pkg-config files.
+#
+FEATURE_PKG_CONFIG ?= \
+ libtraceevent \
+ libtracefs
+
+feature_pkg_config = $(eval $(feature_pkg_config_code))
+define feature_pkg_config_code
+ FEATURE_CHECK_CFLAGS-$(1) := $(shell $(PKG_CONFIG) --cflags $(1) 2>/dev/null)
+ FEATURE_CHECK_LDFLAGS-$(1) := $(shell $(PKG_CONFIG) --libs $(1) 2>/dev/null)
+endef
+
+# Set FEATURE_CHECK_(C|LD)FLAGS-$(package) for packages using pkg-config.
+ifneq ($(PKG_CONFIG),)
+ $(foreach package,$(FEATURE_PKG_CONFIG),$(call feature_pkg_config,$(package)))
+endif
+
# Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features.
# If in the future we need per-feature checks/flags for features not
# mentioned in this list we need to refactor this ;-).
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 489cbed7e82a..12796808f07a 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -82,7 +82,30 @@ FILES= \
FILES := $(addprefix $(OUTPUT),$(FILES))
-PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config
+# Some distros provide the command $(CROSS_COMPILE)pkg-config for
+# searching packages installed with Multiarch. Use it for cross
+# compilation if it exists.
+ifneq (, $(shell which $(CROSS_COMPILE)pkg-config))
+ PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config
+else
+ PKG_CONFIG ?= pkg-config
+
+ # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR
+ # for modified system root, are required for the cross compilation.
+ # If these PKG_CONFIG environment variables are not set, Multiarch library
+ # paths are used instead.
+ ifdef CROSS_COMPILE
+ ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),)
+ CROSS_ARCH = $(shell $(CC) -dumpmachine)
+ PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/
+ export PKG_CONFIG_LIBDIR
+ endif
+ endif
+endif
all: $(FILES)
@@ -147,7 +170,17 @@ $(OUTPUT)test-libopencsd.bin:
DWARFLIBS := -ldw
ifeq ($(findstring -static,${LDFLAGS}),-static)
-DWARFLIBS += -lelf -lebl -lz -llzma -lbz2
+ DWARFLIBS += -lelf -lz -llzma -lbz2 -lzstd
+
+ LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw)
+ LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION)))
+ LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION)))
+
+ # Elfutils merged libebl.a into libdw.a starting from version 0.177,
+ # Link libebl.a only if libdw is older than this version.
+ ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0)
+ DWARFLIBS += -lebl
+ endif
endif
$(OUTPUT)test-dwarf.bin:
@@ -178,27 +211,27 @@ $(OUTPUT)test-numa_num_possible_cpus.bin:
$(BUILD) -lnuma
$(OUTPUT)test-libunwind.bin:
- $(BUILD) -lelf
+ $(BUILD) -lelf -llzma
$(OUTPUT)test-libunwind-debug-frame.bin:
- $(BUILD) -lelf
+ $(BUILD) -lelf -llzma
$(OUTPUT)test-libunwind-x86.bin:
- $(BUILD) -lelf -lunwind-x86
+ $(BUILD) -lelf -llzma -lunwind-x86
$(OUTPUT)test-libunwind-x86_64.bin:
- $(BUILD) -lelf -lunwind-x86_64
+ $(BUILD) -lelf -llzma -lunwind-x86_64
$(OUTPUT)test-libunwind-arm.bin:
- $(BUILD) -lelf -lunwind-arm
+ $(BUILD) -lelf -llzma -lunwind-arm
$(OUTPUT)test-libunwind-aarch64.bin:
- $(BUILD) -lelf -lunwind-aarch64
+ $(BUILD) -lelf -llzma -lunwind-aarch64
$(OUTPUT)test-libunwind-debug-frame-arm.bin:
- $(BUILD) -lelf -lunwind-arm
+ $(BUILD) -lelf -llzma -lunwind-arm
$(OUTPUT)test-libunwind-debug-frame-aarch64.bin:
- $(BUILD) -lelf -lunwind-aarch64
+ $(BUILD) -lelf -llzma -lunwind-aarch64
$(OUTPUT)test-libaudit.bin:
$(BUILD) -laudit
diff --git a/tools/include/asm/rwonce.h b/tools/include/asm/rwonce.h
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/tools/include/asm/rwonce.h
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 210c13b1b857..2a7f260ef9dc 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -19,7 +19,7 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits);
bool __bitmap_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits);
-void bitmap_clear(unsigned long *map, unsigned int start, int len);
+void __bitmap_clear(unsigned long *map, unsigned int start, int len);
bool __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits);
@@ -150,4 +150,19 @@ static inline bool bitmap_intersects(const unsigned long *src1,
return __bitmap_intersects(src1, src2, nbits);
}
+static inline void bitmap_clear(unsigned long *map, unsigned int start,
+ unsigned int nbits)
+{
+ if (__builtin_constant_p(nbits) && nbits == 1)
+ __clear_bit(start, map);
+ else if (small_const_nbits(start + nbits))
+ *map &= ~GENMASK(start + nbits - 1, start);
+ else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
+ IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
+ __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
+ IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
+ memset((char *)map + start / 8, 0, nbits / 8);
+ else
+ __bitmap_clear(map, start, nbits);
+}
#endif /* _TOOLS_LINUX_BITMAP_H */
diff --git a/tools/include/uapi/README b/tools/include/uapi/README
new file mode 100644
index 000000000000..7147b1b2cb28
--- /dev/null
+++ b/tools/include/uapi/README
@@ -0,0 +1,73 @@
+Why do we want a copy of kernel headers in tools?
+=================================================
+
+There used to be no copies, with tools/ code using kernel headers
+directly. From time to time tools/perf/ broke due to legitimate kernel
+hacking. At some point Linus complained about such direct usage. Then we
+adopted the current model.
+
+The ways these headers are used in perf are not restricted to just
+including them to compile something.
+
+They are sometimes used in scripts that convert defines into string
+tables, etc., so some change may break one of these scripts, or new MSRs
+may use some different #define pattern, etc.
+
+E.g.:
+
+ $ ls -1 tools/perf/trace/beauty/*.sh | head -5
+ tools/perf/trace/beauty/arch_errno_names.sh
+ tools/perf/trace/beauty/drm_ioctl.sh
+ tools/perf/trace/beauty/fadvise.sh
+ tools/perf/trace/beauty/fsconfig.sh
+ tools/perf/trace/beauty/fsmount.sh
+ $
+ $ tools/perf/trace/beauty/fadvise.sh
+ static const char *fadvise_advices[] = {
+ [0] = "NORMAL",
+ [1] = "RANDOM",
+ [2] = "SEQUENTIAL",
+ [3] = "WILLNEED",
+ [4] = "DONTNEED",
+ [5] = "NOREUSE",
+ };
+ $
+
+The tools/perf/check-headers.sh script, part of the tools/ build
+process, points out changes in the original files.
+
+So it's important not to touch the copies in tools/ when making changes to
+the original kernel headers; that will be done later, when
+check-headers.sh informs the perf tools hackers about the change.
+
+Another explanation from Ingo Molnar:
+It's better than all the alternatives we tried so far:
+
+ - Symbolic links and direct #includes: this was the original approach but
+ was pushed back on from the kernel side, when tooling modified the
+ headers and broke them accidentally for kernel builds.
+
+ - Duplicate self-defined ABI headers like glibc: double the maintenance
+ burden, double the chance for mistakes, plus there's no tech-driven
+ notification mechanism to look at new kernel side changes.
+
+What we are doing now is a third option:
+
+ - A software-enforced copy-on-write mechanism of kernel headers to
+ tooling, driven by non-fatal warnings on the tooling side build when
+ kernel headers get modified:
+
+ Warning: Kernel ABI header differences:
+ diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h
+ diff -u tools/include/uapi/linux/fs.h include/uapi/linux/fs.h
+ diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h
+ ...
+
+ The tooling policy is to always pick up the kernel side headers as-is,
+ and integrate them into the tooling build. The warnings above serve as a
+ notification to tooling maintainers that there are changes on the kernel
+ side.
+
+We've been using this for many years now, and it might seem hacky, but
+works surprisingly well.
+
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index a00d53d02723..5bf6148cac2b 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -737,7 +737,7 @@ __SC_COMP(__NR_pselect6_time64, sys_pselect6, compat_sys_pselect6_time64)
#define __NR_ppoll_time64 414
__SC_COMP(__NR_ppoll_time64, sys_ppoll, compat_sys_ppoll_time64)
#define __NR_io_pgetevents_time64 416
-__SYSCALL(__NR_io_pgetevents_time64, sys_io_pgetevents)
+__SC_COMP(__NR_io_pgetevents_time64, sys_io_pgetevents, compat_sys_io_pgetevents_time64)
#define __NR_recvmmsg_time64 417
__SC_COMP(__NR_recvmmsg_time64, sys_recvmmsg, compat_sys_recvmmsg_time64)
#define __NR_mq_timedsend_time64 418
diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h
index d4d86e566e07..535cb68fdb5c 100644
--- a/tools/include/uapi/drm/i915_drm.h
+++ b/tools/include/uapi/drm/i915_drm.h
@@ -2163,6 +2163,15 @@ struct drm_i915_gem_context_param {
* supports this per context flag.
*/
#define I915_CONTEXT_PARAM_LOW_LATENCY 0xe
+
+/*
+ * I915_CONTEXT_PARAM_CONTEXT_IMAGE:
+ *
+ * Allows userspace to provide own context images.
+ *
+ * Note that this is a debug API not available on production kernel builds.
+ */
+#define I915_CONTEXT_PARAM_CONTEXT_IMAGE 0xf
/* Must be kept compact -- no holes and well documented */
/** @value: Context parameter value to be set or queried */
@@ -2564,6 +2573,24 @@ struct i915_context_param_engines {
struct i915_engine_class_instance engines[N__]; \
} __attribute__((packed)) name__
+struct i915_gem_context_param_context_image {
+ /** @engine: Engine class & instance to be configured. */
+ struct i915_engine_class_instance engine;
+
+ /** @flags: One of the supported flags or zero. */
+ __u32 flags;
+#define I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX (1u << 0)
+
+ /** @size: Size of the image blob pointed to by @image. */
+ __u32 size;
+
+ /** @mbz: Must be zero. */
+ __u32 mbz;
+
+ /** @image: Userspace memory containing the context image. */
+ __u64 image;
+} __attribute__((packed));
+
/**
* struct drm_i915_gem_context_create_ext_setparam - Context parameter
* to set or query during context creation.
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
new file mode 100644
index 000000000000..8a27bc5c7a7f
--- /dev/null
+++ b/tools/include/uapi/linux/fs.h
@@ -0,0 +1,552 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_FS_H
+#define _UAPI_LINUX_FS_H
+
+/*
+ * This file has definitions for some important file table structures
+ * and constants and structures used by various generic file system
+ * ioctl's. Please do not make any changes in this file before
+ * sending patches for review to linux-fsdevel@vger.kernel.org and
+ * linux-api@vger.kernel.org.
+ */
+
+#include <linux/limits.h>
+#include <linux/ioctl.h>
+#include <linux/types.h>
+#ifndef __KERNEL__
+#include <linux/fscrypt.h>
+#endif
+
+/* Use of MS_* flags within the kernel is restricted to core mount(2) code. */
+#if !defined(__KERNEL__)
+#include <linux/mount.h>
+#endif
+
+/*
+ * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
+ * the file limit at runtime and only root can increase the per-process
+ * nr_file rlimit, so it's safe to set up a ridiculously high absolute
+ * upper limit on files-per-process.
+ *
+ * Some programs (notably those using select()) may have to be
+ * recompiled to take full advantage of the new limits..
+ */
+
+/* Fixed constants first: */
+#undef NR_OPEN
+#define INR_OPEN_CUR 1024 /* Initial setting for nfile rlimits */
+#define INR_OPEN_MAX 4096 /* Hard limit for nfile rlimits */
+
+#define BLOCK_SIZE_BITS 10
+#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
+
+#define SEEK_SET 0 /* seek relative to beginning of file */
+#define SEEK_CUR 1 /* seek relative to current file position */
+#define SEEK_END 2 /* seek relative to end of file */
+#define SEEK_DATA 3 /* seek to the next data */
+#define SEEK_HOLE 4 /* seek to the next hole */
+#define SEEK_MAX SEEK_HOLE
+
+#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */
+#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */
+#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */
+
+struct file_clone_range {
+ __s64 src_fd;
+ __u64 src_offset;
+ __u64 src_length;
+ __u64 dest_offset;
+};
+
+struct fstrim_range {
+ __u64 start;
+ __u64 len;
+ __u64 minlen;
+};
+
+/*
+ * We include a length field because some filesystems (vfat) have an identifier
+ * that we do want to expose as a UUID, but doesn't have the standard length.
+ *
+ * We use a fixed size buffer beacuse this interface will, by fiat, never
+ * support "UUIDs" longer than 16 bytes; we don't want to force all downstream
+ * users to have to deal with that.
+ */
+struct fsuuid2 {
+ __u8 len;
+ __u8 uuid[16];
+};
+
+struct fs_sysfs_path {
+ __u8 len;
+ __u8 name[128];
+};
+
+/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
+#define FILE_DEDUPE_RANGE_SAME 0
+#define FILE_DEDUPE_RANGE_DIFFERS 1
+
+/* from struct btrfs_ioctl_file_extent_same_info */
+struct file_dedupe_range_info {
+ __s64 dest_fd; /* in - destination file */
+ __u64 dest_offset; /* in - start of extent in destination */
+ __u64 bytes_deduped; /* out - total # of bytes we were able
+ * to dedupe from this file. */
+ /* status of this dedupe operation:
+ * < 0 for error
+ * == FILE_DEDUPE_RANGE_SAME if dedupe succeeds
+ * == FILE_DEDUPE_RANGE_DIFFERS if data differs
+ */
+ __s32 status; /* out - see above description */
+ __u32 reserved; /* must be zero */
+};
+
+/* from struct btrfs_ioctl_file_extent_same_args */
+struct file_dedupe_range {
+ __u64 src_offset; /* in - start of extent in source */
+ __u64 src_length; /* in - length of extent */
+ __u16 dest_count; /* in - total elements in info array */
+ __u16 reserved1; /* must be zero */
+ __u32 reserved2; /* must be zero */
+ struct file_dedupe_range_info info[];
+};
+
+/* And dynamically-tunable limits and defaults: */
+struct files_stat_struct {
+ unsigned long nr_files; /* read only */
+ unsigned long nr_free_files; /* read only */
+ unsigned long max_files; /* tunable */
+};
+
+struct inodes_stat_t {
+ long nr_inodes;
+ long nr_unused;
+ long dummy[5]; /* padding for sysctl ABI compatibility */
+};
+
+
+#define NR_FILE 8192 /* this can well be larger on a larger system */
+
+/*
+ * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+ __u32 fsx_xflags; /* xflags field value (get/set) */
+ __u32 fsx_extsize; /* extsize field value (get/set)*/
+ __u32 fsx_nextents; /* nextents field value (get) */
+ __u32 fsx_projid; /* project identifier (get/set) */
+ __u32 fsx_cowextsize; /* CoW extsize field value (get/set)*/
+ unsigned char fsx_pad[8];
+};
+
+/*
+ * Flags for the fsx_xflags field
+ */
+#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
+#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
+#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
+#define FS_XFLAG_APPEND 0x00000010 /* all writes append */
+#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
+#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */
+#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
+#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
+#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
+#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
+#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
+#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
+#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
+#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
+#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */
+#define FS_XFLAG_COWEXTSIZE 0x00010000 /* CoW extent size allocator hint */
+#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
+
+/* the read-only stuff doesn't really belong here, but any other place is
+ probably as bad and I don't want to create yet another include file. */
+
+#define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */
+#define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */
+#define BLKRRPART _IO(0x12,95) /* re-read partition table */
+#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */
+#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */
+#define BLKRASET _IO(0x12,98) /* set read ahead for block device */
+#define BLKRAGET _IO(0x12,99) /* get current read ahead setting */
+#define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
+#define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
+#define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
+#define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
+#define BLKSSZGET _IO(0x12,104)/* get block device sector size */
+#if 0
+#define BLKPG _IO(0x12,105)/* See blkpg.h */
+
+/* Some people are morons. Do not use sizeof! */
+
+#define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */
+#define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */
+/* This was here just to show that the number is taken -
+ probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
+#endif
+/* A jump here: 108-111 have been used for various private purposes. */
+#define BLKBSZGET _IOR(0x12,112,size_t)
+#define BLKBSZSET _IOW(0x12,113,size_t)
+#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
+#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
+#define BLKTRACESTART _IO(0x12,116)
+#define BLKTRACESTOP _IO(0x12,117)
+#define BLKTRACETEARDOWN _IO(0x12,118)
+#define BLKDISCARD _IO(0x12,119)
+#define BLKIOMIN _IO(0x12,120)
+#define BLKIOOPT _IO(0x12,121)
+#define BLKALIGNOFF _IO(0x12,122)
+#define BLKPBSZGET _IO(0x12,123)
+#define BLKDISCARDZEROES _IO(0x12,124)
+#define BLKSECDISCARD _IO(0x12,125)
+#define BLKROTATIONAL _IO(0x12,126)
+#define BLKZEROOUT _IO(0x12,127)
+#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
+/*
+ * A jump here: 130-136 are reserved for zoned block devices
+ * (see uapi/linux/blkzoned.h)
+ */
+
+#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
+#define FIBMAP _IO(0x00,1) /* bmap access */
+#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
+#define FIFREEZE _IOWR('X', 119, int) /* Freeze */
+#define FITHAW _IOWR('X', 120, int) /* Thaw */
+#define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */
+#define FICLONE _IOW(0x94, 9, int)
+#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range)
+#define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range)
+
+#define FSLABEL_MAX 256 /* Max chars for the interface; each fs may differ */
+
+#define FS_IOC_GETFLAGS _IOR('f', 1, long)
+#define FS_IOC_SETFLAGS _IOW('f', 2, long)
+#define FS_IOC_GETVERSION _IOR('v', 1, long)
+#define FS_IOC_SETVERSION _IOW('v', 2, long)
+#define FS_IOC_FIEMAP _IOWR('f', 11, struct fiemap)
+#define FS_IOC32_GETFLAGS _IOR('f', 1, int)
+#define FS_IOC32_SETFLAGS _IOW('f', 2, int)
+#define FS_IOC32_GETVERSION _IOR('v', 1, int)
+#define FS_IOC32_SETVERSION _IOW('v', 2, int)
+#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr)
+#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr)
+#define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX])
+#define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX])
+/* Returns the external filesystem UUID, the same one blkid returns */
+#define FS_IOC_GETFSUUID _IOR(0x15, 0, struct fsuuid2)
+/*
+ * Returns the path component under /sys/fs/ that refers to this filesystem;
+ * also /sys/kernel/debug/ for filesystems with debugfs exports
+ */
+#define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path)
+
+/*
+ * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
+ *
+ * Note: for historical reasons, these flags were originally used and
+ * defined for use by ext2/ext3, and then other file systems started
+ * using these flags so they wouldn't need to write their own version
+ * of chattr/lsattr (which was shipped as part of e2fsprogs). You
+ * should think twice before trying to use these flags in new
+ * contexts, or trying to assign these flags, since they are used both
+ * as the UAPI and the on-disk encoding for ext2/3/4. Also, we are
+ * almost out of 32-bit flags. :-)
+ *
+ * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from
+ * XFS to the generic FS level interface. This uses a structure that
+ * has padding and hence has more room to grow, so it may be more
+ * appropriate for many new use cases.
+ *
+ * Please do not change these flags or interfaces before checking with
+ * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org.
+ */
+#define FS_SECRM_FL 0x00000001 /* Secure deletion */
+#define FS_UNRM_FL 0x00000002 /* Undelete */
+#define FS_COMPR_FL 0x00000004 /* Compress file */
+#define FS_SYNC_FL 0x00000008 /* Synchronous updates */
+#define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define FS_APPEND_FL 0x00000020 /* writes to file may only append */
+#define FS_NODUMP_FL 0x00000040 /* do not dump file */
+#define FS_NOATIME_FL 0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define FS_DIRTY_FL 0x00000100
+#define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
+#define FS_NOCOMP_FL 0x00000400 /* Don't compress */
+/* End compression flags --- maybe not all used */
+#define FS_ENCRYPT_FL 0x00000800 /* Encrypted file */
+#define FS_BTREE_FL 0x00001000 /* btree format dir */
+#define FS_INDEX_FL 0x00001000 /* hash-indexed directory */
+#define FS_IMAGIC_FL 0x00002000 /* AFS directory */
+#define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */
+#define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */
+#define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+#define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+#define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */
+#define FS_EXTENT_FL 0x00080000 /* Extents */
+#define FS_VERITY_FL 0x00100000 /* Verity protected inode */
+#define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */
+#define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */
+#define FS_NOCOW_FL 0x00800000 /* Do not cow file */
+#define FS_DAX_FL 0x02000000 /* Inode is DAX */
+#define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */
+#define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
+#define FS_CASEFOLD_FL 0x40000000 /* Folder is case insensitive */
+#define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */
+
+#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
+#define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+
+
+#define SYNC_FILE_RANGE_WAIT_BEFORE 1
+#define SYNC_FILE_RANGE_WRITE 2
+#define SYNC_FILE_RANGE_WAIT_AFTER 4
+#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \
+ SYNC_FILE_RANGE_WAIT_BEFORE | \
+ SYNC_FILE_RANGE_WAIT_AFTER)
+
+/*
+ * Flags for preadv2/pwritev2:
+ */
+
+typedef int __bitwise __kernel_rwf_t;
+
+/* high priority request, poll if possible */
+#define RWF_HIPRI ((__force __kernel_rwf_t)0x00000001)
+
+/* per-IO O_DSYNC */
+#define RWF_DSYNC ((__force __kernel_rwf_t)0x00000002)
+
+/* per-IO O_SYNC */
+#define RWF_SYNC ((__force __kernel_rwf_t)0x00000004)
+
+/* per-IO, return -EAGAIN if operation would block */
+#define RWF_NOWAIT ((__force __kernel_rwf_t)0x00000008)
+
+/* per-IO O_APPEND */
+#define RWF_APPEND ((__force __kernel_rwf_t)0x00000010)
+
+/* per-IO negation of O_APPEND */
+#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020)
+
+/* mask of flags supported by the kernel */
+#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
+ RWF_APPEND | RWF_NOAPPEND)
+
+#define PROCFS_IOCTL_MAGIC 'f'
+
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg)
+
+/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */
+#define PAGE_IS_WPALLOWED (1 << 0)
+#define PAGE_IS_WRITTEN (1 << 1)
+#define PAGE_IS_FILE (1 << 2)
+#define PAGE_IS_PRESENT (1 << 3)
+#define PAGE_IS_SWAPPED (1 << 4)
+#define PAGE_IS_PFNZERO (1 << 5)
+#define PAGE_IS_HUGE (1 << 6)
+#define PAGE_IS_SOFT_DIRTY (1 << 7)
+
+/*
+ * struct page_region - Page region with flags
+ * @start: Start of the region
+ * @end: End of the region (exclusive)
+ * @categories: PAGE_IS_* category bitmask for the region
+ */
+struct page_region {
+ __u64 start;
+ __u64 end;
+ __u64 categories;
+};
+
+/* Flags for PAGEMAP_SCAN ioctl */
+#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */
+#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */
+
+/*
+ * struct pm_scan_arg - Pagemap ioctl argument
+ * @size: Size of the structure
+ * @flags: Flags for the IOCTL
+ * @start: Starting address of the region
+ * @end: Ending address of the region
+ * @walk_end Address where the scan stopped (written by kernel).
+ * walk_end == end (address tags cleared) informs that the scan completed on entire range.
+ * @vec: Address of page_region struct array for output
+ * @vec_len: Length of the page_region struct array
+ * @max_pages: Optional limit for number of returned pages (0 = disabled)
+ * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1
+ * @category_mask: Skip pages for which any category doesn't match
+ * @category_anyof_mask: Skip pages for which no category matches
+ * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned
+ */
+struct pm_scan_arg {
+ __u64 size;
+ __u64 flags;
+ __u64 start;
+ __u64 end;
+ __u64 walk_end;
+ __u64 vec;
+ __u64 vec_len;
+ __u64 max_pages;
+ __u64 category_inverted;
+ __u64 category_mask;
+ __u64 category_anyof_mask;
+ __u64 return_mask;
+};
+
+/* /proc/<pid>/maps ioctl */
+#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query)
+
+enum procmap_query_flags {
+ /*
+ * VMA permission flags.
+ *
+ * Can be used as part of procmap_query.query_flags field to look up
+ * only VMAs satisfying specified subset of permissions. E.g., specifying
+ * PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs,
+ * while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only
+ * return read/write VMAs, though both executable/non-executable and
+ * private/shared will be ignored.
+ *
+ * PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags
+ * field to specify actual VMA permissions.
+ */
+ PROCMAP_QUERY_VMA_READABLE = 0x01,
+ PROCMAP_QUERY_VMA_WRITABLE = 0x02,
+ PROCMAP_QUERY_VMA_EXECUTABLE = 0x04,
+ PROCMAP_QUERY_VMA_SHARED = 0x08,
+ /*
+ * Query modifier flags.
+ *
+ * By default VMA that covers provided address is returned, or -ENOENT
+ * is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest
+ * VMA with vma_start > addr will be returned if no covering VMA is
+ * found.
+ *
+ * PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that
+ * have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
+ * to iterate all VMAs with file backing.
+ */
+ PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10,
+ PROCMAP_QUERY_FILE_BACKED_VMA = 0x20,
+};
+
+/*
+ * Input/output argument structured passed into ioctl() call. It can be used
+ * to query a set of VMAs (Virtual Memory Areas) of a process.
+ *
+ * Each field can be one of three kinds, marked in a short comment to the
+ * right of the field:
+ * - "in", input argument, user has to provide this value, kernel doesn't modify it;
+ * - "out", output argument, kernel sets this field with VMA data;
+ * - "in/out", input and output argument; user provides initial value (used
+ * to specify maximum allowable buffer size), and kernel sets it to actual
+ * amount of data written (or zero, if there is no data).
+ *
+ * If matching VMA is found (according to criterias specified by
+ * query_addr/query_flags, all the out fields are filled out, and ioctl()
+ * returns 0. If there is no matching VMA, -ENOENT will be returned.
+ * In case of any other error, negative error code other than -ENOENT is
+ * returned.
+ *
+ * Most of the data is similar to the one returned as text in /proc/<pid>/maps
+ * file, but procmap_query provides more querying flexibility. There are no
+ * consistency guarantees between subsequent ioctl() calls, but data returned
+ * for matched VMA is self-consistent.
+ */
+struct procmap_query {
+ /* Query struct size, for backwards/forward compatibility */
+ __u64 size;
+ /*
+ * Query flags, a combination of enum procmap_query_flags values.
+ * Defines query filtering and behavior, see enum procmap_query_flags.
+ *
+ * Input argument, provided by user. Kernel doesn't modify it.
+ */
+ __u64 query_flags; /* in */
+ /*
+ * Query address. By default, VMA that covers this address will
+ * be looked up. PROCMAP_QUERY_* flags above modify this default
+ * behavior further.
+ *
+ * Input argument, provided by user. Kernel doesn't modify it.
+ */
+ __u64 query_addr; /* in */
+ /* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */
+ __u64 vma_start; /* out */
+ __u64 vma_end; /* out */
+ /* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */
+ __u64 vma_flags; /* out */
+ /* VMA backing page size granularity. */
+ __u64 vma_page_size; /* out */
+ /*
+ * VMA file offset. If VMA has file backing, this specifies offset
+ * within the file that VMA's start address corresponds to.
+ * Is set to zero if VMA has no backing file.
+ */
+ __u64 vma_offset; /* out */
+ /* Backing file's inode number, or zero, if VMA has no backing file. */
+ __u64 inode; /* out */
+ /* Backing file's device major/minor number, or zero, if VMA has no backing file. */
+ __u32 dev_major; /* out */
+ __u32 dev_minor; /* out */
+ /*
+ * If set to non-zero value, signals the request to return VMA name
+ * (i.e., VMA's backing file's absolute path, with " (deleted)" suffix
+ * appended, if file was unlinked from FS) for matched VMA. VMA name
+ * can also be some special name (e.g., "[heap]", "[stack]") or could
+ * be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME).
+ *
+ * Kernel will set this field to zero, if VMA has no associated name.
+ * Otherwise kernel will return actual amount of bytes filled in
+ * user-supplied buffer (see vma_name_addr field below), including the
+ * terminating zero.
+ *
+ * If VMA name is longer that user-supplied maximum buffer size,
+ * -E2BIG error is returned.
+ *
+ * If this field is set to non-zero value, vma_name_addr should point
+ * to valid user space memory buffer of at least vma_name_size bytes.
+ * If set to zero, vma_name_addr should be set to zero as well
+ */
+ __u32 vma_name_size; /* in/out */
+ /*
+ * If set to non-zero value, signals the request to extract and return
+ * VMA's backing file's build ID, if the backing file is an ELF file
+ * and it contains embedded build ID.
+ *
+ * Kernel will set this field to zero, if VMA has no backing file,
+ * backing file is not an ELF file, or ELF file has no build ID
+ * embedded.
+ *
+ * Build ID is a binary value (not a string). Kernel will set
+ * build_id_size field to exact number of bytes used for build ID.
+ * If build ID is requested and present, but needs more bytes than
+ * user-supplied maximum buffer size (see build_id_addr field below),
+ * -E2BIG error will be returned.
+ *
+ * If this field is set to non-zero value, build_id_addr should point
+ * to valid user space memory buffer of at least build_id_size bytes.
+ * If set to zero, build_id_addr should be set to zero as well
+ */
+ __u32 build_id_size; /* in/out */
+ /*
+ * User-supplied address of a buffer of at least vma_name_size bytes
+ * for kernel to fill with matched VMA's name (see vma_name_size field
+ * description above for details).
+ *
+ * Should be set to zero if VMA name should not be returned.
+ */
+ __u64 vma_name_addr; /* in */
+ /*
+ * User-supplied address of a buffer of at least build_id_size bytes
+ * for kernel to fill with matched VMA's ELF build ID, if available
+ * (see build_id_size field description above for details).
+ *
+ * Should be set to zero if build ID should not be returned.
+ */
+ __u64 build_id_addr; /* in */
+};
+
+#endif /* _UAPI_LINUX_FS_H */
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 638c606dfa74..2f082b01ff22 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -41,6 +41,10 @@
*/
#define XDP_UMEM_TX_SW_CSUM (1 << 1)
+/* Request to reserve tx_metadata_len bytes of per-chunk metadata.
+ */
+#define XDP_UMEM_TX_METADATA_LEN (1 << 2)
+
struct sockaddr_xdp {
__u16 sxdp_family;
__u16 sxdp_flags;
diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h
index e682ab628dfa..d358add1611c 100644
--- a/tools/include/uapi/linux/in.h
+++ b/tools/include/uapi/linux/in.h
@@ -81,6 +81,8 @@ enum {
#define IPPROTO_ETHERNET IPPROTO_ETHERNET
IPPROTO_RAW = 255, /* Raw IP packets */
#define IPPROTO_RAW IPPROTO_RAW
+ IPPROTO_SMC = 256, /* Shared Memory Communications */
+#define IPPROTO_SMC IPPROTO_SMC
IPPROTO_MPTCP = 262, /* Multipath TCP connection */
#define IPPROTO_MPTCP IPPROTO_MPTCP
IPPROTO_MAX
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index d03842abae57..637efc055145 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -192,11 +192,24 @@ struct kvm_xen_exit {
/* Flags that describe what fields in emulation_failure hold valid data. */
#define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0)
+/*
+ * struct kvm_run can be modified by userspace at any time, so KVM must be
+ * careful to avoid TOCTOU bugs. In order to protect KVM, HINT_UNSAFE_IN_KVM()
+ * renames fields in struct kvm_run from <symbol> to <symbol>__unsafe when
+ * compiled into the kernel, ensuring that any use within KVM is obvious and
+ * gets extra scrutiny.
+ */
+#ifdef __KERNEL__
+#define HINT_UNSAFE_IN_KVM(_symbol) _symbol##__unsafe
+#else
+#define HINT_UNSAFE_IN_KVM(_symbol) _symbol
+#endif
+
/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
struct kvm_run {
/* in */
__u8 request_interrupt_window;
- __u8 immediate_exit;
+ __u8 HINT_UNSAFE_IN_KVM(immediate_exit);
__u8 padding1[6];
/* out */
@@ -917,6 +930,9 @@ struct kvm_enable_cap {
#define KVM_CAP_MEMORY_ATTRIBUTES 233
#define KVM_CAP_GUEST_MEMFD 234
#define KVM_CAP_VM_TYPES 235
+#define KVM_CAP_PRE_FAULT_MEMORY 236
+#define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
+#define KVM_CAP_X86_GUEST_MODE 238
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@@ -1548,4 +1564,13 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};
+#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
+
+struct kvm_pre_fault_memory {
+ __u64 gpa;
+ __u64 size;
+ __u64 flags;
+ __u64 padding[5];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h
index a246e11988d5..e89d00528f2f 100644
--- a/tools/include/uapi/linux/mman.h
+++ b/tools/include/uapi/linux/mman.h
@@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
+#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */
/*
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 3a64499b0f5d..4842c36fdf80 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -1349,12 +1349,14 @@ union perf_mem_data_src {
#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */
#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */
#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */
-/* 5-0x7 available */
+#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */
+#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */
+/* 0x7 available */
#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */
#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */
#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */
#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
-#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */
+#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */
#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */
#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */
#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
new file mode 100644
index 000000000000..35791791a879
--- /dev/null
+++ b/tools/include/uapi/linux/prctl.h
@@ -0,0 +1,331 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_PRCTL_H
+#define _LINUX_PRCTL_H
+
+#include <linux/types.h>
+
+/* Values to pass as first argument to prctl() */
+
+#define PR_SET_PDEATHSIG 1 /* Second arg is a signal */
+#define PR_GET_PDEATHSIG 2 /* Second arg is a ptr to return the signal */
+
+/* Get/set current->mm->dumpable */
+#define PR_GET_DUMPABLE 3
+#define PR_SET_DUMPABLE 4
+
+/* Get/set unaligned access control bits (if meaningful) */
+#define PR_GET_UNALIGN 5
+#define PR_SET_UNALIGN 6
+# define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */
+# define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */
+
+/* Get/set whether or not to drop capabilities on setuid() away from
+ * uid 0 (as per security/commoncap.c) */
+#define PR_GET_KEEPCAPS 7
+#define PR_SET_KEEPCAPS 8
+
+/* Get/set floating-point emulation control bits (if meaningful) */
+#define PR_GET_FPEMU 9
+#define PR_SET_FPEMU 10
+# define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */
+# define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */
+
+/* Get/set floating-point exception mode (if meaningful) */
+#define PR_GET_FPEXC 11
+#define PR_SET_FPEXC 12
+# define PR_FP_EXC_SW_ENABLE 0x80 /* Use FPEXC for FP exception enables */
+# define PR_FP_EXC_DIV 0x010000 /* floating point divide by zero */
+# define PR_FP_EXC_OVF 0x020000 /* floating point overflow */
+# define PR_FP_EXC_UND 0x040000 /* floating point underflow */
+# define PR_FP_EXC_RES 0x080000 /* floating point inexact result */
+# define PR_FP_EXC_INV 0x100000 /* floating point invalid operation */
+# define PR_FP_EXC_DISABLED 0 /* FP exceptions disabled */
+# define PR_FP_EXC_NONRECOV 1 /* async non-recoverable exc. mode */
+# define PR_FP_EXC_ASYNC 2 /* async recoverable exception mode */
+# define PR_FP_EXC_PRECISE 3 /* precise exception mode */
+
+/* Get/set whether we use statistical process timing or accurate timestamp
+ * based process timing */
+#define PR_GET_TIMING 13
+#define PR_SET_TIMING 14
+# define PR_TIMING_STATISTICAL 0 /* Normal, traditional,
+ statistical process timing */
+# define PR_TIMING_TIMESTAMP 1 /* Accurate timestamp based
+ process timing */
+
+#define PR_SET_NAME 15 /* Set process name */
+#define PR_GET_NAME 16 /* Get process name */
+
+/* Get/set process endian */
+#define PR_GET_ENDIAN 19
+#define PR_SET_ENDIAN 20
+# define PR_ENDIAN_BIG 0
+# define PR_ENDIAN_LITTLE 1 /* True little endian mode */
+# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */
+
+/* Get/set process seccomp mode */
+#define PR_GET_SECCOMP 21
+#define PR_SET_SECCOMP 22
+
+/* Get/set the capability bounding set (as per security/commoncap.c) */
+#define PR_CAPBSET_READ 23
+#define PR_CAPBSET_DROP 24
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
+
+/* Get/set securebits (as per security/commoncap.c) */
+#define PR_GET_SECUREBITS 27
+#define PR_SET_SECUREBITS 28
+
+/*
+ * Get/set the timerslack as used by poll/select/nanosleep
+ * A value of 0 means "use default"
+ */
+#define PR_SET_TIMERSLACK 29
+#define PR_GET_TIMERSLACK 30
+
+#define PR_TASK_PERF_EVENTS_DISABLE 31
+#define PR_TASK_PERF_EVENTS_ENABLE 32
+
+/*
+ * Set early/late kill mode for hwpoison memory corruption.
+ * This influences when the process gets killed on a memory corruption.
+ */
+#define PR_MCE_KILL 33
+# define PR_MCE_KILL_CLEAR 0
+# define PR_MCE_KILL_SET 1
+
+# define PR_MCE_KILL_LATE 0
+# define PR_MCE_KILL_EARLY 1
+# define PR_MCE_KILL_DEFAULT 2
+
+#define PR_MCE_KILL_GET 34
+
+/*
+ * Tune up process memory map specifics.
+ */
+#define PR_SET_MM 35
+# define PR_SET_MM_START_CODE 1
+# define PR_SET_MM_END_CODE 2
+# define PR_SET_MM_START_DATA 3
+# define PR_SET_MM_END_DATA 4
+# define PR_SET_MM_START_STACK 5
+# define PR_SET_MM_START_BRK 6
+# define PR_SET_MM_BRK 7
+# define PR_SET_MM_ARG_START 8
+# define PR_SET_MM_ARG_END 9
+# define PR_SET_MM_ENV_START 10
+# define PR_SET_MM_ENV_END 11
+# define PR_SET_MM_AUXV 12
+# define PR_SET_MM_EXE_FILE 13
+# define PR_SET_MM_MAP 14
+# define PR_SET_MM_MAP_SIZE 15
+
+/*
+ * This structure provides new memory descriptor
+ * map which mostly modifies /proc/pid/stat[m]
+ * output for a task. This mostly done in a
+ * sake of checkpoint/restore functionality.
+ */
+struct prctl_mm_map {
+ __u64 start_code; /* code section bounds */
+ __u64 end_code;
+ __u64 start_data; /* data section bounds */
+ __u64 end_data;
+ __u64 start_brk; /* heap for brk() syscall */
+ __u64 brk;
+ __u64 start_stack; /* stack starts at */
+ __u64 arg_start; /* command line arguments bounds */
+ __u64 arg_end;
+ __u64 env_start; /* environment variables bounds */
+ __u64 env_end;
+ __u64 *auxv; /* auxiliary vector */
+ __u32 auxv_size; /* vector size */
+ __u32 exe_fd; /* /proc/$pid/exe link file */
+};
+
+/*
+ * Set specific pid that is allowed to ptrace the current task.
+ * A value of 0 mean "no process".
+ */
+#define PR_SET_PTRACER 0x59616d61
+# define PR_SET_PTRACER_ANY ((unsigned long)-1)
+
+#define PR_SET_CHILD_SUBREAPER 36
+#define PR_GET_CHILD_SUBREAPER 37
+
+/*
+ * If no_new_privs is set, then operations that grant new privileges (i.e.
+ * execve) will either fail or not grant them. This affects suid/sgid,
+ * file capabilities, and LSMs.
+ *
+ * Operations that merely manipulate or drop existing privileges (setresuid,
+ * capset, etc.) will still work. Drop those privileges if you want them gone.
+ *
+ * Changing LSM security domain is considered a new privilege. So, for example,
+ * asking selinux for a specific new context (e.g. with runcon) will result
+ * in execve returning -EPERM.
+ *
+ * See Documentation/userspace-api/no_new_privs.rst for more details.
+ */
+#define PR_SET_NO_NEW_PRIVS 38
+#define PR_GET_NO_NEW_PRIVS 39
+
+#define PR_GET_TID_ADDRESS 40
+
+#define PR_SET_THP_DISABLE 41
+#define PR_GET_THP_DISABLE 42
+
+/*
+ * No longer implemented, but left here to ensure the numbers stay reserved:
+ */
+#define PR_MPX_ENABLE_MANAGEMENT 43
+#define PR_MPX_DISABLE_MANAGEMENT 44
+
+#define PR_SET_FP_MODE 45
+#define PR_GET_FP_MODE 46
+# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */
+# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */
+
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+
+/* arm64 Scalable Vector Extension controls */
+/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
+#define PR_SVE_SET_VL 50 /* set task vector length */
+# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */
+#define PR_SVE_GET_VL 51 /* get task vector length */
+/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
+# define PR_SVE_VL_LEN_MASK 0xffff
+# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
+
+/* Per task speculation control */
+#define PR_GET_SPECULATION_CTRL 52
+#define PR_SET_SPECULATION_CTRL 53
+/* Speculation control variants */
+# define PR_SPEC_STORE_BYPASS 0
+# define PR_SPEC_INDIRECT_BRANCH 1
+# define PR_SPEC_L1D_FLUSH 2
+/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
+# define PR_SPEC_NOT_AFFECTED 0
+# define PR_SPEC_PRCTL (1UL << 0)
+# define PR_SPEC_ENABLE (1UL << 1)
+# define PR_SPEC_DISABLE (1UL << 2)
+# define PR_SPEC_FORCE_DISABLE (1UL << 3)
+# define PR_SPEC_DISABLE_NOEXEC (1UL << 4)
+
+/* Reset arm64 pointer authentication keys */
+#define PR_PAC_RESET_KEYS 54
+# define PR_PAC_APIAKEY (1UL << 0)
+# define PR_PAC_APIBKEY (1UL << 1)
+# define PR_PAC_APDAKEY (1UL << 2)
+# define PR_PAC_APDBKEY (1UL << 3)
+# define PR_PAC_APGAKEY (1UL << 4)
+
+/* Tagged user address controls for arm64 */
+#define PR_SET_TAGGED_ADDR_CTRL 55
+#define PR_GET_TAGGED_ADDR_CTRL 56
+# define PR_TAGGED_ADDR_ENABLE (1UL << 0)
+/* MTE tag check fault modes */
+# define PR_MTE_TCF_NONE 0UL
+# define PR_MTE_TCF_SYNC (1UL << 1)
+# define PR_MTE_TCF_ASYNC (1UL << 2)
+# define PR_MTE_TCF_MASK (PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC)
+/* MTE tag inclusion mask */
+# define PR_MTE_TAG_SHIFT 3
+# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT)
+/* Unused; kept only for source compatibility */
+# define PR_MTE_TCF_SHIFT 1
+
+/* Control reclaim behavior when allocating memory */
+#define PR_SET_IO_FLUSHER 57
+#define PR_GET_IO_FLUSHER 58
+
+/* Dispatch syscalls to a userspace handler */
+#define PR_SET_SYSCALL_USER_DISPATCH 59
+# define PR_SYS_DISPATCH_OFF 0
+# define PR_SYS_DISPATCH_ON 1
+/* The control values for the user space selector when dispatch is enabled */
+# define SYSCALL_DISPATCH_FILTER_ALLOW 0
+# define SYSCALL_DISPATCH_FILTER_BLOCK 1
+
+/* Set/get enabled arm64 pointer authentication keys */
+#define PR_PAC_SET_ENABLED_KEYS 60
+#define PR_PAC_GET_ENABLED_KEYS 61
+
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE 62
+# define PR_SCHED_CORE_GET 0
+# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */
+# define PR_SCHED_CORE_MAX 4
+# define PR_SCHED_CORE_SCOPE_THREAD 0
+# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1
+# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2
+
+/* arm64 Scalable Matrix Extension controls */
+/* Flag values must be in sync with SVE versions */
+#define PR_SME_SET_VL 63 /* set task vector length */
+# define PR_SME_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */
+#define PR_SME_GET_VL 64 /* get task vector length */
+/* Bits common to PR_SME_SET_VL and PR_SME_GET_VL */
+# define PR_SME_VL_LEN_MASK 0xffff
+# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */
+
+/* Memory deny write / execute */
+#define PR_SET_MDWE 65
+# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0)
+# define PR_MDWE_NO_INHERIT (1UL << 1)
+
+#define PR_GET_MDWE 66
+
+#define PR_SET_VMA 0x53564d41
+# define PR_SET_VMA_ANON_NAME 0
+
+#define PR_GET_AUXV 0x41555856
+
+#define PR_SET_MEMORY_MERGE 67
+#define PR_GET_MEMORY_MERGE 68
+
+#define PR_RISCV_V_SET_CONTROL 69
+#define PR_RISCV_V_GET_CONTROL 70
+# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0
+# define PR_RISCV_V_VSTATE_CTRL_OFF 1
+# define PR_RISCV_V_VSTATE_CTRL_ON 2
+# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4)
+# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3
+# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc
+# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f
+
+#define PR_RISCV_SET_ICACHE_FLUSH_CTX 71
+# define PR_RISCV_CTX_SW_FENCEI_ON 0
+# define PR_RISCV_CTX_SW_FENCEI_OFF 1
+# define PR_RISCV_SCOPE_PER_PROCESS 0
+# define PR_RISCV_SCOPE_PER_THREAD 1
+
+/* PowerPC Dynamic Execution Control Register (DEXCR) controls */
+#define PR_PPC_GET_DEXCR 72
+#define PR_PPC_SET_DEXCR 73
+/* DEXCR aspect to act on */
+# define PR_PPC_DEXCR_SBHE 0 /* Speculative branch hint enable */
+# define PR_PPC_DEXCR_IBRTPD 1 /* Indirect branch recurrent target prediction disable */
+# define PR_PPC_DEXCR_SRAPD 2 /* Subroutine return address prediction disable */
+# define PR_PPC_DEXCR_NPHIE 3 /* Non-privileged hash instruction enable */
+/* Action to apply / return */
+# define PR_PPC_DEXCR_CTRL_EDITABLE 0x1 /* Aspect can be modified with PR_PPC_SET_DEXCR */
+# define PR_PPC_DEXCR_CTRL_SET 0x2 /* Set the aspect for this process */
+# define PR_PPC_DEXCR_CTRL_CLEAR 0x4 /* Clear the aspect for this process */
+# define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */
+# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
+# define PR_PPC_DEXCR_CTRL_MASK 0x1f
+
+#endif /* _LINUX_PRCTL_H */
diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h
index 67626d535316..887a25286441 100644
--- a/tools/include/uapi/linux/stat.h
+++ b/tools/include/uapi/linux/stat.h
@@ -126,9 +126,15 @@ struct statx {
__u64 stx_mnt_id;
__u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */
__u32 stx_dio_offset_align; /* File offset alignment for direct I/O */
- __u64 stx_subvol; /* Subvolume identifier */
/* 0xa0 */
- __u64 __spare3[11]; /* Spare space for future expansion */
+ __u64 stx_subvol; /* Subvolume identifier */
+ __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */
+ __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */
+ /* 0xb0 */
+ __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */
+ __u32 __spare1[1];
+ /* 0xb8 */
+ __u64 __spare3[9]; /* Spare space for future expansion */
/* 0x100 */
};
@@ -157,6 +163,7 @@ struct statx {
#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */
#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */
+#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */
@@ -192,6 +199,7 @@ struct statx {
#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */
#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */
#define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */
+#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */
#endif /* _UAPI_LINUX_STAT_H */
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index c3e4871967bc..2178862bb114 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -100,3 +100,23 @@ bool __bitmap_intersects(const unsigned long *bitmap1,
return true;
return false;
}
+
+/*
+ * __bitmap_clear() - clear @len consecutive bits of @map, beginning at bit
+ * index @start.
+ *
+ * The range is processed one word at a time: first the tail of the word that
+ * contains @start, then every word that is covered completely, and finally
+ * the leading part of the last word if the range ends inside it.
+ */
+void __bitmap_clear(unsigned long *map, unsigned int start, int len)
+{
+ /* NOTE(review): BIT_WORD() presumably yields the index of the word holding bit @start */
+ unsigned long *p = map + BIT_WORD(start);
+ /* one past the last bit to clear */
+ const unsigned int size = start + len;
+ /* number of bits from @start up to the end of its word */
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ /* NOTE(review): presumably selects bit @start and every higher bit of the first word */
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+ /* as long as the range reaches the end of the current word */
+ while (len - bits_to_clear >= 0) {
+ *p &= ~mask_to_clear;
+ len -= bits_to_clear;
+ /* every following word is considered in full */
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ p++;
+ }
+ /* leftover bits: the range stops inside the current word */
+ if (len) {
+ /* NOTE(review): presumably keeps only the bits below index @size within this word */
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ *p &= ~mask_to_clear;
+ }
+}
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 5dbca76b953f..894860111ddb 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -1559,10 +1559,12 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
* Clang for BPF target generates func_proto with no
* args as a func_proto with a single void arg (e.g.,
* `int (*f)(void)` vs just `int (*f)()`). We are
- * going to pretend there are no args for such case.
+ * going to emit valid empty args (void) syntax for
+ * such case. Similarly and conveniently, valid
+ * no args case can be special-cased here as well.
*/
- if (vlen == 1 && p->type == 0) {
- btf_dump_printf(d, ")");
+ if (vlen == 0 || (vlen == 1 && p->type == 0)) {
+ btf_dump_printf(d, "void)");
return;
}
diff --git a/tools/lib/list_sort.c b/tools/lib/list_sort.c
index 10c067e3a8d2..69affa251fa7 100644
--- a/tools/lib/list_sort.c
+++ b/tools/lib/list_sort.c
@@ -52,7 +52,6 @@ static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
struct list_head *a, struct list_head *b)
{
struct list_head *tail = head;
- u8 count = 0;
for (;;) {
/* if equal, take 'a' -- important for sort stability */
@@ -78,15 +77,6 @@ static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
/* Finish linking remainder of list b on to tail */
tail->next = b;
do {
- /*
- * If the merge is highly unbalanced (e.g. the input is
- * already sorted), this loop may run many iterations.
- * Continue callbacks to the client even though no
- * element comparison is needed, so the client's cmp()
- * routine can invoke cond_resched() periodically.
- */
- if (unlikely(!++count))
- cmp(priv, b, b);
b->prev = tail;
tail = b;
b = b->next;
diff --git a/tools/mm/Makefile b/tools/mm/Makefile
index 7bb03606b9ea..15791c1c5b28 100644
--- a/tools/mm/Makefile
+++ b/tools/mm/Makefile
@@ -3,7 +3,7 @@
#
include ../scripts/Makefile.include
-BUILD_TARGETS=page-types slabinfo page_owner_sort
+BUILD_TARGETS=page-types slabinfo page_owner_sort thp_swap_allocator_test
INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps
LIB_DIR = ../lib/api
diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c
new file mode 100644
index 000000000000..83afc52275a5
--- /dev/null
+++ b/tools/mm/thp_swap_allocator_test.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * thp_swap_allocator_test
+ *
+ * The purpose of this test program is helping check if THP swpout
+ * can correctly get swap slots to swap out as a whole instead of
+ * being split. It randomly releases swap entries through madvise
+ * DONTNEED and swapin/out on two memory areas: a memory area for
+ * 64KB THP and the other area for small folios. The second memory
+ * can be enabled by "-s".
+ * Before running the program, we need to setup a zRAM or similar
+ * swap device by:
+ * echo lzo > /sys/block/zram0/comp_algorithm
+ * echo 64M > /sys/block/zram0/disksize
+ * echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled
+ * echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
+ * mkswap /dev/zram0
+ * swapon /dev/zram0
+ * The expected result should be 0% anon swpout fallback ratio w/ or
+ * w/o "-s".
+ *
+ * Author(s): Barry Song <v-songbaohua@oppo.com>
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <time.h>
+
+#define MEMSIZE_MTHP (60 * 1024 * 1024)
+#define MEMSIZE_SMALLFOLIO (4 * 1024 * 1024)
+#define ALIGNMENT_MTHP (64 * 1024)
+#define ALIGNMENT_SMALLFOLIO (4 * 1024)
+#define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024)
+#define TOTAL_DONTNEED_SMALLFOLIO (1 * 1024 * 1024)
+#define MTHP_FOLIO_SIZE (64 * 1024)
+
+#define SWPOUT_PATH \
+ "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout"
+#define SWPOUT_FALLBACK_PATH \
+ "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback"
+
+/*
+ * Allocate @size bytes aligned to @alignment with posix_memalign().
+ *
+ * Returns the allocation (to be released with free()) or NULL after
+ * printing a diagnostic.
+ */
+static void *aligned_alloc_mem(size_t size, size_t alignment)
+{
+	void *mem = NULL;
+	int err = posix_memalign(&mem, alignment, size);
+
+	if (err != 0) {
+		/*
+		 * posix_memalign() reports failure through its return value
+		 * and leaves errno alone, so hand the code to perror().
+		 */
+		errno = err;
+		perror("posix_memalign");
+		return NULL;
+	}
+	return mem;
+}
+
+/*
+ * This emulates the behavior of native libc and Java heap,
+ * as well as process exit and munmap. It helps generate mTHP
+ * and ensures that iterations can proceed with mTHP, as we
+ * currently don't support large folios swap-in.
+ */
+/*
+ * Pick total_dontneed_size / align_size random align_size-sized slots inside
+ * [mem, mem + mem_size), drop each one with MADV_DONTNEED and then rewrite
+ * it with the 0x11 fill pattern. A failing madvise() is only reported.
+ */
+static void random_madvise_dontneed(void *mem, size_t mem_size,
+		size_t align_size, size_t total_dontneed_size)
+{
+	size_t rounds = total_dontneed_size / align_size;
+	char *base = mem;
+
+	while (rounds-- > 0) {
+		size_t slot = rand() % (mem_size / align_size);
+		char *chunk = base + slot * align_size;
+
+		if (madvise(chunk, align_size, MADV_DONTNEED) != 0)
+			perror("madvise dontneed");
+
+		memset(chunk, 0x11, align_size);
+	}
+}
+
+/*
+ * Write the 0x11 fill pattern to total_swapin_size / align_size randomly
+ * chosen align_size-sized slots inside [mem, mem + mem_size).
+ */
+static void random_swapin(void *mem, size_t mem_size,
+		size_t align_size, size_t total_swapin_size)
+{
+	size_t rounds = total_swapin_size / align_size;
+	char *base = mem;
+
+	while (rounds-- > 0) {
+		size_t slot = rand() % (mem_size / align_size);
+
+		memset(base + slot * align_size, 0x11, align_size);
+	}
+}
+
+/*
+ * Read one unsigned decimal number from the file at @path.
+ *
+ * Returns the parsed number, or 0 (after printing a diagnostic) when the
+ * file cannot be opened or does not start with a number.
+ */
+static unsigned long read_stat(const char *path)
+{
+	unsigned long parsed;
+	unsigned long result = 0;
+	FILE *fp = fopen(path, "r");
+
+	if (fp == NULL) {
+		perror("fopen");
+		return result;
+	}
+
+	if (fscanf(fp, "%lu", &parsed) == 1)
+		result = parsed;
+	else
+		perror("fscanf");
+
+	fclose(fp);
+	return result;
+}
+
+/*
+ * Test driver.
+ *
+ * Options:
+ *   -s  additionally exercise a second memory area marked MADV_NOHUGEPAGE
+ *   -a  perform the random swap-in writes on the mTHP area in 64KB-aligned
+ *       chunks instead of 4KB ones
+ *
+ * After a warm-up page-out, 100 iterations of random swap-in, random
+ * MADV_DONTNEED and MADV_PAGEOUT are run, and for each one the increase of
+ * the swpout / swpout_fallback counters is printed together with the
+ * resulting fallback percentage.
+ *
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE if an allocation or a checked
+ * madvise() call fails.
+ */
+int main(int argc, char *argv[])
+{
+	int use_small_folio = 0, aligned_swapin = 0;
+	void *mem1 = NULL, *mem2 = NULL;
+	int ret = EXIT_FAILURE;
+	int i;
+
+	for (i = 1; i < argc; ++i) {
+		if (strcmp(argv[i], "-s") == 0)
+			use_small_folio = 1;
+		else if (strcmp(argv[i], "-a") == 0)
+			aligned_swapin = 1;
+	}
+
+	mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP);
+	if (mem1 == NULL) {
+		fprintf(stderr, "Failed to allocate large folios memory\n");
+		goto out;
+	}
+
+	if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) {
+		perror("madvise hugepage for mem1");
+		goto out;
+	}
+
+	if (use_small_folio) {
+		mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
+		if (mem2 == NULL) {
+			fprintf(stderr, "Failed to allocate small folios memory\n");
+			goto out;
+		}
+
+		if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) {
+			perror("madvise nohugepage for mem2");
+			goto out;
+		}
+	}
+
+	/*
+	 * warm-up phase to occupy the swapfile; the result of these two
+	 * page-out requests is not checked
+	 */
+	memset(mem1, 0x11, MEMSIZE_MTHP);
+	madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT);
+	if (use_small_folio) {
+		memset(mem2, 0x11, MEMSIZE_SMALLFOLIO);
+		madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT);
+	}
+
+	/* iterations with newly created mTHP, swap-in, and swap-out */
+	for (i = 0; i < 100; ++i) {
+		unsigned long initial_swpout;
+		unsigned long initial_swpout_fallback;
+		unsigned long final_swpout;
+		unsigned long final_swpout_fallback;
+		unsigned long swpout_inc;
+		unsigned long swpout_fallback_inc;
+		unsigned long total_inc;
+		double fallback_percentage;
+
+		initial_swpout = read_stat(SWPOUT_PATH);
+		initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+		/*
+		 * The following setup creates a 1:1 ratio of mTHP to small folios
+		 * since large folio swap-in isn't supported yet. Once we support
+		 * mTHP swap-in, we'll likely need to reduce MEMSIZE_MTHP and
+		 * increase MEMSIZE_SMALLFOLIO to maintain the ratio.
+		 */
+		random_swapin(mem1, MEMSIZE_MTHP,
+				aligned_swapin ? ALIGNMENT_MTHP : ALIGNMENT_SMALLFOLIO,
+				TOTAL_DONTNEED_MTHP);
+		random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP,
+				TOTAL_DONTNEED_MTHP);
+
+		if (use_small_folio) {
+			random_swapin(mem2, MEMSIZE_SMALLFOLIO,
+					ALIGNMENT_SMALLFOLIO,
+					TOTAL_DONTNEED_SMALLFOLIO);
+		}
+
+		if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) {
+			perror("madvise pageout for mem1");
+			goto out;
+		}
+
+		if (use_small_folio) {
+			if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) {
+				perror("madvise pageout for mem2");
+				goto out;
+			}
+		}
+
+		final_swpout = read_stat(SWPOUT_PATH);
+		final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH);
+
+		swpout_inc = final_swpout - initial_swpout;
+		swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback;
+
+		/*
+		 * If neither counter moved (e.g. the stats could not be read or
+		 * nothing was swapped out) the ratio would be 0/0; report 0%
+		 * instead of printing NaN.
+		 */
+		total_inc = swpout_fallback_inc + swpout_inc;
+		fallback_percentage = total_inc ?
+			(double)swpout_fallback_inc / total_inc * 100 : 0.0;
+
+		printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback percentage: %.2f%%\n",
+		       i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage);
+	}
+
+	ret = EXIT_SUCCESS;
+
+out:
+	/* free(NULL) is a no-op, so both areas can be released unconditionally */
+	free(mem2);
+	free(mem1);
+	return ret;
+}
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0a33d9195b7a..01237d167223 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1202,6 +1202,8 @@ static const char *uaccess_safe_builtin[] = {
"__sanitizer_cov_trace_switch",
/* KMSAN */
"kmsan_copy_to_user",
+ "kmsan_disable_current",
+ "kmsan_enable_current",
"kmsan_report",
"kmsan_unpoison_entry_regs",
"kmsan_unpoison_memory",
diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt
index 3766886c4bca..83dc87c662b6 100644
--- a/tools/perf/Documentation/Build.txt
+++ b/tools/perf/Documentation/Build.txt
@@ -71,3 +71,31 @@ supported by GCC. UBSan detects undefined behaviors of programs at runtime.
$ UBSAN_OPTIONS=print_stacktrace=1 ./perf record -a
If UBSan detects any problem at runtime, it outputs a “runtime error:” message.
+
+4) Cross compilation
+====================
+As Multiarch is commonly supported in Linux distributions, we can install
+libraries for multiple architectures on the same system and then cross-compile
+Linux perf. For example, Aarch64 libraries and toolchains can be installed on
+an x86_64 machine, allowing us to compile perf for an Aarch64 target.
+
+Below is the command for building perf with dynamic linking.
+
+ $ cd /path/to/Linux
+ $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf
+
+For static linking, the option `LDFLAGS="-static"` is required.
+
+ $ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \
+ LDFLAGS="-static" -C tools/perf
+
+In the embedded system world, a use case is to explicitly specify the package
+configuration paths for cross building:
+
+ $ PKG_CONFIG_SYSROOT_DIR="/path/to/cross/build/sysroot" \
+ PKG_CONFIG_LIBDIR="/usr/lib/:/usr/local/lib" \
+ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/perf
+
+In this case, the variable PKG_CONFIG_SYSROOT_DIR can be used alongside the
+variable PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH to prepend the sysroot path to
+the library paths for cross compilation.
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index a4829b6532d8..fa679db61f62 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -152,7 +152,17 @@ ifdef LIBDW_DIR
endif
DWARFLIBS := -ldw
ifeq ($(findstring -static,${LDFLAGS}),-static)
- DWARFLIBS += -lelf -lebl -ldl -lz -llzma -lbz2
+ DWARFLIBS += -lelf -ldl -lz -llzma -lbz2 -lzstd
+
+ LIBDW_VERSION := $(shell $(PKG_CONFIG) --modversion libdw)
+ LIBDW_VERSION_1 := $(word 1, $(subst ., ,$(LIBDW_VERSION)))
+ LIBDW_VERSION_2 := $(word 2, $(subst ., ,$(LIBDW_VERSION)))
+
+ # Elfutils merged libebl.a into libdw.a starting from version 0.177,
+ # Link libebl.a only if libdw is older than this version.
+ ifeq ($(shell test $(LIBDW_VERSION_2) -lt 177; echo $$?),0)
+ DWARFLIBS += -lebl
+ endif
endif
FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) $(DWARFLIBS)
@@ -182,20 +192,15 @@ endif
FEATURE_CHECK_CFLAGS-libzstd := $(LIBZSTD_CFLAGS)
FEATURE_CHECK_LDFLAGS-libzstd := $(LIBZSTD_LDFLAGS)
+# for linking with debug library, run like:
+# make DEBUG=1 PKG_CONFIG_PATH=/opt/libtraceevent/(lib|lib64)/pkgconfig
+
ifneq ($(NO_LIBTRACEEVENT),1)
ifeq ($(call get-executable,$(PKG_CONFIG)),)
$(error Error: $(PKG_CONFIG) needed by libtraceevent is missing on this system, please install it)
endif
endif
-# for linking with debug library, run like:
-# make DEBUG=1 PKG_CONFIG_PATH=/opt/libtraceevent/(lib|lib64)/pkgconfig
-FEATURE_CHECK_CFLAGS-libtraceevent := $(shell $(PKG_CONFIG) --cflags libtraceevent 2>/dev/null)
-FEATURE_CHECK_LDFLAGS-libtraceevent := $(shell $(PKG_CONFIG) --libs libtraceevent 2>/dev/null)
-
-FEATURE_CHECK_CFLAGS-libtracefs := $(shell $(PKG_CONFIG) --cflags libtracefs 2>/dev/null)
-FEATURE_CHECK_LDFLAGS-libtracefs := $(shell $(PKG_CONFIG) --libs libtracefs 2>/dev/null)
-
FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi
# include ARCH specific config
-include $(src-perf)/arch/$(SRCARCH)/Makefile
@@ -301,6 +306,11 @@ endif
ifdef PYTHON_CONFIG
PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) $(PYTHON_CONFIG_LDFLAGS) 2>/dev/null)
+ # Update the python flags for cross compilation
+ ifdef CROSS_COMPILE
+ PYTHON_NATIVE := $(shell echo $(PYTHON_EMBED_LDOPTS) | sed 's/\(-L.*\/\)\(.*-linux-gnu\).*/\2/')
+ PYTHON_EMBED_LDOPTS := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EMBED_LDOPTS))
+ endif
PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --includes 2>/dev/null)
@@ -902,6 +912,9 @@ else
PYTHON_SETUPTOOLS_INSTALLED := $(shell $(PYTHON) -c 'import setuptools;' 2> /dev/null && echo "yes" || echo "no")
ifeq ($(PYTHON_SETUPTOOLS_INSTALLED), yes)
PYTHON_EXTENSION_SUFFIX := $(shell $(PYTHON) -c 'from importlib import machinery; print(machinery.EXTENSION_SUFFIXES[0])')
+ ifdef CROSS_COMPILE
+ PYTHON_EXTENSION_SUFFIX := $(subst $(PYTHON_NATIVE),$(shell $(CC) -dumpmachine),$(PYTHON_EXTENSION_SUFFIX))
+ endif
LANG_BINDINGS += $(obj-perf)python/perf$(PYTHON_EXTENSION_SUFFIX)
else
$(warning Missing python setuptools, the python binding won't be built, please install python3-setuptools or equivalent)
@@ -1206,6 +1219,8 @@ ifneq ($(NO_LIBTRACEEVENT),1)
LIBTRACEFS_VERSION_3 := $(word 3, $(subst ., ,$(LIBTRACEFS_VERSION)))
LIBTRACEFS_VERSION_CPP := $(shell expr $(LIBTRACEFS_VERSION_1) \* 255 \* 255 + $(LIBTRACEFS_VERSION_2) \* 255 + $(LIBTRACEFS_VERSION_3))
CFLAGS += -DLIBTRACEFS_VERSION=$(LIBTRACEFS_VERSION_CPP)
+ else
+ $(warning libtracefs is missing. Please install libtracefs-dev/libtracefs-devel)
endif
endif
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 175e4c7898f0..f8148db5fc38 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -193,7 +193,32 @@ HOSTLD ?= ld
HOSTAR ?= ar
CLANG ?= clang
-PKG_CONFIG = $(CROSS_COMPILE)pkg-config
+# Some distros provide the command $(CROSS_COMPILE)pkg-config for
+# searching packages installed with Multiarch. Use it for cross
+# compilation if it exists.
+ifneq (, $(shell which $(CROSS_COMPILE)pkg-config))
+ PKG_CONFIG ?= $(CROSS_COMPILE)pkg-config
+else
+ PKG_CONFIG ?= pkg-config
+
+ # PKG_CONFIG_PATH or PKG_CONFIG_LIBDIR, alongside PKG_CONFIG_SYSROOT_DIR
+ # for modified system root, is required for the cross compilation.
+ # If these PKG_CONFIG environment variables are not set, Multiarch library
+ # paths are used instead.
+ ifdef CROSS_COMPILE
+ ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),)
+ CROSS_ARCH = $(shell $(CC) -dumpmachine)
+ PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/share/pkgconfig/
+ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/share/pkgconfig/
+ export PKG_CONFIG_LIBDIR
+ $(warning Missing PKG_CONFIG_LIBDIR, PKG_CONFIG_PATH and PKG_CONFIG_SYSROOT_DIR for cross compilation,)
+ $(warning set PKG_CONFIG_LIBDIR for using Multiarch libs.)
+ endif
+ endif
+endif
RM = rm -f
LN = ln -f
diff --git a/tools/perf/arch/loongarch/Makefile b/tools/perf/arch/loongarch/Makefile
index 3992a67a87d9..c89d6bb6b184 100644
--- a/tools/perf/arch/loongarch/Makefile
+++ b/tools/perf/arch/loongarch/Makefile
@@ -4,6 +4,7 @@ PERF_HAVE_DWARF_REGS := 1
endif
PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
PERF_HAVE_JITDUMP := 1
+HAVE_KVM_STAT_SUPPORT := 1
#
# Syscall table generation for perf
diff --git a/tools/perf/arch/loongarch/util/Build b/tools/perf/arch/loongarch/util/Build
index 2386ebbf6dd4..b6b97de48233 100644
--- a/tools/perf/arch/loongarch/util/Build
+++ b/tools/perf/arch/loongarch/util/Build
@@ -1,5 +1,7 @@
+perf-util-y += header.o
perf-util-y += perf_regs.o
perf-util-$(CONFIG_DWARF) += dwarf-regs.o
perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
diff --git a/tools/perf/arch/loongarch/util/header.c b/tools/perf/arch/loongarch/util/header.c
new file mode 100644
index 000000000000..d962dff55512
--- /dev/null
+++ b/tools/perf/arch/loongarch/util/header.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implementation of get_cpuid().
+ *
+ * Author: Nikita Shubin <n.shubin@yadro.com>
+ * Bibo Mao <maobibo@loongson.cn>
+ * Huacai Chen <chenhuacai@loongson.cn>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <api/fs/fs.h>
+#include <errno.h>
+#include "util/debug.h"
+#include "util/header.h"
+
+/*
+ * Output example from /proc/cpuinfo
+ * CPU Family : Loongson-64bit
+ * Model Name : Loongson-3C5000
+ * CPU Revision : 0x10
+ * FPU Revision : 0x01
+ */
+#define CPUINFO_MODEL "Model Name"
+#define CPUINFO "/proc/cpuinfo"
+
+/*
+ * Extract the value part of a /proc/cpuinfo line: the text between the
+ * last space and the last newline of @line.
+ *
+ * Returns a newly allocated string the caller must free(), or NULL when
+ * @line contains no space or no newline (or when allocation fails).
+ */
+static char *_get_field(const char *line)
+{
+	const char *last_space = strrchr(line, ' ');
+	const char *eol;
+
+	if (last_space == NULL)
+		return NULL;
+
+	eol = strrchr(line, '\n');
+	if (eol == NULL)
+		return NULL;
+
+	/* Skip the space itself; the newline is not copied. */
+	return strndup(last_space + 1, eol - (last_space + 1));
+}
+
+/*
+ * Read /proc/cpuinfo and return the value of the first "Model Name" line.
+ *
+ * Returns a newly allocated string the caller must free(), or NULL when
+ * the file cannot be opened, no usable "Model Name" line is found, or an
+ * allocation fails.
+ */
+static char *_get_cpuid(void)
+{
+	size_t line_sz = 0;
+	char *line = NULL, *model = NULL, *cpuid = NULL;
+	FILE *file;
+
+	file = fopen(CPUINFO, "r");
+	if (file == NULL)
+		return NULL;
+
+	while (getline(&line, &line_sz, file) != -1) {
+		if (strncmp(line, CPUINFO_MODEL, strlen(CPUINFO_MODEL)))
+			continue;
+
+		/* Only the first matching line is considered. */
+		model = _get_field(line);
+		break;
+	}
+
+	if (model && (asprintf(&cpuid, "%s", model) < 0))
+		cpuid = NULL;
+
+	/* getline() owns no buffer: the one it allocated must be freed here. */
+	free(line);
+	fclose(file);
+	free(model);
+	return cpuid;
+}
+
+/*
+ * Copy the CPU model name into @buffer, which is @sz bytes long.
+ *
+ * Returns 0 on success, EINVAL when the model name cannot be determined,
+ * and ENOBUFS when @buffer cannot hold the name plus its terminating NUL.
+ * Note that the error codes are positive errno values.
+ */
+int get_cpuid(char *buffer, size_t sz)
+{
+	int ret = 0;
+	char *cpuid = _get_cpuid();
+
+	if (!cpuid)
+		return EINVAL;
+
+	/*
+	 * "<=" rather than "<": scnprintf() reserves one byte for the NUL,
+	 * so a buffer of exactly strlen(cpuid) bytes would be truncated.
+	 */
+	if (sz <= strlen(cpuid)) {
+		ret = ENOBUFS;
+		goto out_free;
+	}
+
+	scnprintf(buffer, sz, "%s", cpuid);
+
+out_free:
+	free(cpuid);
+	return ret;
+}
+
+/*
+ * Return the CPU model name obtained by _get_cpuid() as a newly allocated
+ * string, or NULL on failure. The @pmu argument is not used.
+ */
+char *get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
+{
+ return _get_cpuid();
+}
diff --git a/tools/perf/arch/loongarch/util/kvm-stat.c b/tools/perf/arch/loongarch/util/kvm-stat.c
new file mode 100644
index 000000000000..a7859a3a9a51
--- /dev/null
+++ b/tools/perf/arch/loongarch/util/kvm-stat.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <memory.h>
+#include "util/kvm-stat.h"
+#include "util/parse-events.h"
+#include "util/debug.h"
+#include "util/evsel.h"
+#include "util/evlist.h"
+#include "util/pmus.h"
+
+#define LOONGARCH_EXCEPTION_INT 0
+#define LOONGARCH_EXCEPTION_PIL 1
+#define LOONGARCH_EXCEPTION_PIS 2
+#define LOONGARCH_EXCEPTION_PIF 3
+#define LOONGARCH_EXCEPTION_PME 4
+#define LOONGARCH_EXCEPTION_FPD 15
+#define LOONGARCH_EXCEPTION_SXD 16
+#define LOONGARCH_EXCEPTION_ASXD 17
+#define LOONGARCH_EXCEPTION_GSPR 22
+#define LOONGARCH_EXCEPTION_CPUCFG 100
+#define LOONGARCH_EXCEPTION_CSR 101
+#define LOONGARCH_EXCEPTION_IOCSR 102
+#define LOONGARCH_EXCEPTION_IDLE 103
+#define LOONGARCH_EXCEPTION_OTHERS 104
+#define LOONGARCH_EXCEPTION_HVC 23
+
+#define loongarch_exception_type \
+ {LOONGARCH_EXCEPTION_INT, "Interrupt" }, \
+ {LOONGARCH_EXCEPTION_PIL, "Mem Read" }, \
+ {LOONGARCH_EXCEPTION_PIS, "Mem Store" }, \
+ {LOONGARCH_EXCEPTION_PIF, "Inst Fetch" }, \
+ {LOONGARCH_EXCEPTION_PME, "Mem Modify" }, \
+ {LOONGARCH_EXCEPTION_FPD, "FPU" }, \
+ {LOONGARCH_EXCEPTION_SXD, "LSX" }, \
+ {LOONGARCH_EXCEPTION_ASXD, "LASX" }, \
+ {LOONGARCH_EXCEPTION_GSPR, "Privilege Error" }, \
+ {LOONGARCH_EXCEPTION_HVC, "Hypercall" }, \
+ {LOONGARCH_EXCEPTION_CPUCFG, "CPUCFG" }, \
+ {LOONGARCH_EXCEPTION_CSR, "CSR" }, \
+ {LOONGARCH_EXCEPTION_IOCSR, "IOCSR" }, \
+ {LOONGARCH_EXCEPTION_IDLE, "Idle" }, \
+ {LOONGARCH_EXCEPTION_OTHERS, "Others" }
+
+define_exit_reasons_table(loongarch_exit_reasons, loongarch_exception_type);
+
+const char *vcpu_id_str = "vcpu_id";
+const char *kvm_exit_reason = "reason";
+const char *kvm_entry_trace = "kvm:kvm_enter";
+const char *kvm_reenter_trace = "kvm:kvm_reenter";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+const char *kvm_events_tp[] = {
+ "kvm:kvm_enter",
+ "kvm:kvm_reenter",
+ "kvm:kvm_exit",
+ "kvm:kvm_exit_gspr",
+ NULL,
+};
+
+/* Begin-event test for "vmexit": delegates to the exit_event_begin() helper. */
+static bool event_begin(struct evsel *evsel,
+ struct perf_sample *sample, struct event_key *key)
+{
+ return exit_event_begin(evsel, sample, key);
+}
+
+/* End-event test for "vmexit": true for kvm:kvm_enter and kvm:kvm_reenter. */
+static bool event_end(struct evsel *evsel,
+ struct perf_sample *sample __maybe_unused,
+ struct event_key *key __maybe_unused)
+{
+ /*
+ * LoongArch KVM differs from other architectures.
+ *
+ * A kvm:kvm_exit event is adjacent to either a kvm:kvm_reenter or a
+ * kvm:kvm_enter event:
+ * kvm:kvm_enter means returning to the VMM and then to the guest;
+ * kvm:kvm_reenter means returning to the guest immediately.
+ */
+ return evsel__name_is(evsel, kvm_entry_trace) || evsel__name_is(evsel, kvm_reenter_trace);
+}
+
+/*
+ * Classify a kvm:kvm_exit_gspr sample by the trapped instruction word
+ * ("inst_word" field) and store the matching LOONGARCH_EXCEPTION_* value
+ * in key->key. Anything not recognised is reported as "Others".
+ */
+static void event_gspr_get_key(struct evsel *evsel,
+			       struct perf_sample *sample, struct event_key *key)
+{
+	unsigned int inst;
+	unsigned int major;
+
+	/* Default classification; refined below when the opcode matches. */
+	key->key = LOONGARCH_EXCEPTION_OTHERS;
+	inst = evsel__intval(evsel, sample, "inst_word");
+	major = inst >> 24;
+
+	if (major == 0) {
+		/* CPUCFG inst trap */
+		if ((inst >> 10) == 0x1b)
+			key->key = LOONGARCH_EXCEPTION_CPUCFG;
+	} else if (major == 4) {
+		/* CSR inst trap */
+		key->key = LOONGARCH_EXCEPTION_CSR;
+	} else if (major == 6) {
+		unsigned int sub = inst >> 15;
+
+		if (sub == 0xc90)
+			/* IOCSR inst trap */
+			key->key = LOONGARCH_EXCEPTION_IOCSR;
+		else if (sub == 0xc91)
+			/* Idle inst trap */
+			key->key = LOONGARCH_EXCEPTION_IDLE;
+	}
+}
+
+static struct child_event_ops child_events[] = {
+ { .name = "kvm:kvm_exit_gspr", .get_key = event_gspr_get_key },
+ { NULL, NULL },
+};
+
+static struct kvm_events_ops exit_events = {
+ .is_begin_event = event_begin,
+ .is_end_event = event_end,
+ .child_ops = child_events,
+ .decode_key = exit_event_decode_key,
+ .name = "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+ { .name = "vmexit", .ops = &exit_events, },
+ { NULL, NULL },
+};
+
+const char * const kvm_skip_events[] = {
+ NULL,
+};
+
+/*
+ * Set the ISA name and the exit-reason table used for LoongArch in @kvm.
+ * The @cpuid argument is not used. Always returns 0.
+ */
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+ kvm->exit_reasons_isa = "loongarch64";
+ kvm->exit_reasons = loongarch_exit_reasons;
+ return 0;
+}
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 3656f1ca7a21..ebae8415dfbb 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -230,8 +230,10 @@
178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend
179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64
179 64 pread64 sys_pread64
+179 spu pread64 sys_pread64
180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64
180 64 pwrite64 sys_pwrite64
+180 spu pwrite64 sys_pwrite64
181 common chown sys_chown
182 common getcwd sys_getcwd
183 common capget sys_capget
@@ -246,6 +248,7 @@
190 common ugetrlimit sys_getrlimit compat_sys_getrlimit
191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead
191 64 readahead sys_readahead
+191 spu readahead sys_readahead
192 32 mmap2 sys_mmap2 compat_sys_mmap2
193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64
194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64
@@ -293,6 +296,7 @@
232 nospu set_tid_address sys_set_tid_address
233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64
233 64 fadvise64 sys_fadvise64
+233 spu fadvise64 sys_fadvise64
234 nospu exit_group sys_exit_group
235 nospu lookup_dcookie sys_ni_syscall
236 common epoll_create sys_epoll_create
@@ -502,7 +506,7 @@
412 32 utimensat_time64 sys_utimensat sys_utimensat
413 32 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64
414 32 ppoll_time64 sys_ppoll compat_sys_ppoll_time64
-416 32 io_pgetevents_time64 sys_io_pgetevents sys_io_pgetevents
+416 32 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64
417 32 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64
418 32 mq_timedsend_time64 sys_mq_timedsend sys_mq_timedsend
419 32 mq_timedreceive_time64 sys_mq_timedreceive sys_mq_timedreceive
diff --git a/tools/perf/arch/riscv/Makefile b/tools/perf/arch/riscv/Makefile
index a8d25d005207..90c3c476a242 100644
--- a/tools/perf/arch/riscv/Makefile
+++ b/tools/perf/arch/riscv/Makefile
@@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1
endif
PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
PERF_HAVE_JITDUMP := 1
+HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build
index 65ec3c66a375..f865cb0489ec 100644
--- a/tools/perf/arch/riscv/util/Build
+++ b/tools/perf/arch/riscv/util/Build
@@ -1,5 +1,6 @@
perf-util-y += perf_regs.o
perf-util-y += header.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
perf-util-$(CONFIG_DWARF) += dwarf-regs.o
perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/riscv/util/kvm-stat.c b/tools/perf/arch/riscv/util/kvm-stat.c
new file mode 100644
index 000000000000..491aef449d1a
--- /dev/null
+++ b/tools/perf/arch/riscv/util/kvm-stat.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Arch specific functions for perf kvm stat.
+ *
+ * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd.
+ *
+ */
+#include <errno.h>
+#include <memory.h>
+#include "../../../util/evsel.h"
+#include "../../../util/kvm-stat.h"
+#include "riscv_exception_types.h"
+#include "debug.h"
+
+define_exit_reasons_table(riscv_exit_reasons, kvm_riscv_exception_class);
+
+const char *vcpu_id_str = "id";
+const char *kvm_exit_reason = "scause";
+const char *kvm_entry_trace = "kvm:kvm_entry";
+const char *kvm_exit_trace = "kvm:kvm_exit";
+
+const char *kvm_events_tp[] = {
+ "kvm:kvm_entry",
+ "kvm:kvm_exit",
+ NULL,
+};
+
+/*
+ * Fill @key from a sample: the key value is the sample's "scause" field
+ * (kvm_exit_reason) and the decode table is riscv_exit_reasons.
+ */
+static void event_get_key(struct evsel *evsel,
+ struct perf_sample *sample,
+ struct event_key *key)
+{
+ key->info = 0;
+ key->key = evsel__intval(evsel, sample, kvm_exit_reason);
+ key->exit_reasons = riscv_exit_reasons;
+}
+
+/* Begin-event test: true when @evsel is the kvm:kvm_entry tracepoint. */
+static bool event_begin(struct evsel *evsel,
+ struct perf_sample *sample __maybe_unused,
+ struct event_key *key __maybe_unused)
+{
+ return evsel__name_is(evsel, kvm_entry_trace);
+}
+
+/*
+ * End-event test: true when @evsel is the kvm:kvm_exit tracepoint, in
+ * which case @key is also filled in from the sample.
+ */
+static bool event_end(struct evsel *evsel,
+		      struct perf_sample *sample,
+		      struct event_key *key)
+{
+	if (!evsel__name_is(evsel, kvm_exit_trace))
+		return false;
+
+	event_get_key(evsel, sample, key);
+	return true;
+}
+
+static struct kvm_events_ops exit_events = {
+ .is_begin_event = event_begin,
+ .is_end_event = event_end,
+ .decode_key = exit_event_decode_key,
+ .name = "VM-EXIT"
+};
+
+struct kvm_reg_events_ops kvm_reg_events_ops[] = {
+ {
+ .name = "vmexit",
+ .ops = &exit_events,
+ },
+ { NULL, NULL },
+};
+
+const char * const kvm_skip_events[] = {
+ NULL,
+};
+
+/*
+ * Set the ISA name used for RISC-V in @kvm. The @cpuid argument is not
+ * used. Always returns 0.
+ */
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
+{
+ kvm->exit_reasons_isa = "riscv64";
+ return 0;
+}
diff --git a/tools/perf/arch/riscv/util/riscv_exception_types.h b/tools/perf/arch/riscv/util/riscv_exception_types.h
new file mode 100644
index 000000000000..c49b8fa5e847
--- /dev/null
+++ b/tools/perf/arch/riscv/util/riscv_exception_types.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef ARCH_PERF_RISCV_EXCEPTION_TYPES_H
+#define ARCH_PERF_RISCV_EXCEPTION_TYPES_H
+
+#define EXC_INST_MISALIGNED 0
+#define EXC_INST_ACCESS 1
+#define EXC_INST_ILLEGAL 2
+#define EXC_BREAKPOINT 3
+#define EXC_LOAD_MISALIGNED 4
+#define EXC_LOAD_ACCESS 5
+#define EXC_STORE_MISALIGNED 6
+#define EXC_STORE_ACCESS 7
+#define EXC_SYSCALL 8
+#define EXC_HYPERVISOR_SYSCALL 9
+#define EXC_SUPERVISOR_SYSCALL 10
+#define EXC_INST_PAGE_FAULT 12
+#define EXC_LOAD_PAGE_FAULT 13
+#define EXC_STORE_PAGE_FAULT 15
+#define EXC_INST_GUEST_PAGE_FAULT 20
+#define EXC_LOAD_GUEST_PAGE_FAULT 21
+#define EXC_VIRTUAL_INST_FAULT 22
+#define EXC_STORE_GUEST_PAGE_FAULT 23
+
+#define EXC(x) {EXC_##x, #x }
+
+#define kvm_riscv_exception_class \
+ EXC(INST_MISALIGNED), EXC(INST_ACCESS), EXC(INST_ILLEGAL), \
+ EXC(BREAKPOINT), EXC(LOAD_MISALIGNED), EXC(LOAD_ACCESS), \
+ EXC(STORE_MISALIGNED), EXC(STORE_ACCESS), EXC(SYSCALL), \
+ EXC(HYPERVISOR_SYSCALL), EXC(SUPERVISOR_SYSCALL), \
+ EXC(INST_PAGE_FAULT), EXC(LOAD_PAGE_FAULT), EXC(STORE_PAGE_FAULT), \
+ EXC(INST_GUEST_PAGE_FAULT), EXC(LOAD_GUEST_PAGE_FAULT), \
+ EXC(VIRTUAL_INST_FAULT), EXC(STORE_GUEST_PAGE_FAULT)
+
+#endif /* ARCH_PERF_RISCV_EXCEPTION_TYPES_H */
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index bd0fee24ad10..01071182763e 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -418,7 +418,7 @@
412 32 utimensat_time64 - sys_utimensat
413 32 pselect6_time64 - compat_sys_pselect6_time64
414 32 ppoll_time64 - compat_sys_ppoll_time64
-416 32 io_pgetevents_time64 - sys_io_pgetevents
+416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64
417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64
418 32 mq_timedsend_time64 - sys_mq_timedsend
419 32 mq_timedreceive_time64 - sys_mq_timedreceive
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index a396f6e6ab5b..7093ee21c0d1 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -1,8 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
#
# 64-bit system call numbers and entry vectors
#
# The format is:
-# <number> <abi> <name> <entry point>
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
#
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
#
@@ -68,7 +69,7 @@
57 common fork sys_fork
58 common vfork sys_vfork
59 64 execve sys_execve
-60 common exit sys_exit
+60 common exit sys_exit - noreturn
61 common wait4 sys_wait4
62 common kill sys_kill
63 common uname sys_newuname
@@ -239,7 +240,7 @@
228 common clock_gettime sys_clock_gettime
229 common clock_getres sys_clock_getres
230 common clock_nanosleep sys_clock_nanosleep
-231 common exit_group sys_exit_group
+231 common exit_group sys_exit_group - noreturn
232 common epoll_wait sys_epoll_wait
233 common epoll_ctl sys_epoll_ctl
234 common tgkill sys_tgkill
@@ -343,6 +344,7 @@
332 common statx sys_statx
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
+335 common uretprobe sys_uretprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c
index de76bbc50bfb..5c9335fff2d3 100644
--- a/tools/perf/builtin-daemon.c
+++ b/tools/perf/builtin-daemon.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <internal/lib.h>
+#include <inttypes.h>
#include <subcmd/parse-options.h>
#include <api/fd/array.h>
#include <api/fs/fs.h>
@@ -688,7 +689,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out)
/* lock */
csv_sep, daemon->base, "lock");
- fprintf(out, "%c%lu",
+ fprintf(out, "%c%" PRIu64,
/* session up time */
csv_sep, (curr - daemon->start) / 60);
@@ -700,7 +701,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out)
daemon->base, SESSION_OUTPUT);
fprintf(out, " lock: %s/lock\n",
daemon->base);
- fprintf(out, " up: %lu minutes\n",
+ fprintf(out, " up: %" PRIu64 " minutes\n",
(curr - daemon->start) / 60);
}
}
@@ -727,7 +728,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out)
/* session ack */
csv_sep, session->base, SESSION_ACK);
- fprintf(out, "%c%lu",
+ fprintf(out, "%c%" PRIu64,
/* session up time */
csv_sep, (curr - session->start) / 60);
@@ -745,7 +746,7 @@ static int cmd_session_list(struct daemon *daemon, union cmd *cmd, FILE *out)
session->base, SESSION_CONTROL);
fprintf(out, " ack: %s/%s\n",
session->base, SESSION_ACK);
- fprintf(out, " up: %lu minutes\n",
+ fprintf(out, " up: %" PRIu64 " minutes\n",
(curr - session->start) / 60);
}
}
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json
index 9b4a032186a7..7149caec4f80 100644
--- a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json
+++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json
@@ -36,7 +36,7 @@
"ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
},
{
- "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+ "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
diff --git a/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json
index a9939823b14b..0c9b9a2d2958 100644
--- a/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json
+++ b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json
@@ -74,7 +74,7 @@
{
"PublicDescription": "Sent SFENCE.VMA with ASID request to other HART event",
"ConfigCode": "0x800000000000000c",
- "EventName": "FW_SFENCE_VMA_RECEIVED",
+ "EventName": "FW_SFENCE_VMA_ASID_SENT",
"BriefDescription": "Sent SFENCE.VMA with ASID request to other HART event"
},
{
diff --git a/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json
index 9b4a032186a7..7149caec4f80 100644
--- a/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json
+++ b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json
@@ -36,7 +36,7 @@
"ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
},
{
- "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+ "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
diff --git a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json
index 9b4a032186a7..7149caec4f80 100644
--- a/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json
+++ b/tools/perf/pmu-events/arch/riscv/starfive/dubhe-80/firmware.json
@@ -36,7 +36,7 @@
"ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
},
{
- "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+ "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
diff --git a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json
index 9b4a032186a7..7149caec4f80 100644
--- a/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json
+++ b/tools/perf/pmu-events/arch/riscv/thead/c900-legacy/firmware.json
@@ -36,7 +36,7 @@
"ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
},
{
- "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+ "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT"
},
{
"ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index e30fd55f8e51..cd3b480d20bd 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -26,7 +26,6 @@ static bool is_ignored_symbol(const char *name, char type)
* when --all-symbols is specified so exclude them to get a
* stable symbol list.
*/
- "kallsyms_addresses",
"kallsyms_offsets",
"kallsyms_relative_base",
"kallsyms_num_syms",
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index 89d16b90370b..df9cdb8bbfb8 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -76,7 +76,7 @@ struct msghdr {
__kernel_size_t msg_controllen; /* ancillary data buffer length */
struct kiocb *msg_iocb; /* ptr to iocb for async requests */
struct ubuf_info *msg_ubuf;
- int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
+ int (*sg_from_iter)(struct sk_buff *skb,
struct iov_iter *from, size_t length);
};
@@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
extern int __sys_socket(int family, int type, int protocol);
extern struct file *__sys_socket_file(int family, int type, int protocol);
extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
+extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
+ int addrlen);
extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
int addrlen, int file_flags);
extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
int addrlen);
extern int __sys_listen(int fd, int backlog);
+extern int __sys_listen_socket(struct socket *sock, int backlog);
extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
int __user *usockaddr_len);
extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h
index 45e4e64fd664..753971770733 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/fs.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h
@@ -329,12 +329,17 @@ typedef int __bitwise __kernel_rwf_t;
/* per-IO negation of O_APPEND */
#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020)
+/* Atomic Write */
+#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040)
+
/* mask of flags supported by the kernel */
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
- RWF_APPEND | RWF_NOAPPEND)
+ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC)
+
+#define PROCFS_IOCTL_MAGIC 'f'
/* Pagemap ioctl */
-#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg)
+#define PAGEMAP_SCAN _IOWR(PROCFS_IOCTL_MAGIC, 16, struct pm_scan_arg)
/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */
#define PAGE_IS_WPALLOWED (1 << 0)
@@ -393,4 +398,158 @@ struct pm_scan_arg {
__u64 return_mask;
};
+/* /proc/<pid>/maps ioctl */
+#define PROCMAP_QUERY _IOWR(PROCFS_IOCTL_MAGIC, 17, struct procmap_query)
+
+enum procmap_query_flags {
+ /*
+ * VMA permission flags.
+ *
+ * Can be used as part of procmap_query.query_flags field to look up
+ * only VMAs satisfying specified subset of permissions. E.g., specifying
+ * PROCMAP_QUERY_VMA_READABLE only will return both readable and read/write VMAs,
+ * while having PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_WRITABLE will only
+ * return read/write VMAs, though both executable/non-executable and
+ * private/shared will be ignored.
+ *
+ * PROCMAP_QUERY_VMA_* flags are also returned in procmap_query.vma_flags
+ * field to specify actual VMA permissions.
+ */
+ PROCMAP_QUERY_VMA_READABLE = 0x01,
+ PROCMAP_QUERY_VMA_WRITABLE = 0x02,
+ PROCMAP_QUERY_VMA_EXECUTABLE = 0x04,
+ PROCMAP_QUERY_VMA_SHARED = 0x08,
+ /*
+ * Query modifier flags.
+ *
+ * By default VMA that covers provided address is returned, or -ENOENT
+ * is returned. With PROCMAP_QUERY_COVERING_OR_NEXT_VMA flag set, closest
+ * VMA with vma_start > addr will be returned if no covering VMA is
+ * found.
+ *
+ * PROCMAP_QUERY_FILE_BACKED_VMA instructs query to consider only VMAs that
+ * have file backing. Can be combined with PROCMAP_QUERY_COVERING_OR_NEXT_VMA
+ * to iterate all VMAs with file backing.
+ */
+ PROCMAP_QUERY_COVERING_OR_NEXT_VMA = 0x10,
+ PROCMAP_QUERY_FILE_BACKED_VMA = 0x20,
+};
+
+/*
+ * Input/output argument structured passed into ioctl() call. It can be used
+ * to query a set of VMAs (Virtual Memory Areas) of a process.
+ *
+ * Each field can be one of three kinds, marked in a short comment to the
+ * right of the field:
+ * - "in", input argument, user has to provide this value, kernel doesn't modify it;
+ * - "out", output argument, kernel sets this field with VMA data;
+ * - "in/out", input and output argument; user provides initial value (used
+ * to specify maximum allowable buffer size), and kernel sets it to actual
+ * amount of data written (or zero, if there is no data).
+ *
+ * If matching VMA is found (according to criterias specified by
+ * query_addr/query_flags, all the out fields are filled out, and ioctl()
+ * returns 0. If there is no matching VMA, -ENOENT will be returned.
+ * In case of any other error, negative error code other than -ENOENT is
+ * returned.
+ *
+ * Most of the data is similar to the one returned as text in /proc/<pid>/maps
+ * file, but procmap_query provides more querying flexibility. There are no
+ * consistency guarantees between subsequent ioctl() calls, but data returned
+ * for matched VMA is self-consistent.
+ */
+struct procmap_query {
+ /* Query struct size, for backwards/forward compatibility */
+ __u64 size;
+ /*
+ * Query flags, a combination of enum procmap_query_flags values.
+ * Defines query filtering and behavior, see enum procmap_query_flags.
+ *
+ * Input argument, provided by user. Kernel doesn't modify it.
+ */
+ __u64 query_flags; /* in */
+ /*
+ * Query address. By default, VMA that covers this address will
+ * be looked up. PROCMAP_QUERY_* flags above modify this default
+ * behavior further.
+ *
+ * Input argument, provided by user. Kernel doesn't modify it.
+ */
+ __u64 query_addr; /* in */
+ /* VMA starting (inclusive) and ending (exclusive) address, if VMA is found. */
+ __u64 vma_start; /* out */
+ __u64 vma_end; /* out */
+ /* VMA permissions flags. A combination of PROCMAP_QUERY_VMA_* flags. */
+ __u64 vma_flags; /* out */
+ /* VMA backing page size granularity. */
+ __u64 vma_page_size; /* out */
+ /*
+ * VMA file offset. If VMA has file backing, this specifies offset
+ * within the file that VMA's start address corresponds to.
+ * Is set to zero if VMA has no backing file.
+ */
+ __u64 vma_offset; /* out */
+ /* Backing file's inode number, or zero, if VMA has no backing file. */
+ __u64 inode; /* out */
+ /* Backing file's device major/minor number, or zero, if VMA has no backing file. */
+ __u32 dev_major; /* out */
+ __u32 dev_minor; /* out */
+ /*
+ * If set to non-zero value, signals the request to return VMA name
+ * (i.e., VMA's backing file's absolute path, with " (deleted)" suffix
+ * appended, if file was unlinked from FS) for matched VMA. VMA name
+ * can also be some special name (e.g., "[heap]", "[stack]") or could
+ * be even user-supplied with prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME).
+ *
+ * Kernel will set this field to zero, if VMA has no associated name.
+ * Otherwise kernel will return actual amount of bytes filled in
+ * user-supplied buffer (see vma_name_addr field below), including the
+ * terminating zero.
+ *
+ * If VMA name is longer that user-supplied maximum buffer size,
+ * -E2BIG error is returned.
+ *
+ * If this field is set to non-zero value, vma_name_addr should point
+ * to valid user space memory buffer of at least vma_name_size bytes.
+ * If set to zero, vma_name_addr should be set to zero as well
+ */
+ __u32 vma_name_size; /* in/out */
+ /*
+ * If set to non-zero value, signals the request to extract and return
+ * VMA's backing file's build ID, if the backing file is an ELF file
+ * and it contains embedded build ID.
+ *
+ * Kernel will set this field to zero, if VMA has no backing file,
+ * backing file is not an ELF file, or ELF file has no build ID
+ * embedded.
+ *
+ * Build ID is a binary value (not a string). Kernel will set
+ * build_id_size field to exact number of bytes used for build ID.
+ * If build ID is requested and present, but needs more bytes than
+ * user-supplied maximum buffer size (see build_id_addr field below),
+ * -E2BIG error will be returned.
+ *
+ * If this field is set to non-zero value, build_id_addr should point
+ * to valid user space memory buffer of at least build_id_size bytes.
+ * If set to zero, build_id_addr should be set to zero as well
+ */
+ __u32 build_id_size; /* in/out */
+ /*
+ * User-supplied address of a buffer of at least vma_name_size bytes
+ * for kernel to fill with matched VMA's name (see vma_name_size field
+ * description above for details).
+ *
+ * Should be set to zero if VMA name should not be returned.
+ */
+ __u64 vma_name_addr; /* in */
+ /*
+ * User-supplied address of a buffer of at least build_id_size bytes
+ * for kernel to fill with matched VMA's ELF build ID, if available
+ * (see build_id_size field description above for details).
+ *
+ * Should be set to zero if build ID should not be returned.
+ */
+ __u64 build_id_addr; /* in */
+};
+
#endif /* _UAPI_LINUX_FS_H */
diff --git a/tools/perf/trace/beauty/include/uapi/linux/mount.h b/tools/perf/trace/beauty/include/uapi/linux/mount.h
index ad5478dbad00..225bc366ffcb 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/mount.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/mount.h
@@ -154,7 +154,7 @@ struct mount_attr {
*/
struct statmount {
__u32 size; /* Total size, including strings */
- __u32 __spare1;
+ __u32 mnt_opts; /* [str] Mount options of the mount */
__u64 mask; /* What results were written */
__u32 sb_dev_major; /* Device ID */
__u32 sb_dev_minor;
@@ -172,7 +172,8 @@ struct statmount {
__u64 propagate_from; /* Propagation from in current namespace */
__u32 mnt_root; /* [str] Root of mount relative to root of fs */
__u32 mnt_point; /* [str] Mountpoint relative to current root */
- __u64 __spare2[50];
+ __u64 mnt_ns_id; /* ID of the mount namespace */
+ __u64 __spare2[49];
char str[]; /* Variable size part containing strings */
};
@@ -188,10 +189,12 @@ struct mnt_id_req {
__u32 spare;
__u64 mnt_id;
__u64 param;
+ __u64 mnt_ns_id;
};
/* List of all mnt_id_req versions. */
#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */
+#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */
/*
* @mask bits for statmount(2)
@@ -202,10 +205,13 @@ struct mnt_id_req {
#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */
#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */
#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */
+#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
+#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
/*
* Special @mnt_id values that can be passed to listmount
*/
#define LSMT_ROOT 0xffffffffffffffff /* root mount */
+#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */
#endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h
index 67626d535316..887a25286441 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/stat.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h
@@ -126,9 +126,15 @@ struct statx {
__u64 stx_mnt_id;
__u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */
__u32 stx_dio_offset_align; /* File offset alignment for direct I/O */
- __u64 stx_subvol; /* Subvolume identifier */
/* 0xa0 */
- __u64 __spare3[11]; /* Spare space for future expansion */
+ __u64 stx_subvol; /* Subvolume identifier */
+ __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */
+ __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */
+ /* 0xb0 */
+ __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */
+ __u32 __spare1[1];
+ /* 0xb8 */
+ __u64 __spare3[9]; /* Spare space for future expansion */
/* 0x100 */
};
@@ -157,6 +163,7 @@ struct statx {
#define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */
#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */
+#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */
@@ -192,6 +199,7 @@ struct statx {
#define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */
#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */
#define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */
+#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */
#endif /* _UAPI_LINUX_STAT_H */
diff --git a/tools/perf/trace/beauty/include/uapi/sound/asound.h b/tools/perf/trace/beauty/include/uapi/sound/asound.h
index 628d46a0da92..8bf7e8a0eb6f 100644
--- a/tools/perf/trace/beauty/include/uapi/sound/asound.h
+++ b/tools/perf/trace/beauty/include/uapi/sound/asound.h
@@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image {
* *
*****************************************************************************/
-#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17)
+#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 18)
typedef unsigned long snd_pcm_uframes_t;
typedef signed long snd_pcm_sframes_t;
@@ -334,7 +334,7 @@ union snd_pcm_sync_id {
unsigned char id[16];
unsigned short id16[8];
unsigned int id32[4];
-};
+} __attribute__((deprecated));
struct snd_pcm_info {
unsigned int device; /* RO/WR (control): device number */
@@ -348,7 +348,7 @@ struct snd_pcm_info {
int dev_subclass; /* SNDRV_PCM_SUBCLASS_* */
unsigned int subdevices_count;
unsigned int subdevices_avail;
- union snd_pcm_sync_id sync; /* hardware synchronization ID */
+ unsigned char pad1[16]; /* was: hardware synchronization ID */
unsigned char reserved[64]; /* reserved for future... */
};
@@ -420,7 +420,8 @@ struct snd_pcm_hw_params {
unsigned int rate_num; /* R: rate numerator */
unsigned int rate_den; /* R: rate denominator */
snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */
- unsigned char reserved[64]; /* reserved for future */
+ unsigned char sync[16]; /* R: synchronization ID (perfect sync - one clock source) */
+ unsigned char reserved[48]; /* reserved for future */
};
enum {
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 1730b852a947..6d075648d2cc 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1141,7 +1141,7 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp
int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node,
bool hide_unresolved)
{
- struct machine *machine = maps__machine(node->ms.maps);
+ struct machine *machine = node->ms.maps ? maps__machine(node->ms.maps) : NULL;
maps__put(al->maps);
al->maps = maps__get(node->ms.maps);
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 2340c4f6d0c2..67414944f245 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1501,7 +1501,7 @@ void dso__delete(struct dso *dso)
auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache);
dso_cache__free(dso);
dso__free_a2l(dso);
- zfree(&RC_CHK_ACCESS(dso)->symsrc_filename);
+ dso__free_symsrc_filename(dso);
nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo);
mutex_destroy(dso__lock(dso));
RC_CHK_FREE(dso);
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index 878c1f441868..ed0068251c65 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -602,6 +602,11 @@ static inline void dso__set_symsrc_filename(struct dso *dso, char *val)
RC_CHK_ACCESS(dso)->symsrc_filename = val;
}
+static inline void dso__free_symsrc_filename(struct dso *dso)
+{
+ zfree(&RC_CHK_ACCESS(dso)->symsrc_filename);
+}
+
static inline enum dso_binary_type dso__symtab_type(const struct dso *dso)
{
return RC_CHK_ACCESS(dso)->symtab_type;
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index f6a6f6a91030..16c2b03831f3 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -413,7 +413,7 @@ static int read_unwind_spec_debug_frame(struct dso *dso,
__func__,
dso__symsrc_filename(dso),
debuglink);
- zfree(&dso__symsrc_filename(dso));
+ dso__free_symsrc_filename(dso);
}
dso__set_symsrc_filename(dso, debuglink);
} else {
diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile
index b1e6817f1e54..3946d5254a1f 100644
--- a/tools/power/x86/turbostat/Makefile
+++ b/tools/power/x86/turbostat/Makefile
@@ -46,6 +46,7 @@ snapshot: turbostat
@echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h
@echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h
+ @echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h
@echo PWD=. > $(SNAPSHOT)/Makefile
@echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 8d37acd39201..067717bce1d4 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -28,10 +28,13 @@ name as necessary to disambiguate it from others is necessary. Note that option
.PP
\fB--add attributes\fP add column with counter having specified 'attributes'. The 'location' attribute is required, all others are optional.
.nf
- location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP}
+ location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP | \fBperf/<device>/<event>\fP}
msrDDD is a decimal offset, eg. msr16
msr0xXXX is a hex offset, eg. msr0x10
/sys/path... is an absolute path to a sysfs attribute
+ <device> is a perf device from /sys/bus/event_source/devices/<device> eg. cstate_core
+ <event> is a perf event for given device from /sys/bus/event_source/devices/<device>/events/<event> eg. c1-residency
+ perf/cstate_core/c1-residency would then use /sys/bus/event_source/devices/cstate_core/events/c1-residency
scope: {\fBcpu\fP | \fBcore\fP | \fBpackage\fP}
sample and print the counter for every cpu, core, or package.
@@ -52,6 +55,39 @@ name as necessary to disambiguate it from others is necessary. Note that option
as the column header.
.fi
.PP
+\fB--add pmt,[attr_name=attr_value, ...]\fP add column with a PMT (Intel Platform Monitoring Technology) counter in a similar way to the --add option above, but requires PMT metadata to be supplied to correctly read and display the counter. The metadata can be found in the Intel PMT XML files, hosted at https://github.com/intel/Intel-PMT. For a complete example see "ADD PMT COUNTER EXAMPLE".
+.nf
+ name="name_string"
+ For column header.
+
+ type={\fBraw\fP}
+ 'raw' shows the counter contents in hex.
+ default: raw
+
+ format={\fBraw\fP | \fBdelta\fP}
+ 'raw' shows the counter contents in hex.
+ 'delta' shows the difference in values during the measurement interval.
+ default: raw
+
+ domain={\fBcpu%u\fP | \fBcore%u\fP | \fBpackage%u\fP}
+ 'cpu' per cpu/thread counter.
+ 'core' per core counter.
+ 'package' per package counter.
+ '%u' denotes id of the domain that the counter is associated with. For example core4 would mean that the counter is associated with core number 4.
+
+ offset=\fB%u\fP
+ '%u' offset within the PMT MMIO region.
+
+ lsb=\fB%u\fP
+ '%u' least significant bit within the 64 bit value read from 'offset'. Together with 'msb', used to form a read mask.
+
+ msb=\fB%u\fP
+ '%u' most significant bit within the 64 bit value read from 'offset'. Together with 'lsb', used to form a read mask.
+
+ guid=\fB%x\fP
+ '%x' hex identifier of the PMT MMIO region.
+.fi
+.PP
\fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set. If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed. Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed. Otherwise, the system summary plus the specified set of CPUs are printed. The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44
.PP
\fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names.
@@ -67,10 +103,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
.PP
\fB--quiet\fP Do not decode and print the system configuration header information.
.PP
-+\fB--no-msr\fP Disable all the uses of the MSR driver.
-+.PP
-+\fB--no-perf\fP Disable all the uses of the perf API.
-+.PP
+\fB--no-msr\fP Disable all the uses of the MSR driver.
+.PP
+\fB--no-perf\fP Disable all the uses of the perf API.
+.PP
\fB--interval seconds\fP overrides the default 5.0 second measurement interval.
.PP
\fB--num_iterations num\fP number of the measurement iterations.
@@ -320,7 +356,7 @@ available on all processors.
Here we limit turbostat to showing just the CPU number for cpu0 - cpu3.
We add a counter showing the 32-bit raw value of MSR 0x199 (MSR_IA32_PERF_CTL),
labeling it with the column header, "PRF_CTRL", and display it only once,
-afte the conclusion of a 0.1 second sleep.
+after the conclusion of a 0.1 second sleep.
.nf
sudo ./turbostat --quiet --cpu 0-3 --show CPU --add msr0x199,u32,raw,PRF_CTRL sleep .1
0.101604 sec
@@ -333,6 +369,56 @@ CPU PRF_CTRL
.fi
+.SH ADD PERF COUNTER EXAMPLE
+Here we limit turbostat to showing just the CPU number for cpu0 - cpu3.
+We add a counter showing time spent in C1 core cstate,
+labeling it with the column header, "pCPU%c1", and display it only once,
+after the conclusion of a 0.1 second sleep.
+We also show CPU%c1 built-in counter that should show similar values.
+.nf
+sudo ./turbostat --quiet --cpu 0-3 --show CPU,CPU%c1 --add perf/cstate_core/c1-residency,cpu,delta,percent,pCPU%c1 sleep .1
+0.102448 sec
+CPU pCPU%c1 CPU%c1
+- 34.89 34.89
+0 45.99 45.99
+1 45.94 45.94
+2 23.83 23.83
+3 23.84 23.84
+
+.fi
+
+.SH ADD PMT COUNTER EXAMPLE
+Here we limit turbostat to showing just the CPU number 0.
+We add two counters, showing crystal clock count and the DC6 residency.
+All the parameters passed are based on the metadata found in the PMT XML files.
+
+For the crystal clock count, we
+label it with the column header, "XTAL",
+we set the type to 'raw', to read the number of clock ticks in hex,
+we set the format to 'delta', to display the difference in ticks during the measurement interval,
+we set the domain to 'package0', to collect it and associate it with the whole package number 0,
+we set the offset to '0', which is an offset of the counter within the PMT MMIO region,
+we set the lsb and msb to cover all 64 bits of the read 64 bit value,
+and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value.
+
+For the DC6 residency counter, we
+label it with the column header, "Die%c6",
+we set the type to 'txtal_time', to obtain the percent residency value,
+we set the format to 'delta', to display the difference in ticks during the measurement interval,
+we set the domain to 'package0', to collect it and associate it with the whole package number 0,
+we set the offset to '120', which is an offset of the counter within the PMT MMIO region,
+we set the lsb and msb to cover all 64 bits of the read 64 bit value,
+and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value.
+
+.nf
+sudo ./turbostat --quiet --cpu 0 --show CPU --add pmt,name=XTAL,type=raw,format=delta,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102 --add pmt,name=Die%c6,type=txtal_time,format=delta,domain=package0,offset=120,lsb=0,msb=63,guid=0x1a067102
+0.104352 sec
+CPU XTAL Die%c6
+- 0x0000006d4d957ca7 0.00
+0 0x0000006d4d957ca7 0.00
+0.102448 sec
+.fi
+
.SH INPUT
For interval-mode, turbostat will immediately end the current interval
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 9f5d053d4bc6..089220aaa5c9 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -9,6 +9,30 @@
#define _GNU_SOURCE
#include MSRHEADER
+
+// copied from arch/x86/include/asm/cpu_device_id.h
+#define VFM_MODEL_BIT 0
+#define VFM_FAMILY_BIT 8
+#define VFM_VENDOR_BIT 16
+#define VFM_RSVD_BIT 24
+
+#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
+#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
+#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)
+
+#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
+#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
+#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)
+
+#define VFM_MAKE(_vendor, _family, _model) ( \
+ ((_model) << VFM_MODEL_BIT) | \
+ ((_family) << VFM_FAMILY_BIT) | \
+ ((_vendor) << VFM_VENDOR_BIT) \
+)
+// end copied section
+
+#define X86_VENDOR_INTEL 0
+
#include INTEL_FAMILY_HEADER
#include BUILD_BUG_HEADER
#include <stdarg.h>
@@ -20,6 +44,7 @@
#include <sys/stat.h>
#include <sys/select.h>
#include <sys/resource.h>
+#include <sys/mman.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/time.h>
@@ -55,15 +80,39 @@
*/
#define NAME_BYTES 20
#define PATH_BYTES 128
+#define PERF_NAME_BYTES 128
#define MAX_NOFILE 0x8000
+#define COUNTER_KIND_PERF_PREFIX "perf/"
+#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX)
+#define PERF_DEV_NAME_BYTES 32
+#define PERF_EVT_NAME_BYTES 32
+
enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
-enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR };
-enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR };
-enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR };
+enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR };
+
+struct perf_counter_info {
+ struct perf_counter_info *next;
+
+ /* How to open the counter / What counter it is. */
+ char device[PERF_DEV_NAME_BYTES];
+ char event[PERF_EVT_NAME_BYTES];
+
+ /* How to show/format the counter. */
+ char name[PERF_NAME_BYTES];
+ unsigned int width;
+ enum counter_scope scope;
+ enum counter_type type;
+ enum counter_format format;
+ double scale;
+
+ /* For reading the counter. */
+ int *fd_perf_per_domain;
+ size_t num_domains;
+};
struct sysfs_path {
char path[PATH_BYTES];
@@ -144,6 +193,7 @@ struct msr_counter bic[] = {
{ 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 },
{ 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 },
{ 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 },
+ { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 },
};
#define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
@@ -205,11 +255,12 @@ struct msr_counter bic[] = {
#define BIC_SAM_mc6 (1ULL << 55)
#define BIC_SAMMHz (1ULL << 56)
#define BIC_SAMACTMHz (1ULL << 57)
+#define BIC_Diec6 (1ULL << 58)
#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die )
#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)
#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
-#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6)
+#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6)
#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)
#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)
@@ -252,7 +303,6 @@ char *proc_stat = "/proc/stat";
FILE *outf;
int *fd_percpu;
int *fd_instr_count_percpu;
-struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */
struct timeval interval_tv = { 5, 0 };
struct timespec interval_ts = { 5, 0 };
@@ -267,6 +317,7 @@ unsigned int summary_only;
unsigned int list_header_only;
unsigned int dump_only;
unsigned int has_aperf;
+unsigned int has_aperf_access;
unsigned int has_epb;
unsigned int has_turbo;
unsigned int is_hybrid;
@@ -307,7 +358,6 @@ unsigned int first_counter_read = 1;
int ignore_stdin;
bool no_msr;
bool no_perf;
-enum amperf_source amperf_source;
enum gfx_sysfs_idx {
GFX_rc6,
@@ -367,7 +417,7 @@ struct platform_features {
};
struct platform_data {
- unsigned int model;
+ unsigned int vfm;
const struct platform_features *features;
};
@@ -910,75 +960,75 @@ static const struct platform_features amd_features_with_rapl = {
};
static const struct platform_data turbostat_pdata[] = {
- { INTEL_FAM6_NEHALEM, &nhm_features },
- { INTEL_FAM6_NEHALEM_G, &nhm_features },
- { INTEL_FAM6_NEHALEM_EP, &nhm_features },
- { INTEL_FAM6_NEHALEM_EX, &nhx_features },
- { INTEL_FAM6_WESTMERE, &nhm_features },
- { INTEL_FAM6_WESTMERE_EP, &nhm_features },
- { INTEL_FAM6_WESTMERE_EX, &nhx_features },
- { INTEL_FAM6_SANDYBRIDGE, &snb_features },
- { INTEL_FAM6_SANDYBRIDGE_X, &snx_features },
- { INTEL_FAM6_IVYBRIDGE, &ivb_features },
- { INTEL_FAM6_IVYBRIDGE_X, &ivx_features },
- { INTEL_FAM6_HASWELL, &hsw_features },
- { INTEL_FAM6_HASWELL_X, &hsx_features },
- { INTEL_FAM6_HASWELL_L, &hswl_features },
- { INTEL_FAM6_HASWELL_G, &hswg_features },
- { INTEL_FAM6_BROADWELL, &bdw_features },
- { INTEL_FAM6_BROADWELL_G, &bdwg_features },
- { INTEL_FAM6_BROADWELL_X, &bdx_features },
- { INTEL_FAM6_BROADWELL_D, &bdx_features },
- { INTEL_FAM6_SKYLAKE_L, &skl_features },
- { INTEL_FAM6_SKYLAKE, &skl_features },
- { INTEL_FAM6_SKYLAKE_X, &skx_features },
- { INTEL_FAM6_KABYLAKE_L, &skl_features },
- { INTEL_FAM6_KABYLAKE, &skl_features },
- { INTEL_FAM6_COMETLAKE, &skl_features },
- { INTEL_FAM6_COMETLAKE_L, &skl_features },
- { INTEL_FAM6_CANNONLAKE_L, &cnl_features },
- { INTEL_FAM6_ICELAKE_X, &icx_features },
- { INTEL_FAM6_ICELAKE_D, &icx_features },
- { INTEL_FAM6_ICELAKE_L, &cnl_features },
- { INTEL_FAM6_ICELAKE_NNPI, &cnl_features },
- { INTEL_FAM6_ROCKETLAKE, &cnl_features },
- { INTEL_FAM6_TIGERLAKE_L, &cnl_features },
- { INTEL_FAM6_TIGERLAKE, &cnl_features },
- { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features },
- { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features },
- { INTEL_FAM6_GRANITERAPIDS_X, &spr_features },
- { INTEL_FAM6_LAKEFIELD, &cnl_features },
- { INTEL_FAM6_ALDERLAKE, &adl_features },
- { INTEL_FAM6_ALDERLAKE_L, &adl_features },
- { INTEL_FAM6_RAPTORLAKE, &adl_features },
- { INTEL_FAM6_RAPTORLAKE_P, &adl_features },
- { INTEL_FAM6_RAPTORLAKE_S, &adl_features },
- { INTEL_FAM6_METEORLAKE, &cnl_features },
- { INTEL_FAM6_METEORLAKE_L, &cnl_features },
- { INTEL_FAM6_ARROWLAKE_H, &arl_features },
- { INTEL_FAM6_ARROWLAKE_U, &arl_features },
- { INTEL_FAM6_ARROWLAKE, &arl_features },
- { INTEL_FAM6_LUNARLAKE_M, &arl_features },
- { INTEL_FAM6_ATOM_SILVERMONT, &slv_features },
- { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features },
- { INTEL_FAM6_ATOM_AIRMONT, &amt_features },
- { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features },
- { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features },
- { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features },
- { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features },
- { INTEL_FAM6_ATOM_TREMONT, &tmt_features },
- { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features },
- { INTEL_FAM6_ATOM_GRACEMONT, &adl_features },
- { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features },
- { INTEL_FAM6_ATOM_CRESTMONT, &grr_features },
- { INTEL_FAM6_XEON_PHI_KNL, &knl_features },
- { INTEL_FAM6_XEON_PHI_KNM, &knl_features },
+ { INTEL_NEHALEM, &nhm_features },
+ { INTEL_NEHALEM_G, &nhm_features },
+ { INTEL_NEHALEM_EP, &nhm_features },
+ { INTEL_NEHALEM_EX, &nhx_features },
+ { INTEL_WESTMERE, &nhm_features },
+ { INTEL_WESTMERE_EP, &nhm_features },
+ { INTEL_WESTMERE_EX, &nhx_features },
+ { INTEL_SANDYBRIDGE, &snb_features },
+ { INTEL_SANDYBRIDGE_X, &snx_features },
+ { INTEL_IVYBRIDGE, &ivb_features },
+ { INTEL_IVYBRIDGE_X, &ivx_features },
+ { INTEL_HASWELL, &hsw_features },
+ { INTEL_HASWELL_X, &hsx_features },
+ { INTEL_HASWELL_L, &hswl_features },
+ { INTEL_HASWELL_G, &hswg_features },
+ { INTEL_BROADWELL, &bdw_features },
+ { INTEL_BROADWELL_G, &bdwg_features },
+ { INTEL_BROADWELL_X, &bdx_features },
+ { INTEL_BROADWELL_D, &bdx_features },
+ { INTEL_SKYLAKE_L, &skl_features },
+ { INTEL_SKYLAKE, &skl_features },
+ { INTEL_SKYLAKE_X, &skx_features },
+ { INTEL_KABYLAKE_L, &skl_features },
+ { INTEL_KABYLAKE, &skl_features },
+ { INTEL_COMETLAKE, &skl_features },
+ { INTEL_COMETLAKE_L, &skl_features },
+ { INTEL_CANNONLAKE_L, &cnl_features },
+ { INTEL_ICELAKE_X, &icx_features },
+ { INTEL_ICELAKE_D, &icx_features },
+ { INTEL_ICELAKE_L, &cnl_features },
+ { INTEL_ICELAKE_NNPI, &cnl_features },
+ { INTEL_ROCKETLAKE, &cnl_features },
+ { INTEL_TIGERLAKE_L, &cnl_features },
+ { INTEL_TIGERLAKE, &cnl_features },
+ { INTEL_SAPPHIRERAPIDS_X, &spr_features },
+ { INTEL_EMERALDRAPIDS_X, &spr_features },
+ { INTEL_GRANITERAPIDS_X, &spr_features },
+ { INTEL_LAKEFIELD, &cnl_features },
+ { INTEL_ALDERLAKE, &adl_features },
+ { INTEL_ALDERLAKE_L, &adl_features },
+ { INTEL_RAPTORLAKE, &adl_features },
+ { INTEL_RAPTORLAKE_P, &adl_features },
+ { INTEL_RAPTORLAKE_S, &adl_features },
+ { INTEL_METEORLAKE, &cnl_features },
+ { INTEL_METEORLAKE_L, &cnl_features },
+ { INTEL_ARROWLAKE_H, &arl_features },
+ { INTEL_ARROWLAKE_U, &arl_features },
+ { INTEL_ARROWLAKE, &arl_features },
+ { INTEL_LUNARLAKE_M, &arl_features },
+ { INTEL_ATOM_SILVERMONT, &slv_features },
+ { INTEL_ATOM_SILVERMONT_D, &slvd_features },
+ { INTEL_ATOM_AIRMONT, &amt_features },
+ { INTEL_ATOM_GOLDMONT, &gmt_features },
+ { INTEL_ATOM_GOLDMONT_D, &gmtd_features },
+ { INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features },
+ { INTEL_ATOM_TREMONT_D, &tmtd_features },
+ { INTEL_ATOM_TREMONT, &tmt_features },
+ { INTEL_ATOM_TREMONT_L, &tmt_features },
+ { INTEL_ATOM_GRACEMONT, &adl_features },
+ { INTEL_ATOM_CRESTMONT_X, &srf_features },
+ { INTEL_ATOM_CRESTMONT, &grr_features },
+ { INTEL_XEON_PHI_KNL, &knl_features },
+ { INTEL_XEON_PHI_KNM, &knl_features },
/*
* Missing support for
- * INTEL_FAM6_ICELAKE
- * INTEL_FAM6_ATOM_SILVERMONT_MID
- * INTEL_FAM6_ATOM_AIRMONT_MID
- * INTEL_FAM6_ATOM_AIRMONT_NP
+ * INTEL_ICELAKE
+ * INTEL_ATOM_SILVERMONT_MID
+ * INTEL_ATOM_AIRMONT_MID
+ * INTEL_ATOM_AIRMONT_NP
*/
{ 0, NULL },
};
@@ -1003,11 +1053,11 @@ void probe_platform_features(unsigned int family, unsigned int model)
return;
}
- if (!genuine_intel || family != 6)
+ if (!genuine_intel)
return;
for (i = 0; turbostat_pdata[i].features; i++) {
- if (turbostat_pdata[i].model == model) {
+ if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) {
platform = turbostat_pdata[i].features;
return;
}
@@ -1034,8 +1084,13 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi
#define MAX_ADDED_THREAD_COUNTERS 24
#define MAX_ADDED_CORE_COUNTERS 8
#define MAX_ADDED_PACKAGE_COUNTERS 16
+#define PMT_MAX_ADDED_THREAD_COUNTERS 24
+#define PMT_MAX_ADDED_CORE_COUNTERS 8
+#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16
#define BITMASK_SIZE 32
+#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr))
+
/* Indexes used to map data read from perf and MSRs into global variables */
enum rapl_rci_index {
RAPL_RCI_INDEX_ENERGY_PKG = 0,
@@ -1056,19 +1111,13 @@ enum rapl_unit {
struct rapl_counter_info_t {
unsigned long long data[NUM_RAPL_COUNTERS];
- enum rapl_source source[NUM_RAPL_COUNTERS];
+ enum counter_source source[NUM_RAPL_COUNTERS];
unsigned long long flags[NUM_RAPL_COUNTERS];
double scale[NUM_RAPL_COUNTERS];
enum rapl_unit unit[NUM_RAPL_COUNTERS];
-
- union {
- /* Active when source == RAPL_SOURCE_MSR */
- struct {
- unsigned long long msr[NUM_RAPL_COUNTERS];
- unsigned long long msr_mask[NUM_RAPL_COUNTERS];
- int msr_shift[NUM_RAPL_COUNTERS];
- };
- };
+ unsigned long long msr[NUM_RAPL_COUNTERS];
+ unsigned long long msr_mask[NUM_RAPL_COUNTERS];
+ int msr_shift[NUM_RAPL_COUNTERS];
int fd_perf;
};
@@ -1224,7 +1273,7 @@ enum ccstate_rci_index {
struct cstate_counter_info_t {
unsigned long long data[NUM_CSTATE_COUNTERS];
- enum cstate_source source[NUM_CSTATE_COUNTERS];
+ enum counter_source source[NUM_CSTATE_COUNTERS];
unsigned long long msr[NUM_CSTATE_COUNTERS];
int fd_perf_core;
int fd_perf_pkg;
@@ -1361,6 +1410,167 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
},
};
+/* Indexes used to map data read from perf and MSRs into global variables */
+enum msr_rci_index {
+ MSR_RCI_INDEX_APERF = 0,
+ MSR_RCI_INDEX_MPERF = 1,
+ MSR_RCI_INDEX_SMI = 2,
+ NUM_MSR_COUNTERS,
+};
+
+struct msr_counter_info_t {
+ unsigned long long data[NUM_MSR_COUNTERS];
+ enum counter_source source[NUM_MSR_COUNTERS];
+ unsigned long long msr[NUM_MSR_COUNTERS];
+ unsigned long long msr_mask[NUM_MSR_COUNTERS];
+ int fd_perf;
+};
+
+struct msr_counter_info_t *msr_counter_info;
+unsigned int msr_counter_info_size;
+
+struct msr_counter_arch_info {
+ const char *perf_subsys;
+ const char *perf_name;
+ unsigned long long msr;
+ unsigned long long msr_mask;
+ unsigned int rci_index; /* Maps data from perf counters to global variables */
+ bool needed;
+ bool present;
+};
+
+enum msr_arch_info_index {
+ MSR_ARCH_INFO_APERF_INDEX = 0,
+ MSR_ARCH_INFO_MPERF_INDEX = 1,
+ MSR_ARCH_INFO_SMI_INDEX = 2,
+};
+
+static struct msr_counter_arch_info msr_counter_arch_infos[] = {
+ [MSR_ARCH_INFO_APERF_INDEX] = {
+ .perf_subsys = "msr",
+ .perf_name = "aperf",
+ .msr = MSR_IA32_APERF,
+ .msr_mask = 0xFFFFFFFFFFFFFFFF,
+ .rci_index = MSR_RCI_INDEX_APERF,
+ },
+
+ [MSR_ARCH_INFO_MPERF_INDEX] = {
+ .perf_subsys = "msr",
+ .perf_name = "mperf",
+ .msr = MSR_IA32_MPERF,
+ .msr_mask = 0xFFFFFFFFFFFFFFFF,
+ .rci_index = MSR_RCI_INDEX_MPERF,
+ },
+
+ [MSR_ARCH_INFO_SMI_INDEX] = {
+ .perf_subsys = "msr",
+ .perf_name = "smi",
+ .msr = MSR_SMI_COUNT,
+ .msr_mask = 0xFFFFFFFF,
+ .rci_index = MSR_RCI_INDEX_SMI,
+ },
+};
+
+/* Can be redefined when compiling, useful for testing. */
+#ifndef SYSFS_TELEM_PATH
+#define SYSFS_TELEM_PATH "/sys/class/intel_pmt"
+#endif
+
+#define PMT_COUNTER_MTL_DC6_OFFSET 120
+#define PMT_COUNTER_MTL_DC6_LSB 0
+#define PMT_COUNTER_MTL_DC6_MSB 63
+#define PMT_MTL_DC6_GUID 0x1a067102
+
+#define PMT_COUNTER_NAME_SIZE_BYTES 16
+#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32
+
+struct pmt_mmio {
+ struct pmt_mmio *next;
+
+ unsigned int guid;
+ unsigned int size;
+
+ /* Base pointer to the mmaped memory. */
+ void *mmio_base;
+
+ /*
+ * Offset to be applied to the mmio_base
+ * to get the beginning of the PMT counters for given GUID.
+ */
+ unsigned long pmt_offset;
+} *pmt_mmios;
+
+enum pmt_datatype {
+ PMT_TYPE_RAW,
+ PMT_TYPE_XTAL_TIME,
+};
+
+struct pmt_domain_info {
+ /*
+ * Pointer to the MMIO obtained by applying a counter offset
+ * to the mmio_base of the mmaped region for the given GUID.
+ *
+ * This is where to read the raw value of the counter from.
+ */
+ unsigned long *pcounter;
+};
+
+struct pmt_counter {
+ struct pmt_counter *next;
+
+ /* PMT metadata */
+ char name[PMT_COUNTER_NAME_SIZE_BYTES];
+ enum pmt_datatype type;
+ enum counter_scope scope;
+ unsigned int lsb;
+ unsigned int msb;
+
+ /* BIC-like metadata */
+ enum counter_format format;
+
+ unsigned int num_domains;
+ struct pmt_domain_info *domains;
+};
+
+unsigned int pmt_counter_get_width(const struct pmt_counter *p)
+{
+ return (p->msb - p->lsb) + 1;
+}
+
+void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size)
+{
+ struct pmt_domain_info *new_mem;
+
+ new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains));
+ if (!new_mem) {
+ fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__);
+ exit(1);
+ }
+
+ /* Zero initialize just allocated memory. */
+ const size_t num_new_domains = new_size - pcounter->num_domains;
+
+ memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains));
+
+ pcounter->num_domains = new_size;
+ pcounter->domains = new_mem;
+}
+
+void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size)
+{
+ /*
+ * Allocate more memory ahead of time.
+ *
+ * Always allocate space for at least 8 elements
+ * and double the size when growing.
+ */
+ if (new_size < 8)
+ new_size = 8;
+ new_size = MAX(new_size, pcounter->num_domains * 2);
+
+ pmt_counter_resize_(pcounter, new_size);
+}
+
struct thread_data {
struct timeval tv_begin;
struct timeval tv_end;
@@ -1378,6 +1588,8 @@ struct thread_data {
unsigned int flags;
bool is_atom;
unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
+ unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS];
+ unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS];
} *thread_even, *thread_odd;
struct core_data {
@@ -1391,6 +1603,8 @@ struct core_data {
unsigned int core_id;
unsigned long long core_throt_cnt;
unsigned long long counter[MAX_ADDED_CORE_COUNTERS];
+ unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS];
+ unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS];
} *core_even, *core_odd;
struct pkg_data {
@@ -1423,7 +1637,10 @@ struct pkg_data {
struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */
unsigned int pkg_temp_c;
unsigned int uncore_mhz;
+ unsigned long long die_c6;
unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS];
+ unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS];
+ unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS];
} *package_even, *package_odd;
#define ODD_COUNTERS thread_odd, core_odd, package_odd
@@ -1558,12 +1775,25 @@ int idx_valid(int idx)
}
struct sys_counters {
+ /* MSR added counters */
unsigned int added_thread_counters;
unsigned int added_core_counters;
unsigned int added_package_counters;
struct msr_counter *tp;
struct msr_counter *cp;
struct msr_counter *pp;
+
+ /* perf added counters */
+ unsigned int added_thread_perf_counters;
+ unsigned int added_core_perf_counters;
+ unsigned int added_package_perf_counters;
+ struct perf_counter_info *perf_tp;
+ struct perf_counter_info *perf_cp;
+ struct perf_counter_info *perf_pp;
+
+ struct pmt_counter *pmt_tp;
+ struct pmt_counter *pmt_cp;
+ struct pmt_counter *pmt_pp;
} sys;
static size_t free_msr_counters_(struct msr_counter **pp)
@@ -1747,7 +1977,7 @@ int get_msr_fd(int cpu)
static void bic_disable_msr_access(void)
{
- const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp |
+ const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp |
BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp;
bic_enabled &= ~bic_msrs;
@@ -1823,6 +2053,23 @@ int probe_msr(int cpu, off_t offset)
return 0;
}
+/* Convert CPU ID to domain ID for given added perf counter. */
+unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu)
+{
+ switch (pc->scope) {
+ case SCOPE_CPU:
+ return cpu;
+
+ case SCOPE_CORE:
+ return cpus[cpu].physical_core_id;
+
+ case SCOPE_PACKAGE:
+ return cpus[cpu].physical_package_id;
+ }
+
+ __builtin_unreachable();
+}
+
#define MAX_DEFERRED 16
char *deferred_add_names[MAX_DEFERRED];
char *deferred_skip_names[MAX_DEFERRED];
@@ -1846,9 +2093,12 @@ void help(void)
"to print statistics, until interrupted.\n"
" -a, --add add a counter\n"
" eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
+ " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n"
+ " eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n"
" -c, --cpu cpu-set limit output to summary plus cpu-set:\n"
" {core | package | j,k,l..m,n-p }\n"
" -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n"
+ " debug messages are printed to stderr\n"
" -D, --Dump displays the raw counter values\n"
" -e, --enable [all | column]\n"
" shows all or the specified disabled column\n"
@@ -1955,6 +2205,8 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
void print_header(char *delim)
{
struct msr_counter *mp;
+ struct perf_counter_info *pp;
+ struct pmt_counter *ppmt;
int printed = 0;
if (DO_BIC(BIC_USEC))
@@ -2012,6 +2264,40 @@ void print_header(char *delim)
}
}
+ for (pp = sys.perf_tp; pp; pp = pp->next) {
+
+ if (pp->format == FORMAT_RAW) {
+ if (pp->width == 64)
+ outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
+ else
+ outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
+ } else {
+ if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+ outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
+ else
+ outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
+ }
+ }
+
+ ppmt = sys.pmt_tp;
+ while (ppmt) {
+ switch (ppmt->type) {
+ case PMT_TYPE_RAW:
+ if (pmt_counter_get_width(ppmt) <= 32)
+ outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
+ else
+ outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
+
+ break;
+
+ case PMT_TYPE_XTAL_TIME:
+ outp += sprintf(outp, "%s%s", delim, ppmt->name);
+ break;
+ }
+
+ ppmt = ppmt->next;
+ }
+
if (DO_BIC(BIC_CPU_c1))
outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
if (DO_BIC(BIC_CPU_c3))
@@ -2052,6 +2338,40 @@ void print_header(char *delim)
}
}
+ for (pp = sys.perf_cp; pp; pp = pp->next) {
+
+ if (pp->format == FORMAT_RAW) {
+ if (pp->width == 64)
+ outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
+ else
+ outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
+ } else {
+ if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+ outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
+ else
+ outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
+ }
+ }
+
+ ppmt = sys.pmt_cp;
+ while (ppmt) {
+ switch (ppmt->type) {
+ case PMT_TYPE_RAW:
+ if (pmt_counter_get_width(ppmt) <= 32)
+ outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
+ else
+ outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
+
+ break;
+
+ case PMT_TYPE_XTAL_TIME:
+ outp += sprintf(outp, "%s%s", delim, ppmt->name);
+ break;
+ }
+
+ ppmt = ppmt->next;
+ }
+
if (DO_BIC(BIC_PkgTmp))
outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));
@@ -2096,6 +2416,8 @@ void print_header(char *delim)
outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : ""));
if (DO_BIC(BIC_Pkgpc10))
outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : ""));
+ if (DO_BIC(BIC_Diec6))
+ outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : ""));
if (DO_BIC(BIC_CPU_LPI))
outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : ""));
if (DO_BIC(BIC_SYS_LPI))
@@ -2147,6 +2469,40 @@ void print_header(char *delim)
}
}
+ for (pp = sys.perf_pp; pp; pp = pp->next) {
+
+ if (pp->format == FORMAT_RAW) {
+ if (pp->width == 64)
+ outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
+ else
+ outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
+ } else {
+ if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+ outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
+ else
+ outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
+ }
+ }
+
+ ppmt = sys.pmt_pp;
+ while (ppmt) {
+ switch (ppmt->type) {
+ case PMT_TYPE_RAW:
+ if (pmt_counter_get_width(ppmt) <= 32)
+ outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
+ else
+ outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
+
+ break;
+
+ case PMT_TYPE_XTAL_TIME:
+ outp += sprintf(outp, "%s%s", delim, ppmt->name);
+ break;
+ }
+
+ ppmt = ppmt->next;
+ }
+
outp += sprintf(outp, "\n");
}
@@ -2267,6 +2623,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
char *fmt8;
int i;
struct msr_counter *mp;
+ struct perf_counter_info *pp;
+ struct pmt_counter *ppmt;
char *delim = "\t";
int printed = 0;
@@ -2404,6 +2762,51 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
}
}
+ /* Added perf counters */
+ for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) {
+ if (pp->format == FORMAT_RAW) {
+ if (pp->width == 32)
+ outp +=
+ sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+ (unsigned int)t->perf_counter[i]);
+ else
+ outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]);
+ } else if (pp->format == FORMAT_DELTA) {
+ if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+ outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]);
+ else
+ outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]);
+ } else if (pp->format == FORMAT_PERCENT) {
+ if (pp->type == COUNTER_USEC)
+ outp +=
+ sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
+ t->perf_counter[i] / interval_float / 10000);
+ else
+ outp +=
+ sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc);
+ }
+ }
+
+ for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+ switch (ppmt->type) {
+ case PMT_TYPE_RAW:
+ if (pmt_counter_get_width(ppmt) <= 32)
+ outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+ (unsigned int)t->pmt_counter[i]);
+ else
+ outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]);
+
+ break;
+
+ case PMT_TYPE_XTAL_TIME:
+ const unsigned long value_raw = t->pmt_counter[i];
+ const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
+
+ outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
+ break;
+ }
+ }
+
/* C1 */
if (DO_BIC(BIC_CPU_c1))
outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
@@ -2447,6 +2850,44 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
}
}
+ for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW) {
+ if (pp->width == 32)
+ outp +=
+ sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+ (unsigned int)c->perf_counter[i]);
+ else
+ outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]);
+ } else if (pp->format == FORMAT_DELTA) {
+ if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+ outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]);
+ else
+ outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]);
+ } else if (pp->format == FORMAT_PERCENT) {
+ outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc);
+ }
+ }
+
+ for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+ switch (ppmt->type) {
+ case PMT_TYPE_RAW:
+ if (pmt_counter_get_width(ppmt) <= 32)
+ outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+ (unsigned int)c->pmt_counter[i]);
+ else
+ outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]);
+
+ break;
+
+ case PMT_TYPE_XTAL_TIME:
+ const unsigned long value_raw = c->pmt_counter[i];
+ const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
+
+ outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
+ break;
+ }
+ }
+
fmt8 = "%s%.2f";
if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
@@ -2526,6 +2967,10 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
if (DO_BIC(BIC_Pkgpc10))
outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
+ if (DO_BIC(BIC_Diec6))
+ outp +=
+ sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float);
+
if (DO_BIC(BIC_CPU_LPI)) {
if (p->cpu_lpi >= 0)
outp +=
@@ -2601,6 +3046,47 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000);
}
+ for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW) {
+ if (pp->width == 32)
+ outp +=
+ sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+ (unsigned int)p->perf_counter[i]);
+ else
+ outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]);
+ } else if (pp->format == FORMAT_DELTA) {
+ if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+ outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]);
+ else
+ outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]);
+ } else if (pp->format == FORMAT_PERCENT) {
+ outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc);
+ } else if (pp->type == COUNTER_K2M) {
+ outp +=
+ sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000);
+ }
+ }
+
+ for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+ switch (ppmt->type) {
+ case PMT_TYPE_RAW:
+ if (pmt_counter_get_width(ppmt) <= 32)
+ outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+ (unsigned int)p->pmt_counter[i]);
+ else
+ outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]);
+
+ break;
+
+ case PMT_TYPE_XTAL_TIME:
+ const unsigned long value_raw = p->pmt_counter[i];
+ const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
+
+ outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
+ break;
+ }
+ }
+
done:
if (*(outp - 1) != '\n')
outp += sprintf(outp, "\n");
@@ -2654,6 +3140,8 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
{
int i;
struct msr_counter *mp;
+ struct perf_counter_info *pp;
+ struct pmt_counter *ppmt;
if (DO_BIC(BIC_Totl_c0))
old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
@@ -2674,6 +3162,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
old->pc8 = new->pc8 - old->pc8;
old->pc9 = new->pc9 - old->pc9;
old->pc10 = new->pc10 - old->pc10;
+ old->die_c6 = new->die_c6 - old->die_c6;
old->cpu_lpi = new->cpu_lpi - old->cpu_lpi;
old->sys_lpi = new->sys_lpi - old->sys_lpi;
old->pkg_temp_c = new->pkg_temp_c;
@@ -2714,6 +3203,22 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
old->counter[i] = new->counter[i] - old->counter[i];
}
+ for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ old->perf_counter[i] = new->perf_counter[i];
+ else if (pp->format == FORMAT_AVERAGE)
+ old->perf_counter[i] = new->perf_counter[i];
+ else
+ old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
+ }
+
+ for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+ if (ppmt->format == FORMAT_RAW)
+ old->pmt_counter[i] = new->pmt_counter[i];
+ else
+ old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
+ }
+
return 0;
}
@@ -2721,6 +3226,8 @@ void delta_core(struct core_data *new, struct core_data *old)
{
int i;
struct msr_counter *mp;
+ struct perf_counter_info *pp;
+ struct pmt_counter *ppmt;
old->c3 = new->c3 - old->c3;
old->c6 = new->c6 - old->c6;
@@ -2737,6 +3244,20 @@ void delta_core(struct core_data *new, struct core_data *old)
else
old->counter[i] = new->counter[i] - old->counter[i];
}
+
+ for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ old->perf_counter[i] = new->perf_counter[i];
+ else
+ old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
+ }
+
+ for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+ if (ppmt->format == FORMAT_RAW)
+ old->pmt_counter[i] = new->pmt_counter[i];
+ else
+ old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
+ }
}
int soft_c1_residency_display(int bic)
@@ -2754,6 +3275,8 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
{
int i;
struct msr_counter *mp;
+ struct perf_counter_info *pp;
+ struct pmt_counter *ppmt;
/* we run cpuid just the 1st time, copy the results */
if (DO_BIC(BIC_APIC))
@@ -2832,6 +3355,21 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
else
old->counter[i] = new->counter[i] - old->counter[i];
}
+
+ for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ old->perf_counter[i] = new->perf_counter[i];
+ else
+ old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
+ }
+
+ for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+ if (ppmt->format == FORMAT_RAW)
+ old->pmt_counter[i] = new->pmt_counter[i];
+ else
+ old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
+ }
+
return 0;
}
@@ -2908,6 +3446,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
p->pc8 = 0;
p->pc9 = 0;
p->pc10 = 0;
+ p->die_c6 = 0;
p->cpu_lpi = 0;
p->sys_lpi = 0;
@@ -2934,6 +3473,14 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
p->counter[i] = 0;
+
+	memset(&t->perf_counter[0], 0, sizeof(t->perf_counter));
+	memset(&c->perf_counter[0], 0, sizeof(c->perf_counter));
+	memset(&p->perf_counter[0], 0, sizeof(p->perf_counter));
+
+	/*
+	 * memset() takes a length in bytes.  ARRAY_SIZE() yields the number of
+	 * elements, which would leave most of each array uncleared, so size
+	 * the PMT arrays with sizeof() exactly like the perf arrays above.
+	 */
+	memset(&t->pmt_counter[0], 0, sizeof(t->pmt_counter));
+	memset(&c->pmt_counter[0], 0, sizeof(c->pmt_counter));
+	memset(&p->pmt_counter[0], 0, sizeof(p->pmt_counter));
}
void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
@@ -2954,6 +3501,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
int i;
struct msr_counter *mp;
+ struct perf_counter_info *pp;
+ struct pmt_counter *ppmt;
/* copy un-changing apic_id's */
if (DO_BIC(BIC_APIC))
@@ -2984,6 +3533,16 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
average.threads.counter[i] += t->counter[i];
}
+ for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ continue;
+ average.threads.perf_counter[i] += t->perf_counter[i];
+ }
+
+ for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+ average.threads.pmt_counter[i] += t->pmt_counter[i];
+ }
+
/* sum per-core values only for 1st thread in core */
if (!is_cpu_first_thread_in_core(t, c, p))
return 0;
@@ -3004,6 +3563,16 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
average.cores.counter[i] += c->counter[i];
}
+ for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ continue;
+ average.cores.perf_counter[i] += c->perf_counter[i];
+ }
+
+ for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+ average.cores.pmt_counter[i] += c->pmt_counter[i];
+ }
+
/* sum per-pkg values only for 1st core in pkg */
if (!is_cpu_first_core_in_package(t, c, p))
return 0;
@@ -3027,6 +3596,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
average.packages.pc8 += p->pc8;
average.packages.pc9 += p->pc9;
average.packages.pc10 += p->pc10;
+ average.packages.die_c6 += p->die_c6;
average.packages.cpu_lpi = p->cpu_lpi;
average.packages.sys_lpi = p->sys_lpi;
@@ -3055,6 +3625,18 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
else
average.packages.counter[i] += p->counter[i];
}
+
+ for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+ if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0))
+ average.packages.perf_counter[i] = p->perf_counter[i];
+ else
+ average.packages.perf_counter[i] += p->perf_counter[i];
+ }
+
+ for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+ average.packages.pmt_counter[i] += p->pmt_counter[i];
+ }
+
return 0;
}
@@ -3066,6 +3648,8 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
{
int i;
struct msr_counter *mp;
+ struct perf_counter_info *pp;
+ struct pmt_counter *ppmt;
clear_counters(&average.threads, &average.cores, &average.packages);
@@ -3108,6 +3692,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
average.packages.pc8 /= topo.allowed_packages;
average.packages.pc9 /= topo.allowed_packages;
average.packages.pc10 /= topo.allowed_packages;
+ average.packages.die_c6 /= topo.allowed_packages;
for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
if (mp->format == FORMAT_RAW)
@@ -3137,6 +3722,45 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
}
average.packages.counter[i] /= topo.allowed_packages;
}
+
+ for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ continue;
+ if (pp->type == COUNTER_ITEMS) {
+ if (average.threads.perf_counter[i] > 9999999)
+ sums_need_wide_columns = 1;
+ continue;
+ }
+ average.threads.perf_counter[i] /= topo.allowed_cpus;
+ }
+ for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ continue;
+ if (pp->type == COUNTER_ITEMS) {
+ if (average.cores.perf_counter[i] > 9999999)
+ sums_need_wide_columns = 1;
+ }
+ average.cores.perf_counter[i] /= topo.allowed_cores;
+ }
+ for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+ if (pp->format == FORMAT_RAW)
+ continue;
+ if (pp->type == COUNTER_ITEMS) {
+ if (average.packages.perf_counter[i] > 9999999)
+ sums_need_wide_columns = 1;
+ }
+ average.packages.perf_counter[i] /= topo.allowed_packages;
+ }
+
+ for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+ average.threads.pmt_counter[i] /= topo.allowed_cpus;
+ }
+ for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+ average.cores.pmt_counter[i] /= topo.allowed_cores;
+ }
+ for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+ average.packages.pmt_counter[i] /= topo.allowed_packages;
+ }
}
static unsigned long long rdtsc(void)
@@ -3382,30 +4006,6 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char
return v;
}
-static unsigned int read_msr_type(void)
-{
- const char *const path = "/sys/bus/event_source/devices/msr/type";
- const char *const format = "%u";
-
- return read_perf_counter_info_n(path, format);
-}
-
-static unsigned int read_aperf_config(void)
-{
- const char *const path = "/sys/bus/event_source/devices/msr/events/aperf";
- const char *const format = "event=%x";
-
- return read_perf_counter_info_n(path, format);
-}
-
-static unsigned int read_mperf_config(void)
-{
- const char *const path = "/sys/bus/event_source/devices/msr/events/mperf";
- const char *const format = "event=%x";
-
- return read_perf_counter_info_n(path, format);
-}
-
static unsigned int read_perf_type(const char *subsys)
{
const char *const path_format = "/sys/bus/event_source/devices/%s/type";
@@ -3417,15 +4017,55 @@ static unsigned int read_perf_type(const char *subsys)
return read_perf_counter_info_n(path, format);
}
-static unsigned int read_rapl_config(const char *subsys, const char *event_name)
+/*
+ * Read the perf event description found at
+ * /sys/bus/event_source/devices/<subsys>/events/<event_name> and turn it
+ * into a config value.
+ *
+ * The first line of the file is scanned as a comma-separated list of terms;
+ * "event=<hex>" and "umask=<hex>" are recognized, other terms are ignored.
+ *
+ * Returns (umask << 8) | event, with umask taken as 0 when the file has no
+ * umask term.  Returns (unsigned int)-1 when the file cannot be opened, the
+ * line cannot be read, or no "event=" term is present.
+ */
+static unsigned int read_perf_config(const char *subsys, const char *event_name)
{
const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s";
- const char *const format = "event=%x";
+ FILE *fconfig = NULL;
char path[128];
+ char config_str[64];
+ unsigned int config;
+ unsigned int umask;
+ bool has_config = false;
+ bool has_umask = false;
+ unsigned int ret = -1;
snprintf(path, sizeof(path), path_format, subsys, event_name);
- return read_perf_counter_info_n(path, format);
+ fconfig = fopen(path, "r");
+ if (!fconfig)
+ return -1;
+
+ /* Only the first line (at most sizeof(config_str) - 1 characters) is parsed. */
+ if (fgets(config_str, ARRAY_SIZE(config_str), fconfig) != config_str)
+ goto cleanup_and_exit;
+
+ /*
+ * pconfig_str always points at the start of the current term; the loop
+ * ends once strchr() finds no further ',' and leaves it NULL.
+ */
+ for (char *pconfig_str = &config_str[0]; pconfig_str;) {
+ if (sscanf(pconfig_str, "event=%x", &config) == 1) {
+ has_config = true;
+ goto next;
+ }
+
+ if (sscanf(pconfig_str, "umask=%x", &umask) == 1) {
+ has_umask = true;
+ goto next;
+ }
+
+next:
+ /* Cut the line at the next ',' and step to the term that follows it. */
+ pconfig_str = strchr(pconfig_str, ',');
+ if (pconfig_str) {
+ *pconfig_str = '\0';
+ ++pconfig_str;
+ }
+ }
+
+ if (!has_umask)
+ umask = 0;
+
+ /* ret keeps its initial -1 when no "event=" term was seen. */
+ if (has_config)
+ ret = (umask << 8) | config;
+
+cleanup_and_exit:
+ fclose(fconfig);
+ return ret;
}
static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name)
@@ -3444,7 +4084,7 @@ static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_na
return RAPL_UNIT_INVALID;
}
-static double read_perf_rapl_scale(const char *subsys, const char *event_name)
+static double read_perf_scale(const char *subsys, const char *event_name)
{
const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale";
const char *const format = "%lf";
@@ -3459,130 +4099,12 @@ static double read_perf_rapl_scale(const char *subsys, const char *event_name)
return scale;
}
-static struct amperf_group_fd open_amperf_fd(int cpu)
-{
- const unsigned int msr_type = read_msr_type();
- const unsigned int aperf_config = read_aperf_config();
- const unsigned int mperf_config = read_mperf_config();
- struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 };
-
- fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP);
- fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP);
-
- return fds;
-}
-
-static int get_amperf_fd(int cpu)
-{
- assert(fd_amperf_percpu);
-
- if (fd_amperf_percpu[cpu].aperf)
- return fd_amperf_percpu[cpu].aperf;
-
- fd_amperf_percpu[cpu] = open_amperf_fd(cpu);
-
- return fd_amperf_percpu[cpu].aperf;
-}
-
-/* Read APERF, MPERF and TSC using the perf API. */
-static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu)
-{
- union {
- struct {
- unsigned long nr_entries;
- unsigned long aperf;
- unsigned long mperf;
- };
-
- unsigned long as_array[3];
- } cnt;
-
- const int fd_amperf = get_amperf_fd(cpu);
-
- /*
- * Read the TSC with rdtsc, because we want the absolute value and not
- * the offset from the start of the counter.
- */
- t->tsc = rdtsc();
-
- const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array));
-
- if (n != sizeof(cnt.as_array))
- return -2;
-
- t->aperf = cnt.aperf * aperf_mperf_multiplier;
- t->mperf = cnt.mperf * aperf_mperf_multiplier;
-
- return 0;
-}
-
-/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */
-static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu)
-{
- unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
- int aperf_mperf_retry_count = 0;
-
- /*
- * The TSC, APERF and MPERF must be read together for
- * APERF/MPERF and MPERF/TSC to give accurate results.
- *
- * Unfortunately, APERF and MPERF are read by
- * individual system call, so delays may occur
- * between them. If the time to read them
- * varies by a large amount, we re-read them.
- */
-
- /*
- * This initial dummy APERF read has been seen to
- * reduce jitter in the subsequent reads.
- */
-
- if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
- return -3;
-
-retry:
- t->tsc = rdtsc(); /* re-read close to APERF */
-
- tsc_before = t->tsc;
-
- if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
- return -3;
-
- tsc_between = rdtsc();
-
- if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
- return -4;
-
- tsc_after = rdtsc();
-
- aperf_time = tsc_between - tsc_before;
- mperf_time = tsc_after - tsc_between;
-
- /*
- * If the system call latency to read APERF and MPERF
- * differ by more than 2x, then try again.
- */
- if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
- aperf_mperf_retry_count++;
- if (aperf_mperf_retry_count < 5)
- goto retry;
- else
- warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
- }
- aperf_mperf_retry_count = 0;
-
- t->aperf = t->aperf * aperf_mperf_multiplier;
- t->mperf = t->mperf * aperf_mperf_multiplier;
-
- return 0;
-}
-
size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
{
size_t ret = 0;
for (int i = 0; i < NUM_RAPL_COUNTERS; ++i)
- if (rci->source[i] == RAPL_SOURCE_PERF)
+ if (rci->source[i] == COUNTER_SOURCE_PERF)
++ret;
return ret;
@@ -3593,7 +4115,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t
size_t ret = 0;
for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i)
- if (cci->source[i] == CSTATE_SOURCE_PERF)
+ if (cci->source[i] == COUNTER_SOURCE_PERF)
++ret;
return ret;
@@ -3611,7 +4133,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
struct rapl_counter_info_t *rci;
- if (debug)
+ if (debug >= 2)
fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain);
assert(rapl_counter_info_perdomain);
@@ -3634,14 +4156,14 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) {
switch (rci->source[i]) {
- case RAPL_SOURCE_NONE:
+ case COUNTER_SOURCE_NONE:
break;
- case RAPL_SOURCE_PERF:
+ case COUNTER_SOURCE_PERF:
assert(pi < ARRAY_SIZE(perf_data));
assert(rci->fd_perf != -1);
- if (debug)
+ if (debug >= 2)
fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n",
i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]);
@@ -3650,8 +4172,8 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
++pi;
break;
- case RAPL_SOURCE_MSR:
- if (debug)
+ case COUNTER_SOURCE_MSR:
+ if (debug >= 2)
fprintf(stderr, "Reading rapl counter via msr at %u\n", i);
assert(!no_msr);
@@ -3709,15 +4231,15 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
struct cstate_counter_info_t *cci;
- if (debug)
+ if (debug >= 2)
fprintf(stderr, "%s: cpu%d\n", __func__, cpu);
assert(ccstate_counter_info);
assert(cpu <= ccstate_counter_info_size);
- memset(perf_data, 0, sizeof(perf_data));
- memset(perf_data_core, 0, sizeof(perf_data_core));
- memset(perf_data_pkg, 0, sizeof(perf_data_pkg));
+ ZERO_ARRAY(perf_data);
+ ZERO_ARRAY(perf_data_core);
+ ZERO_ARRAY(perf_data_pkg);
cci = &ccstate_counter_info[cpu];
@@ -3772,30 +4294,28 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) {
switch (cci->source[i]) {
- case CSTATE_SOURCE_NONE:
+ case COUNTER_SOURCE_NONE:
break;
- case CSTATE_SOURCE_PERF:
+ case COUNTER_SOURCE_PERF:
assert(pi < ARRAY_SIZE(perf_data));
assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1);
- if (debug) {
+ if (debug >= 2)
fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]);
- }
cci->data[i] = perf_data[pi];
++pi;
break;
- case CSTATE_SOURCE_MSR:
+ case COUNTER_SOURCE_MSR:
assert(!no_msr);
if (get_msr(cpu, cci->msr[i], &cci->data[i]))
return -13 - i;
- if (debug) {
+ if (debug >= 2)
fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]);
- }
break;
}
@@ -3809,7 +4329,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
* when invoked for the thread sibling.
*/
#define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \
- if (cci->source[index] != CSTATE_SOURCE_NONE) \
+ if (cci->source[index] != COUNTER_SOURCE_NONE) \
out_counter = cci->data[index]; \
} while (0)
@@ -3833,6 +4353,135 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
return 0;
}
+/*
+ * Count how many of the NUM_MSR_COUNTERS slots described by @mci are
+ * sourced from perf (COUNTER_SOURCE_PERF).
+ */
+size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci)
+{
+	size_t nr_perf = 0;
+	int idx = NUM_MSR_COUNTERS;
+
+	/* Visiting order does not matter for a plain tally. */
+	while (idx-- > 0)
+		nr_perf += (mci->source[idx] == COUNTER_SOURCE_PERF);
+
+	return nr_perf;
+}
+
+/*
+ * get_smi_aperf_mperf - sample the APERF, MPERF and SMI counters of @cpu.
+ *
+ * Each of the NUM_MSR_COUNTERS slots in msr_counter_info[cpu] is filled from
+ * the source recorded for it: perf counters come from a single read() of
+ * mci->fd_perf, MSR counters from get_msr() masked with mci->msr_mask[].
+ * The results are stored into t->aperf, t->mperf and t->smi_count; slots
+ * with COUNTER_SOURCE_NONE stay 0.
+ *
+ * Returns 0 on success, or -2 - i when reading the MSR behind slot i fails.
+ * A perf read of unexpected length terminates the program through err().
+ */
+int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
+{
+	unsigned long long perf_data[NUM_MSR_COUNTERS + 1];
+
+	struct msr_counter_info_t *mci;
+
+	if (debug >= 2)
+		fprintf(stderr, "%s: cpu%u\n", __func__, cpu);
+
+	assert(msr_counter_info);
+	/*
+	 * cpu indexes msr_counter_info[], so it must be strictly below the
+	 * element count ("<=" would admit one-past-the-end).
+	 * NOTE(review): assumes msr_counter_info_size is the number of
+	 * allocated entries - confirm against msr_perf_init().
+	 */
+	assert(cpu < msr_counter_info_size);
+
+	mci = &msr_counter_info[cpu];
+
+	ZERO_ARRAY(perf_data);
+	ZERO_ARRAY(mci->data);
+
+	if (mci->fd_perf != -1) {
+		const size_t num_perf_counters = msr_counter_info_count_perf(mci);
+		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
+		const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data));
+
+		/* Both sizes are ssize_t (read() may return -1): print with %zd. */
+		if (actual_read_size != expected_read_size)
+			err(-1, "%s: failed to read perf_data (%zd %zd)", __func__, expected_read_size,
+			    actual_read_size);
+	}
+
+	/*
+	 * pi starts at 1: the read above returns one extra leading word
+	 * (presumably the group entry count) ahead of the counter values.
+	 */
+	for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) {
+		switch (mci->source[i]) {
+		case COUNTER_SOURCE_NONE:
+			break;
+
+		case COUNTER_SOURCE_PERF:
+			assert(pi < ARRAY_SIZE(perf_data));
+			assert(mci->fd_perf != -1);
+
+			if (debug >= 2)
+				fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]);
+
+			mci->data[i] = perf_data[pi];
+
+			++pi;
+			break;
+
+		case COUNTER_SOURCE_MSR:
+			assert(!no_msr);
+
+			if (get_msr(cpu, mci->msr[i], &mci->data[i]))
+				return -2 - i;
+
+			mci->data[i] &= mci->msr_mask[i];
+
+			if (debug >= 2)
+				fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]);
+
+			break;
+		}
+	}
+
+	/* The three assignments below must cover every MSR counter slot. */
+	BUILD_BUG_ON(NUM_MSR_COUNTERS != 3);
+	t->aperf = mci->data[MSR_RCI_INDEX_APERF];
+	t->mperf = mci->data[MSR_RCI_INDEX_MPERF];
+	t->smi_count = mci->data[MSR_RCI_INDEX_SMI];
+
+	return 0;
+}
+
+/*
+ * Read the current value of every perf counter on the list starting at @pp,
+ * using the fd of the domain that @cpu maps to, and store value * scale into
+ * consecutive slots of @out (which holds @out_size entries).
+ *
+ * A counter whose fd for that domain is -1 is skipped; its slot in @out is
+ * left untouched but still consumes an index.
+ *
+ * Returns 0 on success, 1 as soon as a read() comes back short or fails.
+ */
+int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size)
+{
+	size_t slot = 0;
+
+	while (pp) {
+		const unsigned int dom = cpu_to_domain(pp, cpu);
+		unsigned long long raw;
+		int fd;
+
+		assert(dom < pp->num_domains);
+		fd = pp->fd_perf_per_domain[dom];
+
+		if (fd != -1) {
+			if (read(fd, &raw, sizeof(raw)) != sizeof(raw))
+				return 1;
+
+			assert(slot < out_size);
+			out[slot] = raw * pp->scale;
+		}
+
+		++slot;
+		pp = pp->next;
+	}
+
+	return 0;
+}
+
+/*
+ * pmt_gen_value_mask - build a mask with bits @lsb..@msb (inclusive) set.
+ *
+ * Callers are expected to pass lsb <= msb <= 63.  The shifts are done on a
+ * 64-bit unsigned operand: shifting the int constant 1 is undefined for
+ * bit positions >= 31 and produced wrong masks for wide fields.
+ */
+unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb)
+{
+	unsigned long long mask;
+
+	/* 1ULL << 64 would be undefined, so the full-width case is special. */
+	if (msb >= 63)
+		mask = ~0ULL;
+	else
+		mask = (1ULL << (msb + 1)) - 1;
+
+	/* Drop the bits below lsb. */
+	mask -= (1ULL << lsb) - 1;
+
+	return mask;
+}
+
+/*
+ * Fetch the PMT counter @ppmt for domain @domain_id: load the word behind
+ * the domain's pcounter pointer (0 when the pointer is NULL), keep bits
+ * lsb..msb and shift them down so the field starts at bit 0.
+ */
+unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id)
+{
+	const unsigned long *reg;
+	unsigned long raw = 0;
+
+	assert(domain_id < ppmt->num_domains);
+
+	reg = ppmt->domains[domain_id].pcounter;
+	if (reg)
+		raw = *reg;
+
+	return (raw & pmt_gen_value_mask(ppmt->lsb, ppmt->msb)) >> ppmt->lsb;
+}
+
/*
* get_counters(...)
* migrate to cpu
@@ -3843,6 +4492,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
int cpu = t->cpu_id;
unsigned long long msr;
struct msr_counter *mp;
+ struct pmt_counter *pp;
int i;
int status;
@@ -3858,24 +4508,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
t->tsc = rdtsc(); /* we are running on local CPU of interest */
- if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
- || soft_c1_residency_display(BIC_Avg_MHz)) {
- int status = -1;
-
- assert(!no_perf || !no_msr);
-
- switch (amperf_source) {
- case AMPERF_SOURCE_PERF:
- status = read_aperf_mperf_tsc_perf(t, cpu);
- break;
- case AMPERF_SOURCE_MSR:
- status = read_aperf_mperf_tsc_msr(t, cpu);
- break;
- }
-
- if (status != 0)
- return status;
- }
+ get_smi_aperf_mperf(cpu, t);
if (DO_BIC(BIC_IPC))
if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
@@ -3883,11 +4516,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
if (DO_BIC(BIC_IRQ))
t->irq_count = irqs_per_cpu[cpu];
- if (DO_BIC(BIC_SMI)) {
- if (get_msr(cpu, MSR_SMI_COUNT, &msr))
- return -5;
- t->smi_count = msr & 0xFFFFFFFF;
- }
get_cstate_counters(cpu, t, c, p);
@@ -3896,6 +4524,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
return -10;
}
+ if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS))
+ return -10;
+
+ for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next)
+ t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id);
+
/* collect core counters only for 1st thread in core */
if (!is_cpu_first_thread_in_core(t, c, p))
goto done;
@@ -3934,6 +4568,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
return -10;
}
+ if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS))
+ return -10;
+
+ for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next)
+ c->pmt_counter[i] = pmt_read_counter(pp, c->core_id);
+
/* collect package counters only for 1st core in package */
if (!is_cpu_first_core_in_package(t, c, p))
goto done;
@@ -4006,6 +4646,13 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
if (get_mp(cpu, mp, &p->counter[i], path))
return -10;
}
+
+ if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS))
+ return -10;
+
+ for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next)
+ p->pmt_counter[i] = pmt_read_counter(pp, p->package_id);
+
done:
gettimeofday(&t->tv_end, (struct timezone *)NULL);
@@ -4469,25 +5116,6 @@ void free_fd_percpu(void)
fd_percpu = NULL;
}
-void free_fd_amperf_percpu(void)
-{
- int i;
-
- if (!fd_amperf_percpu)
- return;
-
- for (i = 0; i < topo.max_cpu_num + 1; ++i) {
- if (fd_amperf_percpu[i].mperf != 0)
- close(fd_amperf_percpu[i].mperf);
-
- if (fd_amperf_percpu[i].aperf != 0)
- close(fd_amperf_percpu[i].aperf);
- }
-
- free(fd_amperf_percpu);
- fd_amperf_percpu = NULL;
-}
-
void free_fd_instr_count_percpu(void)
{
if (!fd_instr_count_percpu)
@@ -4522,6 +5150,21 @@ void free_fd_cstate(void)
ccstate_counter_info_size = 0;
}
+/*
+ * Close every per-CPU perf group descriptor stored in msr_counter_info and
+ * release the array.
+ *
+ * The walk covers msr_counter_info_size entries, i.e. the number of entries
+ * recorded when the array was allocated (topo.max_cpu_num + 1), so the
+ * descriptor of the highest-numbered CPU is closed as well.
+ */
+void free_fd_msr(void)
+{
+	if (!msr_counter_info)
+		return;
+
+	for (int cpu = 0; cpu < (int)msr_counter_info_size; ++cpu) {
+		/* -1 marks a CPU for which no perf group was opened. */
+		if (msr_counter_info[cpu].fd_perf != -1)
+			close(msr_counter_info[cpu].fd_perf);
+	}
+
+	free(msr_counter_info);
+	msr_counter_info = NULL;
+	msr_counter_info_size = 0;
+}
+
void free_fd_rapl_percpu(void)
{
if (!rapl_counter_info_perdomain)
@@ -4539,6 +5182,36 @@ void free_fd_rapl_percpu(void)
rapl_counter_info_perdomain_size = 0;
}
+/*
+ * Walk the list starting at pp, close every open per-domain perf descriptor
+ * and free each node's descriptor array.  The nodes themselves stay on the
+ * list with fd_perf_per_domain reset to NULL.
+ *
+ * Nodes without a descriptor array are skipped one by one, so a single
+ * uninitialized node neither hides the rest of the list nor gets indexed
+ * through a NULL pointer.
+ */
+void free_fd_added_perf_counters_(struct perf_counter_info *pp)
+{
+	for (; pp; pp = pp->next) {
+		if (!pp->fd_perf_per_domain)
+			continue;
+
+		for (size_t domain = 0; domain < pp->num_domains; ++domain) {
+			if (pp->fd_perf_per_domain[domain] != -1) {
+				close(pp->fd_perf_per_domain[domain]);
+				pp->fd_perf_per_domain[domain] = -1;
+			}
+		}
+
+		free(pp->fd_perf_per_domain);
+		pp->fd_perf_per_domain = NULL;
+	}
+}
+
+/* Release the perf descriptors of the added thread, core and package counters. */
+void free_fd_added_perf_counters(void)
+{
+	struct perf_counter_info *const lists[] = { sys.perf_tp, sys.perf_cp, sys.perf_pp };
+
+	for (size_t idx = 0; idx < ARRAY_SIZE(lists); ++idx)
+		free_fd_added_perf_counters_(lists[idx]);
+}
+
void free_all_buffers(void)
{
int i;
@@ -4581,9 +5254,10 @@ void free_all_buffers(void)
free_fd_percpu();
free_fd_instr_count_percpu();
- free_fd_amperf_percpu();
+ free_fd_msr();
free_fd_rapl_percpu();
free_fd_cstate();
+ free_fd_added_perf_counters();
free(irq_column_2_cpu);
free(irqs_per_cpu);
@@ -4918,16 +5592,22 @@ static void update_effective_set(bool startup)
}
void linux_perf_init(void);
+void msr_perf_init(void);
void rapl_perf_init(void);
void cstate_perf_init(void);
+void added_perf_counters_init(void);
+void pmt_init(void);
void re_initialize(void)
{
free_all_buffers();
setup_all_buffers(false);
linux_perf_init();
+ msr_perf_init();
rapl_perf_init();
cstate_perf_init();
+ added_perf_counters_init();
+ pmt_init();
fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
topo.allowed_cpus);
}
@@ -6779,22 +7459,13 @@ static int has_instr_count_access(void)
return has_access;
}
-bool is_aperf_access_required(void)
-{
- return BIC_IS_ENABLED(BIC_Avg_MHz)
- || BIC_IS_ENABLED(BIC_Busy)
- || BIC_IS_ENABLED(BIC_Bzy_MHz)
- || BIC_IS_ENABLED(BIC_IPC)
- || BIC_IS_ENABLED(BIC_CPU_c1);
-}
-
int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
double *scale_, enum rapl_unit *unit_)
{
if (no_perf)
return -1;
- const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name);
+ const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name);
if (scale == 0.0)
return -1;
@@ -6805,7 +7476,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc
return -1;
const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
- const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name);
+ const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name);
const int fd_counter =
open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
@@ -6826,7 +7497,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct
{
int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
- if (debug)
+ if (debug >= 2)
fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
return ret;
@@ -6846,14 +7517,6 @@ void linux_perf_init(void)
if (fd_instr_count_percpu == NULL)
err(-1, "calloc fd_instr_count_percpu");
}
-
- const bool aperf_required = is_aperf_access_required();
-
- if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) {
- fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu));
- if (fd_amperf_percpu == NULL)
- err(-1, "calloc fd_amperf_percpu");
- }
}
void rapl_perf_init(void)
@@ -6875,7 +7538,7 @@ void rapl_perf_init(void)
rci->fd_perf = -1;
for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) {
rci->data[i] = 0;
- rci->source[i] = RAPL_SOURCE_NONE;
+ rci->source[i] = COUNTER_SOURCE_NONE;
}
}
@@ -6917,14 +7580,14 @@ void rapl_perf_init(void)
/* Use perf API for this counter */
if (!no_perf && cai->perf_name
&& add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
- rci->source[cai->rci_index] = RAPL_SOURCE_PERF;
+ rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
rci->scale[cai->rci_index] = scale * cai->compat_scale;
rci->unit[cai->rci_index] = unit;
rci->flags[cai->rci_index] = cai->flags;
/* Use MSR for this counter */
} else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
- rci->source[cai->rci_index] = RAPL_SOURCE_MSR;
+ rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
rci->msr[cai->rci_index] = cai->msr;
rci->msr_mask[cai->rci_index] = cai->msr_mask;
rci->msr_shift[cai->rci_index] = cai->msr_shift;
@@ -6934,7 +7597,7 @@ void rapl_perf_init(void)
}
}
- if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE)
+ if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
has_counter = 1;
}
@@ -6946,75 +7609,11 @@ void rapl_perf_init(void)
free(domain_visited);
}
-static int has_amperf_access_via_msr(void)
-{
- if (no_msr)
- return 0;
-
- if (probe_msr(base_cpu, MSR_IA32_APERF))
- return 0;
-
- if (probe_msr(base_cpu, MSR_IA32_MPERF))
- return 0;
-
- return 1;
-}
-
-static int has_amperf_access_via_perf(void)
-{
- struct amperf_group_fd fds;
-
- /*
- * Cache the last result, so we don't warn the user multiple times
- *
- * Negative means cached, no access
- * Zero means not cached
- * Positive means cached, has access
- */
- static int has_access_cached;
-
- if (no_perf)
- return 0;
-
- if (has_access_cached != 0)
- return has_access_cached > 0;
-
- fds = open_amperf_fd(base_cpu);
- has_access_cached = (fds.aperf != -1) && (fds.mperf != -1);
-
- if (fds.aperf == -1)
- warnx("Failed to access %s. Some of the counters may not be available\n"
- "\tRun as root to enable them or use %s to disable the access explicitly",
- "APERF perf counter", "--no-perf");
- else
- close(fds.aperf);
-
- if (fds.mperf == -1)
- warnx("Failed to access %s. Some of the counters may not be available\n"
- "\tRun as root to enable them or use %s to disable the access explicitly",
- "MPERF perf counter", "--no-perf");
- else
- close(fds.mperf);
-
- if (has_access_cached == 0)
- has_access_cached = -1;
-
- return has_access_cached > 0;
-}
-
-/* Check if we can access APERF and MPERF */
+/* Assumes msr_counter_info is populated */
static int has_amperf_access(void)
{
- if (!is_aperf_access_required())
- return 0;
-
- if (!no_msr && has_amperf_access_via_msr())
- return 1;
-
- if (!no_perf && has_amperf_access_via_perf())
- return 1;
-
- return 0;
+ return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present &&
+ msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present;
}
int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name)
@@ -7039,7 +7638,7 @@ int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const s
return -1;
const unsigned int type = read_perf_type(cai->perf_subsys);
- const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name);
+ const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
@@ -7057,12 +7656,120 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st
{
int ret = add_cstate_perf_counter_(cpu, cci, cai);
- if (debug)
+ if (debug >= 2)
fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
return ret;
}
+/*
+ * Open the perf event described by cai on the given cpu, passing the
+ * descriptor tracked in cci->fd_perf as the group to join.
+ *
+ * Returns the new descriptor, or -1 when perf is disabled (no_perf) or
+ * open_perf_counter() returned -1.
+ */
+int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
+{
+	unsigned int perf_type, perf_config;
+	int fd;
+
+	if (no_perf)
+		return -1;
+
+	perf_type = read_perf_type(cai->perf_subsys);
+	perf_config = read_perf_config(cai->perf_subsys, cai->perf_name);
+
+	fd = open_perf_counter(cpu, perf_type, perf_config, cci->fd_perf, PERF_FORMAT_GROUP);
+
+	/* The first descriptor opened for this entry becomes the group descriptor. */
+	if (fd != -1 && cci->fd_perf == -1)
+		cci->fd_perf = fd;
+
+	return fd;
+}
+
+/* Call add_msr_perf_counter_() and, when debug is set, log its result to stderr. */
+int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
+{
+	const int fd = add_msr_perf_counter_(cpu, cci, cai);
+
+	if (!debug)
+		return fd;
+
+	fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, fd, cpu);
+
+	return fd;
+}
+
+/*
+ * Allocate msr_counter_info (topo.max_cpu_num + 1 entries) and, for every
+ * counter in msr_counter_arch_infos that is marked as needed, pick a source
+ * on each allowed CPU: perf first, a directly probed MSR otherwise.
+ *
+ * A counter's `present` flag ends up true if any allowed CPU found a source
+ * for it.
+ */
+void msr_perf_init_(void)
+{
+	const int entries = topo.max_cpu_num + 1;
+	int cpu, idx;
+
+	msr_counter_info = calloc(entries, sizeof(*msr_counter_info));
+	if (!msr_counter_info)
+		err(1, "calloc msr_counter_info");
+	msr_counter_info_size = entries;
+
+	/* -1 means that no perf group has been opened for the CPU yet. */
+	for (cpu = 0; cpu < entries; ++cpu)
+		msr_counter_info[cpu].fd_perf = -1;
+
+	for (idx = 0; idx < NUM_MSR_COUNTERS; ++idx) {
+		struct msr_counter_arch_info *cai = &msr_counter_arch_infos[idx];
+
+		cai->present = false;
+
+		for (cpu = 0; cpu < entries; ++cpu) {
+			struct msr_counter_info_t *const cci = &msr_counter_info[cpu];
+
+			if (cpu_is_not_allowed(cpu) || !cai->needed)
+				continue;
+
+			if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) {
+				/* Use perf API for this counter */
+				cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
+				cai->present = true;
+			} else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
+				/* Use MSR for this counter */
+				cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
+				cci->msr[cai->rci_index] = cai->msr;
+				cci->msr_mask[cai->rci_index] = cai->msr_mask;
+				cai->present = true;
+			}
+		}
+	}
+}
+
+/*
+ * Initialize data for reading perf counters from the MSR group.
+ *
+ * Marks APERF/MPERF and the SMI counter as needed according to the enabled
+ * columns (and the software CC1 fallback), lets msr_perf_init_() find a
+ * source for each of them, and then flags the dependent columns as present.
+ *
+ * BIC_SMI is flagged only when the SMI counter itself was found; it is not
+ * tied to APERF/MPERF availability.
+ */
+void msr_perf_init(void)
+{
+	bool need_amperf = false, need_smi = false;
+	const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1);
+
+	need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz)
+	    || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1;
+
+	if (BIC_IS_ENABLED(BIC_SMI))
+		need_smi = true;
+
+	/* Enable needed counters */
+	msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf;
+	msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf;
+	msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi;
+
+	msr_perf_init_();
+
+	const bool has_amperf = has_amperf_access();
+	const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present;
+
+	has_aperf_access = has_amperf;
+
+	if (has_amperf) {
+		BIC_PRESENT(BIC_Avg_MHz);
+		BIC_PRESENT(BIC_Busy);
+		BIC_PRESENT(BIC_Bzy_MHz);
+	}
+
+	if (has_smi)
+		BIC_PRESENT(BIC_SMI);
+}
+
void cstate_perf_init_(bool soft_c1)
{
bool has_counter;
@@ -7127,17 +7834,17 @@ void cstate_perf_init_(bool soft_c1)
/* Use perf API for this counter */
if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) {
- cci->source[cai->rci_index] = CSTATE_SOURCE_PERF;
+ cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
/* User MSR for this counter */
} else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit
&& probe_msr(cpu, cai->msr) == 0) {
- cci->source[cai->rci_index] = CSTATE_SOURCE_MSR;
+ cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
cci->msr[cai->rci_index] = cai->msr;
}
}
- if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) {
+ if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) {
has_counter = true;
cores_visited[core_id] = true;
pkg_visited[pkg_id] = true;
@@ -7320,12 +8027,6 @@ void process_cpuid()
__cpuid(0x6, eax, ebx, ecx, edx);
has_aperf = ecx & (1 << 0);
- if (has_aperf && has_amperf_access()) {
- BIC_PRESENT(BIC_Avg_MHz);
- BIC_PRESENT(BIC_Busy);
- BIC_PRESENT(BIC_Bzy_MHz);
- BIC_PRESENT(BIC_IPC);
- }
do_dts = eax & (1 << 0);
if (do_dts)
BIC_PRESENT(BIC_CoreTmp);
@@ -7442,6 +8143,11 @@ static void counter_info_init(void)
if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY)
cai->msr = MSR_ATOM_PKG_C6_RESIDENCY;
}
+
+ for (int i = 0; i < NUM_MSR_COUNTERS; ++i) {
+ msr_counter_arch_infos[i].present = false;
+ msr_counter_arch_infos[i].needed = false;
+ }
}
void probe_pm_features(void)
@@ -7817,100 +8523,446 @@ void set_base_cpu(void)
err(-ENODEV, "No valid cpus found");
}
-static void set_amperf_source(void)
+bool has_added_counters(void)
{
- amperf_source = AMPERF_SOURCE_PERF;
+ /*
+ * It only makes sense to call this after the command line is parsed,
+ * otherwise sys structure is not populated.
+ */
- const bool aperf_required = is_aperf_access_required();
+ return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
+}
- if (no_perf || !aperf_required || !has_amperf_access_via_perf())
- amperf_source = AMPERF_SOURCE_MSR;
+void check_msr_access(void)
+{
+ check_dev_msr();
+ check_msr_permission();
- if (quiet || !debug)
- return;
+ if (no_msr)
+ bic_disable_msr_access();
+}
- fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf");
+void check_perf_access(void)
+{
+ if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access())
+ bic_enabled &= ~BIC_IPC;
}
-bool has_added_counters(void)
+/*
+ * For every counter on the list starting at pinfo, allocate one descriptor
+ * slot per domain of the counter's scope and open the perf event once per
+ * domain, using the first allowed CPU of that domain on which the open
+ * succeeds.
+ *
+ * Failures to read the perf type/config or to open the event on a CPU are
+ * reported with warnx() and that CPU is skipped.  Allocation failures are
+ * fatal.  Always returns 0.
+ */
+int added_perf_counters_init_(struct perf_counter_info *pinfo)
+{
+	size_t num_domains = 0;
+	unsigned int next_domain;
+	bool *domain_visited;
+	unsigned int perf_type, perf_config;
+	double perf_scale;
+	int fd_perf;
+
+	if (!pinfo)
+		return 0;
+
+	const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1));
+
+	/* Scratch map sized for the largest scope; it is cleared for each counter. */
+	domain_visited = calloc(max_num_domains, sizeof(*domain_visited));
+	if (!domain_visited)
+		errx(1, "%s: alloc %s", __func__, "domain_visited");
+
+	while (pinfo) {
+		switch (pinfo->scope) {
+		case SCOPE_CPU:
+			num_domains = topo.max_cpu_num + 1;
+			break;
+
+		case SCOPE_CORE:
+			num_domains = topo.max_core_id + 1;
+			break;
+
+		case SCOPE_PACKAGE:
+			num_domains = topo.max_package_id + 1;
+			break;
+		}
+
+		/* Allocate buffer for file descriptor for each domain. */
+		pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain));
+		if (!pinfo->fd_perf_per_domain)
+			errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain");
+
+		for (size_t i = 0; i < num_domains; ++i)
+			pinfo->fd_perf_per_domain[i] = -1;
+
+		pinfo->num_domains = num_domains;
+		pinfo->scale = 1.0;
+
+		memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited));
+
+		for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
+
+			next_domain = cpu_to_domain(pinfo, cpu);
+
+			assert(next_domain < num_domains);
+
+			if (cpu_is_not_allowed(cpu))
+				continue;
+
+			/* One descriptor per domain is enough. */
+			if (domain_visited[next_domain])
+				continue;
+
+			perf_type = read_perf_type(pinfo->device);
+			if (perf_type == (unsigned int)-1) {
+				warnx("%s: perf/%s/%s: failed to read %s",
+				      __func__, pinfo->device, pinfo->event, "type");
+				continue;
+			}
+
+			perf_config = read_perf_config(pinfo->device, pinfo->event);
+			if (perf_config == (unsigned int)-1) {
+				warnx("%s: perf/%s/%s: failed to read %s",
+				      __func__, pinfo->device, pinfo->event, "config");
+				continue;
+			}
+
+			/* Scale is not required, some counters just don't have it. */
+			perf_scale = read_perf_scale(pinfo->device, pinfo->event);
+			if (perf_scale == 0.0)
+				perf_scale = 1.0;
+
+			fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0);
+			if (fd_perf == -1) {
+				warnx("%s: perf/%s/%s: failed to open counter on cpu%d",
+				      __func__, pinfo->device, pinfo->event, cpu);
+				continue;
+			}
+
+			domain_visited[next_domain] = 1;
+			pinfo->fd_perf_per_domain[next_domain] = fd_perf;
+			pinfo->scale = perf_scale;
+
+			if (debug)
+				fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n",
+					pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]);
+		}
+
+		pinfo = pinfo->next;
+	}
+
+	free(domain_visited);
+
+	return 0;
+}
+
+/*
+ * Open the perf events behind the added thread, core and package counters,
+ * in that order.  A non-zero result from the helper is fatal.
+ */
+void added_perf_counters_init(void)
+{
+	const struct {
+		struct perf_counter_info *head;
+		const char *label;
+	} lists[] = {
+		{ sys.perf_tp, "thread" },
+		{ sys.perf_cp, "core" },
+		{ sys.perf_pp, "package" },
+	};
+
+	for (size_t idx = 0; idx < ARRAY_SIZE(lists); ++idx) {
+		if (added_perf_counters_init_(lists[idx].head))
+			errx(1, "%s: %s", __func__, lists[idx].label);
+	}
+}
+
+/*
+ * Read one unsigned long from the file info_filename, looked up relative to
+ * the directory descriptor fd_dir, using the scanf-style conversion `format`.
+ *
+ * Returns 0 and stores the value in *output on success.  Returns -1 and
+ * leaves *output untouched when the file cannot be opened or the conversion
+ * does not match.
+ */
+int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output)
+{
+	unsigned long parsed;
+	FILE *fp;
+	int fd;
+	int rc = -1;
+
+	fd = openat(fd_dir, info_filename, O_RDONLY);
+	if (fd == -1)
+		return -1;
+
+	fp = fdopen(fd, "r");
+	if (fp == NULL) {
+		close(fd);
+		return -1;
+	}
+
+	if (fscanf(fp, format, &parsed) == 1) {
+		*output = parsed;
+		rc = 0;
+	}
+
+	/* fclose() also closes the descriptor handed to fdopen(). */
+	fclose(fp);
+
+	return rc;
+}
+
+/*
+ * Scan SYSFS_TELEM_PATH for a telem<N> directory whose "guid" file matches
+ * target_guid and map that directory's "telem" file read-only.
+ *
+ * On success a new struct pmt_mmio describing the mapping is pushed onto
+ * pmt_mmios and returned.  Returns NULL when the sysfs path is missing, no
+ * entry matches, or the matching entry cannot be opened or mapped.  The scan
+ * stops at the first matching entry and also at the first entry whose
+ * metadata cannot be read.
+ */
+struct pmt_mmio *pmt_mmio_open(unsigned int target_guid)
{
+	DIR *dirp;
+	struct dirent *entry;
+	struct stat st;
+	unsigned int telem_idx;
+	int fd_telem_dir, fd_pmt;
+	unsigned long guid, size, offset;
+	size_t mmap_size;
+	void *mmio;
+	struct pmt_mmio *ret = NULL;
+
+	if (stat(SYSFS_TELEM_PATH, &st) == -1)
+		return NULL;
+
+	dirp = opendir(SYSFS_TELEM_PATH);
+	if (dirp == NULL)
+		return NULL;
+
+	for (;;) {
+		entry = readdir(dirp);
+
+		if (entry == NULL)
+			break;
+
+		if (strcmp(entry->d_name, ".") == 0)
+			continue;
+
+		if (strcmp(entry->d_name, "..") == 0)
+			continue;
+
+		if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1)
+			continue;
+
+		if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) {
+			break;
+		}
+
+		if (!S_ISDIR(st.st_mode))
+			continue;
+
+		fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY);
+		if (fd_telem_dir == -1) {
+			break;
+		}
+
+		if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) {
+			close(fd_telem_dir);
+			break;
+		}
+
+		if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) {
+			close(fd_telem_dir);
+			break;
+		}
+
+		if (guid != target_guid) {
+			close(fd_telem_dir);
+			continue;
+		}
+
+		if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) {
+			close(fd_telem_dir);
+			break;
+		}
+
+		assert(offset == 0);
+
+		fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY);
+		if (fd_pmt == -1) {
+			/* Nothing to map; only the directory descriptor is open. */
+			close(fd_telem_dir);
+			break;
+		}
+
+		/* Round the mapping length up to the next multiple of 0x1000. */
+		mmap_size = (size + 0x1000UL - 1UL) & ~(0x1000UL - 1UL);
+		mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0);
+		if (mmio != MAP_FAILED) {
+
+			if (debug)
+				fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio);
+
+			ret = calloc(1, sizeof(*ret));
+
+			if (!ret) {
+				fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__);
+				exit(1);
+			}
+
+			ret->guid = guid;
+			ret->mmio_base = mmio;
+			ret->pmt_offset = offset;
+			ret->size = size;
+
+			ret->next = pmt_mmios;
+			pmt_mmios = ret;
+		}
+
+		/* The first matching entry ends the scan, mapped or not. */
+		close(fd_pmt);
+		close(fd_telem_dir);
+		break;
+	}
+
+	closedir(dirp);
+
+	return ret;
+}
+
+/* Return the entry of pmt_mmios whose guid matches, or NULL if there is none. */
+struct pmt_mmio *pmt_mmio_find(unsigned int guid)
+{
+	struct pmt_mmio *cur;
+
+	for (cur = pmt_mmios; cur; cur = cur->next) {
+		if (cur->guid == guid)
+			break;
+	}
+
+	return cur;
+}
+
+void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset)
+{
+ char *ret;
+
+ /* Get base of mmaped PMT file. */
+ ret = (char *)pmmio->mmio_base;
+
/*
- * It only makes sense to call this after the command line is parsed,
- * otherwise sys structure is not populated.
+ * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data.
+ * It's not guaranteed that the mmaped memory begins with the telemetry data
+ * - we might have to apply the offset first.
*/
+ ret += pmmio->pmt_offset;
- return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
+ /* Apply the counter offset to get the address to the mmaped counter. */
+ ret += counter_offset;
+
+ return ret;
}
-bool is_msr_access_required(void)
+struct pmt_mmio *pmt_add_guid(unsigned int guid)
{
- if (no_msr)
- return false;
-
- if (has_added_counters())
- return true;
-
- return BIC_IS_ENABLED(BIC_SMI)
- || BIC_IS_ENABLED(BIC_CPU_c1)
- || BIC_IS_ENABLED(BIC_CPU_c3)
- || BIC_IS_ENABLED(BIC_CPU_c6)
- || BIC_IS_ENABLED(BIC_CPU_c7)
- || BIC_IS_ENABLED(BIC_Mod_c6)
- || BIC_IS_ENABLED(BIC_CoreTmp)
- || BIC_IS_ENABLED(BIC_Totl_c0)
- || BIC_IS_ENABLED(BIC_Any_c0)
- || BIC_IS_ENABLED(BIC_GFX_c0)
- || BIC_IS_ENABLED(BIC_CPUGFX)
- || BIC_IS_ENABLED(BIC_Pkgpc3)
- || BIC_IS_ENABLED(BIC_Pkgpc6)
- || BIC_IS_ENABLED(BIC_Pkgpc2)
- || BIC_IS_ENABLED(BIC_Pkgpc7)
- || BIC_IS_ENABLED(BIC_Pkgpc8)
- || BIC_IS_ENABLED(BIC_Pkgpc9)
- || BIC_IS_ENABLED(BIC_Pkgpc10)
- /* TODO: Multiplex access with perf */
- || BIC_IS_ENABLED(BIC_CorWatt)
- || BIC_IS_ENABLED(BIC_Cor_J)
- || BIC_IS_ENABLED(BIC_PkgWatt)
- || BIC_IS_ENABLED(BIC_CorWatt)
- || BIC_IS_ENABLED(BIC_GFXWatt)
- || BIC_IS_ENABLED(BIC_RAMWatt)
- || BIC_IS_ENABLED(BIC_Pkg_J)
- || BIC_IS_ENABLED(BIC_Cor_J)
- || BIC_IS_ENABLED(BIC_GFX_J)
- || BIC_IS_ENABLED(BIC_RAM_J)
- || BIC_IS_ENABLED(BIC_PKG__)
- || BIC_IS_ENABLED(BIC_RAM__)
- || BIC_IS_ENABLED(BIC_PkgTmp)
- || (is_aperf_access_required() && !has_amperf_access_via_perf());
+ struct pmt_mmio *ret;
+
+ ret = pmt_mmio_find(guid);
+ if (!ret)
+ ret = pmt_mmio_open(guid);
+
+ return ret;
}
-void check_msr_access(void)
+enum pmt_open_mode {
+ PMT_OPEN_TRY, /* Open failure is not an error. */
+ PMT_OPEN_REQUIRED, /* Open failure is a fatal error. */
+};
+
+struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name)
{
- if (!is_msr_access_required())
- no_msr = 1;
+ while (pcounter) {
+ if (strcmp(pcounter->name, name) == 0)
+ break;
- check_dev_msr();
- check_msr_permission();
+ pcounter = pcounter->next;
+ }
- if (no_msr)
- bic_disable_msr_access();
+ return pcounter;
}
-void check_perf_access(void)
+struct pmt_counter **pmt_get_scope_root(enum counter_scope scope)
{
- const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC);
+ switch (scope) {
+ case SCOPE_CPU:
+ return &sys.pmt_tp;
+ case SCOPE_CORE:
+ return &sys.pmt_cp;
+ case SCOPE_PACKAGE:
+ return &sys.pmt_pp;
+ }
- if (no_perf || !intrcount_required || !has_instr_count_access())
- bic_enabled &= ~BIC_IPC;
+ __builtin_unreachable();
+}
- const bool aperf_required = is_aperf_access_required();
+void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id)
+{
+ /* Make sure the new domain fits. */
+ if (domain_id >= pcounter->num_domains)
+ pmt_counter_resize(pcounter, domain_id + 1);
- if (!aperf_required || !has_amperf_access()) {
- bic_enabled &= ~BIC_Avg_MHz;
- bic_enabled &= ~BIC_Busy;
- bic_enabled &= ~BIC_Bzy_MHz;
- bic_enabled &= ~BIC_IPC;
+ assert(pcounter->domains);
+ assert(domain_id < pcounter->num_domains);
+
+ pcounter->domains[domain_id].pcounter = pmmio;
+}
+
+/*
+ * Register domain_id of the PMT counter `name` on the list for `scope`,
+ * pointing it at `offset` inside the mapped region identified by `guid`.
+ *
+ * A counter with the same name is reused if it is already on the list; its
+ * type, scope, lsb, msb and format must then match the arguments.
+ *
+ * Returns 0 on success.  Returns 1 when mode is PMT_OPEN_TRY and the guid
+ * cannot be mapped or the offset lies outside the mapped size.  Exits the
+ * program on an invalid lsb/msb pair, on conflicting parameters, on
+ * allocation failure, and on the failures above for any other mode.
+ */
+int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type,
+		    unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope,
+		    enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode)
+{
+	struct pmt_mmio *mmio;
+	struct pmt_counter *pcounter;
+	struct pmt_counter **const pmt_root = pmt_get_scope_root(scope);
+	bool new_counter = false;
+	int conflict = 0;
+
+	if (lsb > msb) {
+		fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name);
+		exit(1);
+	}
+
+	if (msb >= 64) {
+		fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name);
+		exit(1);
+	}
+
+	mmio = pmt_add_guid(guid);
+	if (!mmio) {
+		if (mode != PMT_OPEN_TRY) {
+			fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid);
+			exit(1);
+		}
+
+		return 1;
+	}
+
+	if (offset >= mmio->size) {
+		if (mode != PMT_OPEN_TRY) {
+			fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size);
+			exit(1);
+		}
+
+		return 1;
+	}
+
+	pcounter = pmt_find_counter(*pmt_root, name);
+	if (!pcounter) {
+		pcounter = calloc(1, sizeof(*pcounter));
+		if (!pcounter) {
+			fprintf(stderr, "%s: Failed to allocate pmt_counter\n", __func__);
+			exit(1);
+		}
+		new_counter = true;
+	}
+
+	if (new_counter) {
+		/* calloc() zeroed the buffer, so the copied name stays NUL-terminated. */
+		strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1);
+		pcounter->type = type;
+		pcounter->scope = scope;
+		pcounter->lsb = lsb;
+		pcounter->msb = msb;
+		pcounter->format = format;
+	} else {
+		conflict += pcounter->type != type;
+		conflict += pcounter->scope != scope;
+		conflict += pcounter->lsb != lsb;
+		conflict += pcounter->msb != msb;
+		conflict += pcounter->format != format;
+	}
+
+	if (conflict) {
+		fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n",
+			__func__, name);
+		exit(1);
+	}
+
+	pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id);
+
+	/* Only a freshly allocated counter still has to be linked into the list. */
+	if (new_counter) {
+		pcounter->next = *pmt_root;
+		*pmt_root = pcounter;
+	}
+
+	return 0;
+}
+
+/* Register the built-in PMT counters; currently only Die%c6, when BIC_Diec6 is enabled. */
+void pmt_init(void)
+{
+	if (!BIC_IS_ENABLED(BIC_Diec6))
+		return;
+
+	/* PMT_OPEN_TRY: a failure to map the region is not treated as an error. */
+	pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB,
+			PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA,
+			0, PMT_OPEN_TRY);
}
@@ -7923,16 +8975,18 @@ void turbostat_init()
process_cpuid();
counter_info_init();
probe_pm_features();
- set_amperf_source();
+ msr_perf_init();
linux_perf_init();
rapl_perf_init();
cstate_perf_init();
+ added_perf_counters_init();
+ pmt_init();
for_all_cpus(get_cpu_type, ODD_COUNTERS);
for_all_cpus(get_cpu_type, EVEN_COUNTERS);
- if (DO_BIC(BIC_IPC))
- (void)get_instr_count_fd(base_cpu);
+ if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1)
+ BIC_PRESENT(BIC_IPC);
/*
* If TSC tweak is needed, but couldn't get it,
@@ -8017,7 +9071,7 @@ int get_and_dump_counters(void)
void print_version()
{
- fprintf(outf, "turbostat version 2024.05.10 - Len Brown <lenb@kernel.org>\n");
+ fprintf(outf, "turbostat version 2024.07.26 - Len Brown <lenb@kernel.org>\n");
}
#define COMMAND_LINE_SIZE 2048
@@ -8049,7 +9103,7 @@ struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name)
for (mp = head; mp; mp = mp->next) {
if (debug)
- printf("%s: %s %s\n", __func__, name, mp->name);
+ fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name);
if (!strncmp(name, mp->name, strlen(mp->name)))
return mp;
}
@@ -8066,8 +9120,8 @@ int add_counter(unsigned int msr_num, char *path, char *name,
errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);
if (debug)
- printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num,
- path, name, width, scope, type, format, flags, id);
+ fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n",
+ __func__, msr_num, path, name, width, scope, type, format, flags, id);
switch (scope) {
@@ -8075,7 +9129,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
msrp = find_msrp_by_name(sys.tp, name);
if (msrp) {
if (debug)
- printf("%s: %s FOUND\n", __func__, name);
+ fprintf(stderr, "%s: %s FOUND\n", __func__, name);
break;
}
if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) {
@@ -8087,7 +9141,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
msrp = find_msrp_by_name(sys.cp, name);
if (msrp) {
if (debug)
- printf("%s: %s FOUND\n", __func__, name);
+ fprintf(stderr, "%s: %s FOUND\n", __func__, name);
break;
}
if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) {
@@ -8099,7 +9153,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
msrp = find_msrp_by_name(sys.pp, name);
if (msrp) {
if (debug)
- printf("%s: %s FOUND\n", __func__, name);
+ fprintf(stderr, "%s: %s FOUND\n", __func__, name);
break;
}
if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) {
@@ -8116,6 +9170,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
msrp = calloc(1, sizeof(struct msr_counter));
if (msrp == NULL)
err(-1, "calloc msr_counter");
+
msrp->msr_num = msr_num;
strncpy(msrp->name, name, NAME_BYTES - 1);
msrp->width = width;
@@ -8156,11 +9211,106 @@ int add_counter(unsigned int msr_num, char *path, char *name,
return 0;
}
-void parse_add_command(char *add_command)
+/*
+ * Allocate a perf_counter_info and fill in only what identifies the counter
+ * and how it is presented: perf device and event, column name, width, scope,
+ * type and format.
+ *
+ * The runtime buffers used for reading the counter are deliberately not set
+ * up here; that happens when all perf counters are initialized, which keeps
+ * re_initialize() simple.
+ *
+ * Exits via errx() if the allocation fails, so the result is never NULL.
+ */
+struct perf_counter_info *make_perf_counter_info(const char *perf_device,
+						 const char *perf_event,
+						 const char *name,
+						 unsigned int width,
+						 enum counter_scope scope,
+						 enum counter_type type, enum counter_format format)
+{
+	struct perf_counter_info *info = calloc(1, sizeof(*info));
+
+	if (!info)
+		errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event);
+
+	info->width = width;
+	info->scope = scope;
+	info->type = type;
+	info->format = format;
+
+	/* The buffers were zeroed by calloc(), so size - 1 keeps them NUL-terminated. */
+	strncpy(info->device, perf_device, ARRAY_SIZE(info->device) - 1);
+	strncpy(info->event, perf_event, ARRAY_SIZE(info->event) - 1);
+	strncpy(info->name, name, ARRAY_SIZE(info->name) - 1);
+
+	return info;
+}
+
+/*
+ * Create a perf_counter_info for perf_device/perf_event and push it onto the
+ * list matching `scope`, bumping that scope's added-perf-counter count.
+ *
+ * Returns 0 on success, or -1 (after a warnx()) when the scope already holds
+ * its maximum number of added counters.
+ */
+int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width,
+		     enum counter_scope scope, enum counter_type type, enum counter_format format)
+{
+	struct perf_counter_info *info;
+
+	/* Refuse the counter up front when its scope is already full. */
+	if (scope == SCOPE_CPU && sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) {
+		warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event);
+		return -1;
+	}
+
+	if (scope == SCOPE_CORE && sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) {
+		warnx("ignoring core counter perf/%s/%s", perf_device, perf_event);
+		return -1;
+	}
+
+	if (scope == SCOPE_PACKAGE && sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) {
+		warnx("ignoring package counter perf/%s/%s", perf_device, perf_event);
+		return -1;
+	}
+
+	info = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format);
+
+	if (!info)
+		return -1;
+
+	/* Link the new entry at the head of its scope's list. */
+	if (scope == SCOPE_CPU) {
+		info->next = sys.perf_tp;
+		sys.perf_tp = info;
+		++sys.added_thread_perf_counters;
+	} else if (scope == SCOPE_CORE) {
+		info->next = sys.perf_cp;
+		sys.perf_cp = info;
+		++sys.added_core_perf_counters;
+	} else if (scope == SCOPE_PACKAGE) {
+		info->next = sys.perf_pp;
+		sys.perf_pp = info;
+		++sys.added_package_perf_counters;
+	}
+
+	// FIXME: we might not have debug here yet
+	if (debug)
+		fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n",
+			__func__, info->device, info->event, info->name, info->scope);
+
+	return 0;
+}
+
+void parse_add_command_msr(char *add_command)
{
int msr_num = 0;
char *path = NULL;
- char name_buffer[NAME_BYTES] = "";
+ char perf_device[PERF_DEV_NAME_BYTES] = "";
+ char perf_event[PERF_EVT_NAME_BYTES] = "";
+ char name_buffer[PERF_NAME_BYTES] = "";
int width = 64;
int fail = 0;
enum counter_scope scope = SCOPE_CPU;
@@ -8175,6 +9325,11 @@ void parse_add_command(char *add_command)
if (sscanf(add_command, "msr%d", &msr_num) == 1)
goto next;
+ BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31);
+ BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31);
+ if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2)
+ goto next;
+
if (*add_command == '/') {
path = add_command;
goto next;
@@ -8222,7 +9377,8 @@ void parse_add_command(char *add_command)
goto next;
}
- if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { /* 18 < NAME_BYTES */
+ BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18);
+ if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {
char *eos;
eos = strchr(name_buffer, ',');
@@ -8239,21 +9395,33 @@ next:
}
}
- if ((msr_num == 0) && (path == NULL)) {
- fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n");
+ if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) {
+ fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event ) required\n");
fail++;
}
+ /* Test for non-empty perf_device and perf_event */
+ const bool is_perf_counter = perf_device[0] && perf_event[0];
+
/* generate default column header */
if (*name_buffer == '\0') {
- if (width == 32)
- sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
- else
- sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
+ if (is_perf_counter) {
+ snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event);
+ } else {
+ if (width == 32)
+ sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
+ else
+ sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
+ }
}
- if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0))
- fail++;
+ if (is_perf_counter) {
+ if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format))
+ fail++;
+ } else {
+ if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0))
+ fail++;
+ }
if (fail) {
help();
@@ -8261,6 +9429,195 @@ next:
}
}
+/* Return true when @str begins with @prefix; an empty @prefix always matches. */
+bool starts_with(const char *str, const char *prefix)
+{
+	/* Only the first strlen(prefix) bytes take part in the comparison. */
+	const size_t prefix_len = strlen(prefix);
+
+	return !strncmp(str, prefix, prefix_len);
+}
+
+/*
+ * Parse one "pmt,key=value[,key=value...]" string and register the counter
+ * it describes through pmt_add_counter().
+ *
+ * Recognized keys: name=, type=, domain=, format=, offset=, lsb=, msb=, guid=.
+ * name, domain, offset, lsb, msb and guid are mandatory; type and format keep
+ * their initial values (PMT_TYPE_RAW / FORMAT_RAW) when omitted.
+ *
+ * add_command is modified in place: every ',' separator is overwritten with
+ * a NUL byte, and name/type_name/format_name point into that same buffer.
+ *
+ * A missing or invalid value prints a message and calls exit(1).
+ */
+void parse_add_command_pmt(char *add_command)
+{
+ char *name = NULL;
+ char *type_name = NULL;
+ char *format_name = NULL;
+ unsigned int offset;
+ unsigned int lsb;
+ unsigned int msb;
+ unsigned int guid;
+ unsigned int domain_id;
+ enum counter_scope scope = 0;
+ enum pmt_datatype type = PMT_TYPE_RAW;
+ enum counter_format format = FORMAT_RAW;
+ bool has_offset = false;
+ bool has_lsb = false;
+ bool has_msb = false;
+ bool has_format = true; /* Format has a default value. */
+ bool has_guid = false;
+ bool has_scope = false;
+ bool has_type = true; /* Type has a default value. */
+
+ /* Consume the "pmt," prefix. */
+ add_command = strchr(add_command, ',');
+ if (!add_command) {
+ help();
+ exit(1);
+ }
+ ++add_command;
+
+ /*
+ * Each pass examines the token starting at add_command. The token is
+ * only NUL-terminated afterwards, at the next: label, which also advances
+ * add_command to the following token (or to NULL after the last one).
+ */
+ while (add_command) {
+ if (starts_with(add_command, "name=")) {
+ name = add_command + strlen("name=");
+ goto next;
+ }
+
+ if (starts_with(add_command, "type=")) {
+ type_name = add_command + strlen("type=");
+ goto next;
+ }
+
+ /* domain=cpuN | coreN | packageN selects the scope and stores N in domain_id. */
+ if (starts_with(add_command, "domain=")) {
+ const size_t prefix_len = strlen("domain=");
+
+ if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) {
+ scope = SCOPE_CPU;
+ has_scope = true;
+ } else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) {
+ scope = SCOPE_CORE;
+ has_scope = true;
+ } else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) {
+ scope = SCOPE_PACKAGE;
+ has_scope = true;
+ }
+
+ if (!has_scope) {
+ printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n",
+ __func__);
+ exit(1);
+ }
+
+ goto next;
+ }
+
+ if (starts_with(add_command, "format=")) {
+ format_name = add_command + strlen("format=");
+ goto next;
+ }
+
+ /* offset, lsb and msb are read with %u; guid below is read as hexadecimal (%x). */
+ if (sscanf(add_command, "offset=%u", &offset) == 1) {
+ has_offset = true;
+ goto next;
+ }
+
+ if (sscanf(add_command, "lsb=%u", &lsb) == 1) {
+ has_lsb = true;
+ goto next;
+ }
+
+ if (sscanf(add_command, "msb=%u", &msb) == 1) {
+ has_msb = true;
+ goto next;
+ }
+
+ if (sscanf(add_command, "guid=%x", &guid) == 1) {
+ has_guid = true;
+ goto next;
+ }
+
+ /* A token matching none of the keys above falls through and is skipped without a diagnostic. */
+next:
+ add_command = strchr(add_command, ',');
+ if (add_command) {
+ *add_command = '\0';
+ add_command++;
+ }
+ }
+
+ if (!name) {
+ printf("%s: missing %s\n", __func__, "name");
+ exit(1);
+ }
+
+ /* Names of PMT_COUNTER_NAME_SIZE_BYTES characters or more are rejected. */
+ if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) {
+ printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES);
+ exit(1);
+ }
+
+ /* format= was given: it must be exactly "raw" or "delta". */
+ if (format_name) {
+ has_format = false;
+
+ if (strcmp("raw", format_name) == 0) {
+ format = FORMAT_RAW;
+ has_format = true;
+ }
+
+ if (strcmp("delta", format_name) == 0) {
+ format = FORMAT_DELTA;
+ has_format = true;
+ }
+
+ if (!has_format) {
+ fprintf(stderr, "%s: Invalid format %s. Expected raw or delta\n", __func__, format_name);
+ exit(1);
+ }
+ }
+
+ /* type= was given: it must be exactly "raw" or "txtal_time". */
+ if (type_name) {
+ has_type = false;
+
+ if (strcmp("raw", type_name) == 0) {
+ type = PMT_TYPE_RAW;
+ has_type = true;
+ }
+
+ if (strcmp("txtal_time", type_name) == 0) {
+ type = PMT_TYPE_XTAL_TIME;
+ has_type = true;
+ }
+
+ if (!has_type) {
+ printf("%s: invalid %s: %s\n", __func__, "type", type_name);
+ exit(1);
+ }
+ }
+
+ if (!has_offset) {
+ printf("%s : missing %s\n", __func__, "offset");
+ exit(1);
+ }
+
+ if (!has_lsb) {
+ printf("%s: missing %s\n", __func__, "lsb");
+ exit(1);
+ }
+
+ if (!has_msb) {
+ printf("%s: missing %s\n", __func__, "msb");
+ exit(1);
+ }
+
+ if (!has_guid) {
+ printf("%s: missing %s\n", __func__, "guid");
+ exit(1);
+ }
+
+ /* has_scope is only set by a valid domain= key, so this reports a missing domain=. */
+ if (!has_scope) {
+ printf("%s: missing %s\n", __func__, "scope");
+ exit(1);
+ }
+
+ if (lsb > msb) {
+ printf("%s: lsb > msb doesn't make sense\n", __func__);
+ exit(1);
+ }
+
+ pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED);
+}
+
+/*
+ * Dispatch one add-command string: anything beginning with "pmt" goes to
+ * parse_add_command_pmt(), everything else to parse_add_command_msr().
+ */
+void parse_add_command(char *add_command)
+{
+	/*
+	 * The parsers are called as plain statements. This function returns
+	 * void, and ISO C does not allow "return <expression>;" in a function
+	 * whose return type is void.
+	 */
+	if (starts_with(add_command, "pmt")) {
+		parse_add_command_pmt(add_command);
+		return;
+	}
+
+	parse_add_command_msr(add_command);
+}
+
int is_deferred_add(char *name)
{
int i;
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 030b388800f0..3d1ca9e38b1f 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -14,6 +14,7 @@ ldflags-y += --wrap=cxl_dvsec_rr_decode
ldflags-y += --wrap=devm_cxl_add_rch_dport
ldflags-y += --wrap=cxl_rcd_component_reg_phys
ldflags-y += --wrap=cxl_endpoint_parse_cdat
+ldflags-y += --wrap=cxl_setup_parent_dport
DRIVERS := ../../../drivers
CXL_SRC := $(DRIVERS)/cxl
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index eaf091a3d331..129f179b0ac5 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -385,19 +385,21 @@ struct cxl_test_gen_media {
struct cxl_test_gen_media gen_media = {
.id = CXL_EVENT_GEN_MEDIA_UUID,
.rec = {
- .hdr = {
- .length = sizeof(struct cxl_test_gen_media),
- .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT,
- /* .handle = Set dynamically */
- .related_handle = cpu_to_le16(0),
+ .media_hdr = {
+ .hdr = {
+ .length = sizeof(struct cxl_test_gen_media),
+ .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT,
+ /* .handle = Set dynamically */
+ .related_handle = cpu_to_le16(0),
+ },
+ .phys_addr = cpu_to_le64(0x2000),
+ .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,
+ .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,
+ .transaction_type = CXL_GMER_TRANS_HOST_WRITE,
+ /* .validity_flags = <set below> */
+ .channel = 1,
+ .rank = 30,
},
- .phys_addr = cpu_to_le64(0x2000),
- .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,
- .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,
- .transaction_type = CXL_GMER_TRANS_HOST_WRITE,
- /* .validity_flags = <set below> */
- .channel = 1,
- .rank = 30
},
};
@@ -409,18 +411,20 @@ struct cxl_test_dram {
struct cxl_test_dram dram = {
.id = CXL_EVENT_DRAM_UUID,
.rec = {
- .hdr = {
- .length = sizeof(struct cxl_test_dram),
- .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,
- /* .handle = Set dynamically */
- .related_handle = cpu_to_le16(0),
+ .media_hdr = {
+ .hdr = {
+ .length = sizeof(struct cxl_test_dram),
+ .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,
+ /* .handle = Set dynamically */
+ .related_handle = cpu_to_le16(0),
+ },
+ .phys_addr = cpu_to_le64(0x8000),
+ .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT,
+ .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR,
+ .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,
+ /* .validity_flags = <set below> */
+ .channel = 1,
},
- .phys_addr = cpu_to_le64(0x8000),
- .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT,
- .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR,
- .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,
- /* .validity_flags = <set below> */
- .channel = 1,
.bank_group = 5,
.bank = 2,
.column = {0xDE, 0xAD},
@@ -474,11 +478,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds,
static void cxl_mock_add_event_logs(struct mock_event_store *mes)
{
put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK,
- &gen_media.rec.validity_flags);
+ &gen_media.rec.media_hdr.validity_flags);
put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP |
CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN,
- &dram.rec.validity_flags);
+ &dram.rec.media_hdr.validity_flags);
mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed);
mes_add_event(mes, CXL_EVENT_TYPE_INFO,
@@ -1131,27 +1135,28 @@ static bool mock_poison_dev_max_injected(struct cxl_dev_state *cxlds)
return (count >= poison_inject_dev_max);
}
-static bool mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa)
+static int mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa)
{
+ /* Return EBUSY to match the CXL driver handling */
if (mock_poison_dev_max_injected(cxlds)) {
dev_dbg(cxlds->dev,
"Device poison injection limit has been reached: %d\n",
- MOCK_INJECT_DEV_MAX);
- return false;
+ poison_inject_dev_max);
+ return -EBUSY;
}
for (int i = 0; i < MOCK_INJECT_TEST_MAX; i++) {
if (!mock_poison_list[i].cxlds) {
mock_poison_list[i].cxlds = cxlds;
mock_poison_list[i].dpa = dpa;
- return true;
+ return 0;
}
}
dev_dbg(cxlds->dev,
"Mock test poison injection limit has been reached: %d\n",
MOCK_INJECT_TEST_MAX);
- return false;
+ return -ENXIO;
}
static bool mock_poison_found(struct cxl_dev_state *cxlds, u64 dpa)
@@ -1175,10 +1180,8 @@ static int mock_inject_poison(struct cxl_dev_state *cxlds,
dev_dbg(cxlds->dev, "DPA: 0x%llx already poisoned\n", dpa);
return 0;
}
- if (!mock_poison_add(cxlds, dpa))
- return -ENXIO;
- return 0;
+ return mock_poison_add(cxlds, dpa);
}
static bool mock_poison_del(struct cxl_dev_state *cxlds, u64 dpa)
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index 6f737941dc0e..d619672faa49 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -299,6 +299,18 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port)
}
EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, CXL);
+/*
+ * Link-time wrapper for cxl_setup_parent_dport(): the real function is
+ * called unless mock ops are registered and report @dport's device as a
+ * mock port.
+ */
+void __wrap_cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport)
+{
+	int ops_idx;
+	struct cxl_mock_ops *mock_ops = get_cxl_mock_ops(&ops_idx);
+	bool is_mock = mock_ops && mock_ops->is_mock_port(dport->dport_dev);
+
+	if (!is_mock)
+		cxl_setup_parent_dport(host, dport);
+
+	/* Always pair the get above with a put, whichever path was taken. */
+	put_cxl_mock_ops(ops_idx);
+}
+EXPORT_SYMBOL_NS_GPL(__wrap_cxl_setup_parent_dport, CXL);
+
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(ACPI);
MODULE_IMPORT_NS(CXL);
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index ea956082e6a4..e4313726fae3 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -407,4 +407,5 @@ union acpi_object * __wrap_acpi_evaluate_dsm(acpi_handle handle, const guid_t *g
}
EXPORT_SYMBOL(__wrap_acpi_evaluate_dsm);
+MODULE_DESCRIPTION("NVDIMM unit test");
MODULE_LICENSE("GPL v2");
diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c
index b438f3d053ee..892e990c034a 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -987,5 +987,6 @@ static __exit void ndtest_exit(void)
module_init(ndtest_init);
module_exit(ndtest_exit);
+MODULE_DESCRIPTION("Test non-NFIT devices");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("IBM Corporation");
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index a61df347a33d..cfd4378e2129 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -3382,5 +3382,6 @@ static __exit void nfit_test_exit(void)
module_init(nfit_test_init);
module_exit(nfit_test_exit);
+MODULE_DESCRIPTION("Test ACPI NFIT devices");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Intel Corporation");
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 7527f738b4a1..d1acd7d58850 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -5,8 +5,8 @@ CFLAGS += -I. -I../../include -I../../../lib -g -Og -Wall \
LDFLAGS += -fsanitize=address -fsanitize=undefined
LDLIBS+= -lpthread -lurcu
TARGETS = main idr-test multiorder xarray maple
-CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o \
- slab.o maple.o
+LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o
+CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o maple.o $(LIBS)
OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \
iteration_check_2.o benchmark.o
diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c
deleted file mode 100644
index 66ec4a24a203..000000000000
--- a/tools/testing/radix-tree/bitmap.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/* lib/bitmap.c pulls in at least two other files. */
-
-#include <linux/bitmap.h>
-
-void bitmap_clear(unsigned long *map, unsigned int start, int len)
-{
- unsigned long *p = map + BIT_WORD(start);
- const unsigned int size = start + len;
- int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
- unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
-
- while (len - bits_to_clear >= 0) {
- *p &= ~mask_to_clear;
- len -= bits_to_clear;
- bits_to_clear = BITS_PER_LONG;
- mask_to_clear = ~0UL;
- p++;
- }
- if (len) {
- mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
- *p &= ~mask_to_clear;
- }
-}
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c
index ca24f6839d50..84b8c3c92c79 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -424,6 +424,7 @@ void idr_checks(void)
#define module_init(x)
#define module_exit(x)
#define MODULE_AUTHOR(x)
+#define MODULE_DESCRIPTION(X)
#define MODULE_LICENSE(x)
#define dump_stack() assert(0)
void ida_dump(struct ida *);
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index f1caf4bcf937..cd1cf05503b4 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -19,6 +19,7 @@
#define module_init(x)
#define module_exit(x)
#define MODULE_AUTHOR(x)
+#define MODULE_DESCRIPTION(X)
#define MODULE_LICENSE(x)
#define dump_stack() assert(0)
diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c
index f20e12cbbfd4..d0e53bff1eb6 100644
--- a/tools/testing/radix-tree/xarray.c
+++ b/tools/testing/radix-tree/xarray.c
@@ -10,6 +10,7 @@
#define module_init(x)
#define module_exit(x)
#define MODULE_AUTHOR(x)
+#define MODULE_DESCRIPTION(X)
#define MODULE_LICENSE(x)
#define dump_stack() assert(0)
diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c
index 4c941270d8de..e4fa507cbdd0 100644
--- a/tools/testing/selftests/arm64/abi/ptrace.c
+++ b/tools/testing/selftests/arm64/abi/ptrace.c
@@ -156,7 +156,7 @@ static void test_hw_debug(pid_t child, int type, const char *type_name)
/* Zero is not currently architecturally valid */
ksft_test_result(arch, "%s_arch_set\n", type_name);
} else {
- ksft_test_result_skip("%s_arch_set\n");
+ ksft_test_result_skip("%s_arch_set\n", type_name);
}
}
diff --git a/tools/testing/selftests/bpf/DENYLIST.aarch64 b/tools/testing/selftests/bpf/DENYLIST.aarch64
index 3c7c3e79aa93..901349da680f 100644
--- a/tools/testing/selftests/bpf/DENYLIST.aarch64
+++ b/tools/testing/selftests/bpf/DENYLIST.aarch64
@@ -1,6 +1,5 @@
bpf_cookie/multi_kprobe_attach_api # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3
bpf_cookie/multi_kprobe_link_api # kprobe_multi_link_api_subtest:FAIL:fentry_raw_skel_load unexpected error: -3
-fexit_sleep # The test never returns. The remaining tests cannot start.
kprobe_multi_bench_attach # needs CONFIG_FPROBE
kprobe_multi_test # needs CONFIG_FPROBE
module_attach # prog 'kprobe_multi': failed to auto-attach: -95
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index dd49c1d23a60..81d4757ecd4c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp
# Make sure we are able to include and link libbpf against c++.
$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
$(call msg,CXX,,$@)
- $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@
+ $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@
# Benchmark runner
$(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ)
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c
index f949647dbbc2..552a0875ca6d 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c
@@ -21,13 +21,13 @@ static int do_sleep(void *skel)
}
#define STACK_SIZE (1024 * 1024)
-static char child_stack[STACK_SIZE];
void test_fexit_sleep(void)
{
struct fexit_sleep_lskel *fexit_skel = NULL;
int wstatus, duration = 0;
pid_t cpid;
+ char *child_stack = NULL;
int err, fexit_cnt;
fexit_skel = fexit_sleep_lskel__open_and_load();
@@ -38,6 +38,11 @@ void test_fexit_sleep(void)
if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err))
goto cleanup;
+ child_stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE |
+ MAP_ANONYMOUS | MAP_STACK, -1, 0);
+ if (!ASSERT_NEQ(child_stack, MAP_FAILED, "mmap"))
+ goto cleanup;
+
cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel);
if (CHECK(cpid == -1, "clone", "%s\n", strerror(errno)))
goto cleanup;
@@ -78,5 +83,6 @@ void test_fexit_sleep(void)
goto cleanup;
cleanup:
+ munmap(child_stack, STACK_SIZE);
fexit_sleep_lskel__destroy(fexit_skel);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index e91b59366030..9ce0e0e0b7da 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -29,6 +29,8 @@
#include "sockmap_helpers.h"
+#define NO_FLAGS 0
+
static void test_insert_invalid(struct test_sockmap_listen *skel __always_unused,
int family, int sotype, int mapfd)
{
@@ -1376,7 +1378,8 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
static void pairs_redir_to_connected(int cli0, int peer0, int cli1, int peer1,
int sock_mapfd, int nop_mapfd,
- int verd_mapfd, enum redir_mode mode)
+ int verd_mapfd, enum redir_mode mode,
+ int send_flags)
{
const char *log_prefix = redir_mode_str(mode);
unsigned int pass;
@@ -1396,12 +1399,11 @@ static void pairs_redir_to_connected(int cli0, int peer0, int cli1, int peer1,
return;
}
- n = write(cli1, "a", 1);
- if (n < 0)
- FAIL_ERRNO("%s: write", log_prefix);
- if (n == 0)
- FAIL("%s: incomplete write", log_prefix);
- if (n < 1)
+ /* Last byte is OOB data when send_flags has MSG_OOB bit set */
+ n = xsend(cli1, "ab", 2, send_flags);
+ if (n >= 0 && n < 2)
+ FAIL("%s: incomplete send", log_prefix);
+ if (n < 2)
return;
key = SK_PASS;
@@ -1416,6 +1418,25 @@ static void pairs_redir_to_connected(int cli0, int peer0, int cli1, int peer1,
FAIL_ERRNO("%s: recv_timeout", log_prefix);
if (n == 0)
FAIL("%s: incomplete recv", log_prefix);
+
+ if (send_flags & MSG_OOB) {
+ /* Check that we can't read OOB while in sockmap */
+ errno = 0;
+ n = recv(peer1, &b, 1, MSG_OOB | MSG_DONTWAIT);
+ if (n != -1 || errno != EOPNOTSUPP)
+ FAIL("%s: recv(MSG_OOB): expected EOPNOTSUPP: retval=%d errno=%d",
+ log_prefix, n, errno);
+
+ /* Remove peer1 from sockmap */
+ xbpf_map_delete_elem(sock_mapfd, &(int){ 1 });
+
+ /* Check that OOB was dropped on redirect */
+ errno = 0;
+ n = recv(peer1, &b, 1, MSG_OOB | MSG_DONTWAIT);
+ if (n != -1 || errno != EINVAL)
+ FAIL("%s: recv(MSG_OOB): expected EINVAL: retval=%d errno=%d",
+ log_prefix, n, errno);
+ }
}
static void unix_redir_to_connected(int sotype, int sock_mapfd,
@@ -1432,7 +1453,8 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd,
goto close0;
c1 = sfd[0], p1 = sfd[1];
- pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, mode);
+ pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd,
+ mode, NO_FLAGS);
xclose(c1);
xclose(p1);
@@ -1722,7 +1744,8 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
if (err)
goto close_cli0;
- pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, mode);
+ pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd,
+ mode, NO_FLAGS);
xclose(c1);
xclose(p1);
@@ -1780,7 +1803,8 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd,
if (err)
goto close;
- pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd, mode);
+ pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, -1, verd_mapfd,
+ mode, NO_FLAGS);
xclose(c1);
xclose(p1);
@@ -1815,10 +1839,9 @@ static void inet_unix_skb_redir_to_connected(struct test_sockmap_listen *skel,
xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
}
-static void unix_inet_redir_to_connected(int family, int type,
- int sock_mapfd, int nop_mapfd,
- int verd_mapfd,
- enum redir_mode mode)
+static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd,
+ int nop_mapfd, int verd_mapfd,
+ enum redir_mode mode, int send_flags)
{
int c0, c1, p0, p1;
int sfd[2];
@@ -1828,19 +1851,18 @@ static void unix_inet_redir_to_connected(int family, int type,
if (err)
return;
- if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd))
+ if (socketpair(AF_UNIX, type | SOCK_NONBLOCK, 0, sfd))
goto close_cli0;
c1 = sfd[0], p1 = sfd[1];
- pairs_redir_to_connected(c0, p0, c1, p1,
- sock_mapfd, nop_mapfd, verd_mapfd, mode);
+ pairs_redir_to_connected(c0, p0, c1, p1, sock_mapfd, nop_mapfd,
+ verd_mapfd, mode, send_flags);
xclose(c1);
xclose(p1);
close_cli0:
xclose(c0);
xclose(p0);
-
}
static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel,
@@ -1859,31 +1881,42 @@ static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel,
skel->bss->test_ingress = false;
unix_inet_redir_to_connected(family, SOCK_DGRAM,
sock_map, -1, verdict_map,
- REDIR_EGRESS);
+ REDIR_EGRESS, NO_FLAGS);
unix_inet_redir_to_connected(family, SOCK_DGRAM,
sock_map, -1, verdict_map,
- REDIR_EGRESS);
+ REDIR_EGRESS, NO_FLAGS);
unix_inet_redir_to_connected(family, SOCK_DGRAM,
sock_map, nop_map, verdict_map,
- REDIR_EGRESS);
+ REDIR_EGRESS, NO_FLAGS);
+ unix_inet_redir_to_connected(family, SOCK_STREAM,
+ sock_map, nop_map, verdict_map,
+ REDIR_EGRESS, NO_FLAGS);
+
+ /* MSG_OOB not supported by AF_UNIX SOCK_DGRAM */
unix_inet_redir_to_connected(family, SOCK_STREAM,
sock_map, nop_map, verdict_map,
- REDIR_EGRESS);
+ REDIR_EGRESS, MSG_OOB);
+
skel->bss->test_ingress = true;
unix_inet_redir_to_connected(family, SOCK_DGRAM,
sock_map, -1, verdict_map,
- REDIR_INGRESS);
+ REDIR_INGRESS, NO_FLAGS);
unix_inet_redir_to_connected(family, SOCK_STREAM,
sock_map, -1, verdict_map,
- REDIR_INGRESS);
+ REDIR_INGRESS, NO_FLAGS);
unix_inet_redir_to_connected(family, SOCK_DGRAM,
sock_map, nop_map, verdict_map,
- REDIR_INGRESS);
+ REDIR_INGRESS, NO_FLAGS);
+ unix_inet_redir_to_connected(family, SOCK_STREAM,
+ sock_map, nop_map, verdict_map,
+ REDIR_INGRESS, NO_FLAGS);
+
+ /* MSG_OOB not supported by AF_UNIX SOCK_DGRAM */
unix_inet_redir_to_connected(family, SOCK_STREAM,
sock_map, nop_map, verdict_map,
- REDIR_INGRESS);
+ REDIR_INGRESS, MSG_OOB);
xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index bd8c75b620c2..c397336fe1ed 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -216,7 +216,7 @@ static void test_uretprobe_regs_change(void)
}
#ifndef __NR_uretprobe
-#define __NR_uretprobe 467
+#define __NR_uretprobe 335
#endif
__naked unsigned long uretprobe_syscall_call_1(void)
@@ -253,7 +253,7 @@ static void test_uretprobe_syscall_call(void)
struct uprobe_syscall_executed *skel;
int pid, status, err, go[2], c;
- if (ASSERT_OK(pipe(go), "pipe"))
+ if (!ASSERT_OK(pipe(go), "pipe"))
return;
skel = uprobe_syscall_executed__open_and_load();
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index f76b5d67a3ee..c87ee2bf558c 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -68,7 +68,8 @@ static int open_xsk(int ifindex, struct xsk *xsk)
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
- .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM,
+ .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM |
+ XDP_UMEM_TX_METADATA_LEN,
.tx_metadata_len = sizeof(struct xsk_tx_metadata),
};
__u32 idx;
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c
index ba97165bdb28..a657651eba52 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c
@@ -14,9 +14,9 @@ typedef int *ptr_arr_t[6];
typedef int *ptr_multiarr_t[7][8][9][10];
-typedef int * (*fn_ptr_arr_t[11])();
+typedef int * (*fn_ptr_arr_t[11])(void);
-typedef int * (*fn_ptr_multiarr_t[12][13])();
+typedef int * (*fn_ptr_multiarr_t[12][13])(void);
struct root_struct {
arr_t _1;
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
index ad21ee8c7e23..29d01fff32bd 100644
--- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
@@ -100,7 +100,7 @@ typedef void (*printf_fn_t)(const char *, ...);
* `int -> char *` function and returns pointer to a char. Equivalent:
* typedef char * (*fn_input_t)(int);
* typedef char * (*fn_output_outer_t)(fn_input_t);
- * typedef const fn_output_outer_t (* fn_output_inner_t)();
+ * typedef const fn_output_outer_t (* fn_output_inner_t)(void);
* typedef const fn_output_inner_t fn_ptr_arr2_t[5];
*/
/* ----- START-EXPECTED-OUTPUT ----- */
@@ -127,7 +127,7 @@ typedef void (* (*signal_t)(int, void (*)(int)))(int);
typedef char * (*fn_ptr_arr1_t[10])(int **);
-typedef char * (* (* const fn_ptr_arr2_t[5])())(char * (*)(int));
+typedef char * (* (* const fn_ptr_arr2_t[5])(void))(char * (*)(int));
struct struct_w_typedefs {
int_t a;
diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 16bdc3e25591..ef70b88bccb2 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -1432,4 +1432,58 @@ int iter_arr_with_actual_elem_count(const void *ctx)
return sum;
}
+__u32 upper, select_n, result;
+__u64 global;
+
+/*
+ * Return true as soon as one of str[0..3] equals the letter at the same
+ * position in "test"; return false when none of the four bytes matches.
+ */
+static __noinline bool nest_2(char *str)
+{
+ /* some insns (including branch insns) to ensure stacksafe() is triggered
+ * in nest_2(). This way, stacksafe() can compare frame associated with nest_1().
+ */
+ if (str[0] == 't')
+ return true;
+ if (str[1] == 'e')
+ return true;
+ if (str[2] == 's')
+ return true;
+ if (str[3] == 't')
+ return true;
+ return false;
+}
+
+/*
+ * Call nest_2() on one of two inputs selected by @n: a 16-byte local buffer
+ * filled by bpf_get_current_comm() (n == 0) or the global variable 'global'
+ * (n == 1). Any other @n, or a non-zero return from bpf_get_current_comm(),
+ * yields false.
+ */
+static __noinline bool nest_1(int n)
+{
+ /* case 0: uses a buffer on the stack, case 1: uses no stack buffer */
+ switch (n) {
+ case 0: {
+ char comm[16];
+
+ if (bpf_get_current_comm(comm, 16))
+ return false;
+ return nest_2(comm);
+ }
+ case 1:
+ return nest_2((char *)&global);
+ default:
+ return false;
+ }
+}
+
+/*
+ * Iterate with bpf_for(i, 0, upper), calling nest_1(select_n) each time.
+ * 'result' is set to 1 on the first call that returns false (and the program
+ * returns), or to 2 when the loop finishes without such a call.
+ */
+SEC("raw_tp")
+__success
+int iter_subprog_check_stacksafe(const void *ctx)
+{
+ long i;
+
+ bpf_for(i, 0, upper) {
+ if (!nest_1(select_n)) {
+ result = 1;
+ return 0;
+ }
+ }
+
+ result = 2;
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config
index 97d549ee894f..39f979690dd3 100644
--- a/tools/testing/selftests/cgroup/config
+++ b/tools/testing/selftests/cgroup/config
@@ -3,5 +3,4 @@ CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_SCHED=y
CONFIG_MEMCG=y
-CONFIG_MEMCG_KMEM=y
CONFIG_PAGE_COUNTER=y
diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
index 991c473e3859..12b4eb9d0434 100644
--- a/tools/testing/selftests/core/close_range_test.c
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -589,4 +589,39 @@ TEST(close_range_cloexec_unshare_syzbot)
EXPECT_EQ(close(fd3), 0);
}
+/*
+ * Open descriptors 2..127, share the descriptor table with a child created
+ * by clone3(CLONE_FILES), and have the child call close_range() with
+ * CLOSE_RANGE_UNSHARE from fd 64 upwards. The child then checks that fd 64
+ * is closed and that the next dup(0) returns 64.
+ */
+TEST(close_range_bitmap_corruption)
+{
+ pid_t pid;
+ int status;
+ struct __clone_args args = {
+ .flags = CLONE_FILES,
+ .exit_signal = SIGCHLD,
+ };
+
+ /* get the first 128 descriptors open */
+ for (int i = 2; i < 128; i++)
+ EXPECT_GE(dup2(0, i), 0);
+
+ /* get descriptor table shared */
+ pid = sys_clone3(&args, sizeof(args));
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* unshare and truncate descriptor table down to 64 */
+ if (sys_close_range(64, ~0U, CLOSE_RANGE_UNSHARE))
+ exit(EXIT_FAILURE);
+
+ ASSERT_EQ(fcntl(64, F_GETFD), -1);
+ /* ... and verify that the range 64..127 is not
+ stuck "fully used" according to secondary bitmap */
+ /* NOTE(review): the exit(EXIT_FAILURE) right after EXPECT_EQ() appears to be
+ the statement the harness runs when the expectation fails - confirm
+ against kselftest_harness.h */
+ EXPECT_EQ(dup(0), 64)
+ exit(EXIT_FAILURE);
+ exit(EXIT_SUCCESS);
+ }
+
+ /* parent: the child must have exited normally with status 0 */
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 29a22f50e762..1e2e98cc809d 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -4,7 +4,7 @@
TEST_GEN_FILES += huge_count_read_write
TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race
TEST_GEN_FILES += debugfs_target_ids_pid_leak
-TEST_GEN_FILES += access_memory
+TEST_GEN_FILES += access_memory access_memory_even
TEST_FILES = _chk_dependency.sh _debugfs_common.sh
@@ -13,6 +13,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh
TEST_PROGS += sysfs.sh
TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py
TEST_PROGS += damos_quota.py damos_quota_goal.py damos_apply_interval.py
+TEST_PROGS += damos_tried_regions.py damon_nr_regions.py
TEST_PROGS += reclaim.sh lru_sort.sh
# regression tests (reproducers of previously found bugs)
diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py
index 2bd44c32be1b..6e136dc3df19 100644
--- a/tools/testing/selftests/damon/_damon_sysfs.py
+++ b/tools/testing/selftests/damon/_damon_sysfs.py
@@ -175,16 +175,24 @@ class DamosStats:
self.sz_applied = sz_applied
self.qt_exceeds = qt_exceeds
+class DamosTriedRegion:
+    # Plain record of the four values kept for one tried region.
+    def __init__(self, start, end, nr_accesses, age):
+        self.start, self.end = start, end
+        self.nr_accesses, self.age = nr_accesses, age
+
class Damos:
action = None
access_pattern = None
quota = None
apply_interval_us = None
- # todo: Support watermarks, stats, tried_regions
+ # todo: Support watermarks, stats
idx = None
context = None
tried_bytes = None
stats = None
+ tried_regions = None
def __init__(self, action='stat', access_pattern=DamosAccessPattern(),
quota=DamosQuota(), apply_interval_us=0):
@@ -398,6 +406,35 @@ class Kdamond:
err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on')
return err
+    def stop(self):
+        # Write 'off' to this kdamond's 'state' file and hand back
+        # whatever write_file() reports.
+        state_file = os.path.join(self.sysfs_dir(), 'state')
+        return write_file(state_file, 'off')
+
+    def update_schemes_tried_regions(self):
+        # Write 'update_schemes_tried_regions' to the 'state' file, then
+        # rebuild scheme.tried_regions for every scheme of every context
+        # from the directories found under the scheme's 'tried_regions'
+        # directory.  Returns the first error reported by write_file() or
+        # read_file(), or None when everything was read.
+        err = write_file(os.path.join(self.sysfs_dir(), 'state'),
+                         'update_schemes_tried_regions')
+        if err is not None:
+            return err
+        for context in self.contexts:
+            for scheme in context.schemes:
+                tried_regions = []
+                tried_regions_dir = os.path.join(
+                        scheme.sysfs_dir(), 'tried_regions')
+                # Reuse the path computed above instead of joining it again.
+                for filename in os.listdir(tried_regions_dir):
+                    tried_region_dir = os.path.join(tried_regions_dir, filename)
+                    # Only directory entries are treated as regions.
+                    if not os.path.isdir(tried_region_dir):
+                        continue
+                    region_values = []
+                    # Order matches DamosTriedRegion's positional arguments.
+                    for f in ['start', 'end', 'nr_accesses', 'age']:
+                        content, err = read_file(
+                                os.path.join(tried_region_dir, f))
+                        if err is not None:
+                            return err
+                        region_values.append(int(content))
+                    tried_regions.append(DamosTriedRegion(*region_values))
+                scheme.tried_regions = tried_regions
+        # Explicit success value, as the other update_*/stop methods return.
+        return None
+
def update_schemes_tried_bytes(self):
err = write_file(os.path.join(self.sysfs_dir(), 'state'),
'update_schemes_tried_bytes')
@@ -444,6 +481,25 @@ class Kdamond:
goal.effective_bytes = int(content)
return None
+    def commit(self):
+        # Bring 'nr_contexts' in line with len(self.contexts) when it
+        # differs, stage every context, then write 'commit' to 'state'.
+        # The first error from any step is returned.
+        nr_contexts_file = os.path.join(self.sysfs_dir(),
+                                        'contexts', 'nr_contexts')
+        content, err = read_file(nr_contexts_file)
+        if err is not None:
+            return err
+        nr_wanted = len(self.contexts)
+        if int(content) != nr_wanted:
+            err = write_file(nr_contexts_file, '%d' % nr_wanted)
+            if err is not None:
+                return err
+
+        for context in self.contexts:
+            err = context.stage()
+            if err is not None:
+                return err
+        state_file = os.path.join(self.sysfs_dir(), 'state')
+        return write_file(state_file, 'commit')
+
+
def commit_schemes_quota_goals(self):
for context in self.contexts:
for scheme in context.schemes:
@@ -478,3 +534,10 @@ class Kdamonds:
if err is not None:
return err
return None
+
+ def stop(self):
+ for kdamond in self.kdamonds:
+ err = kdamond.stop()
+ if err is not None:
+ return err
+ return None
diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c
index 585a2fa54329..56b17e8fe1be 100644
--- a/tools/testing/selftests/damon/access_memory.c
+++ b/tools/testing/selftests/damon/access_memory.c
@@ -35,7 +35,7 @@ int main(int argc, char *argv[])
start_clock = clock();
while ((clock() - start_clock) * 1000 / CLOCKS_PER_SEC <
access_time_ms)
- memset(regions[i], i, 1024 * 1024 * 10);
+ memset(regions[i], i, sz_region);
}
return 0;
}
diff --git a/tools/testing/selftests/damon/access_memory_even.c b/tools/testing/selftests/damon/access_memory_even.c
new file mode 100644
index 000000000000..3be121487432
--- /dev/null
+++ b/tools/testing/selftests/damon/access_memory_even.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Artificial memory access program for testing DAMON.
+ *
+ * Receives number of regions and size of each region from user. Allocate the
+ * regions and repeatedly access even numbered (starting from zero) regions.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+/*
+ * Allocate <number> regions of <size> bytes each and memset() the
+ * even-numbered ones (starting from zero) in an endless loop.
+ *
+ * Returns -1 on wrong usage, non-positive arguments or allocation failure;
+ * otherwise it runs until the process is killed.
+ */
+int main(int argc, char *argv[])
+{
+	char **regions;
+	int nr_regions;
+	int sz_region;
+	int i;
+
+	if (argc != 3) {
+		printf("Usage: %s <number> <size (bytes)>\n", argv[0]);
+		return -1;
+	}
+
+	nr_regions = atoi(argv[1]);
+	sz_region = atoi(argv[2]);
+	/*
+	 * atoi() yields 0 for non-numeric input and may yield negative values;
+	 * neither makes a usable malloc()/memset() size.
+	 */
+	if (nr_regions <= 0 || sz_region <= 0) {
+		printf("Usage: %s <number> <size (bytes)>\n", argv[0]);
+		return -1;
+	}
+
+	regions = malloc(sizeof(*regions) * nr_regions);
+	if (!regions) {
+		perror("malloc");
+		return -1;
+	}
+	for (i = 0; i < nr_regions; i++) {
+		regions[i] = malloc(sz_region);
+		if (!regions[i]) {
+			perror("malloc");
+			return -1;
+		}
+	}
+
+	/* never leaves this loop; the caller terminates the process */
+	while (1) {
+		for (i = 0; i < nr_regions; i += 2)
+			memset(regions[i], i, sz_region);
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/damon/damon_nr_regions.py b/tools/testing/selftests/damon/damon_nr_regions.py
new file mode 100644
index 000000000000..2e8a74aff543
--- /dev/null
+++ b/tools/testing/selftests/damon/damon_nr_regions.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+import time
+
+import _damon_sysfs
+
+def test_nr_regions(real_nr_regions, min_nr_regions, max_nr_regions):
+ '''
+ Create process of the given 'real_nr_regions' regions, monitor it using
+ DAMON with given '{min,max}_nr_regions' monitoring parameter.
+
+ Exit with non-zero return code if the given {min,max}_nr_regions is not
+ kept.
+ '''
+ sz_region = 10 * 1024 * 1024
+ proc = subprocess.Popen(['./access_memory_even', '%d' % real_nr_regions,
+ '%d' % sz_region])
+
+ # stat every monitored regions
+ kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
+ contexts=[_damon_sysfs.DamonCtx(
+ monitoring_attrs=_damon_sysfs.DamonAttrs(
+ min_nr_regions=min_nr_regions,
+ max_nr_regions=max_nr_regions),
+ ops='vaddr',
+ targets=[_damon_sysfs.DamonTarget(pid=proc.pid)],
+ schemes=[_damon_sysfs.Damos(action='stat',
+ )] # schemes
+ )] # contexts
+ )]) # kdamonds
+
+ err = kdamonds.start()
+ if err is not None:
+ proc.terminate()
+ print('kdamond start failed: %s' % err)
+ exit(1)
+
+ collected_nr_regions = []
+ while proc.poll() is None:
+ time.sleep(0.1)
+ err = kdamonds.kdamonds[0].update_schemes_tried_regions()
+ if err is not None:
+ proc.terminate()
+ print('tried regions update failed: %s' % err)
+ exit(1)
+
+ scheme = kdamonds.kdamonds[0].contexts[0].schemes[0]
+ if scheme.tried_regions is None:
+ proc.terminate()
+ print('tried regions is not collected')
+ exit(1)
+
+ nr_tried_regions = len(scheme.tried_regions)
+ if nr_tried_regions <= 0:
+ proc.terminate()
+ print('tried regions is not created')
+ exit(1)
+ collected_nr_regions.append(nr_tried_regions)
+ if len(collected_nr_regions) > 10:
+ break
+ proc.terminate()
+ kdamonds.stop()
+
+ test_name = 'nr_regions test with %d/%d/%d real/min/max nr_regions' % (
+ real_nr_regions, min_nr_regions, max_nr_regions)
+ if (collected_nr_regions[0] < min_nr_regions or
+ collected_nr_regions[-1] > max_nr_regions):
+ print('fail %s' % test_name)
+ print('number of regions that collected are:')
+ for nr in collected_nr_regions:
+ print(nr)
+ exit(1)
+ print('pass %s ' % test_name)
+
+def main():
+ # test min_nr_regions larger than real nr regions
+ test_nr_regions(10, 20, 100)
+
+ # test max_nr_regions smaller than real nr regions
+ test_nr_regions(15, 3, 10)
+
+ # test online-tuned max_nr_regions that smaller than real nr regions
+ sz_region = 10 * 1024 * 1024
+ proc = subprocess.Popen(['./access_memory_even', '14', '%d' % sz_region])
+
+ # stat every monitored regions
+ kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
+ contexts=[_damon_sysfs.DamonCtx(
+ monitoring_attrs=_damon_sysfs.DamonAttrs(
+ min_nr_regions=10, max_nr_regions=1000),
+ ops='vaddr',
+ targets=[_damon_sysfs.DamonTarget(pid=proc.pid)],
+ schemes=[_damon_sysfs.Damos(action='stat',
+ )] # schemes
+ )] # contexts
+ )]) # kdamonds
+
+ err = kdamonds.start()
+ if err is not None:
+ proc.terminate()
+ print('kdamond start failed: %s' % err)
+ exit(1)
+
+ # wait until the real regions are found
+ time.sleep(3)
+
+ attrs = kdamonds.kdamonds[0].contexts[0].monitoring_attrs
+ attrs.min_nr_regions = 3
+ attrs.max_nr_regions = 7
+ err = kdamonds.kdamonds[0].commit()
+ if err is not None:
+ proc.terminate()
+ print('commit failed: %s' % err)
+ exit(1)
+ # wait for next merge operation is executed
+ time.sleep(0.3)
+
+ err = kdamonds.kdamonds[0].update_schemes_tried_regions()
+ if err is not None:
+ proc.terminate()
+ print('tried regions update failed: %s' % err)
+ exit(1)
+
+ scheme = kdamonds.kdamonds[0].contexts[0].schemes[0]
+ if scheme.tried_regions is None:
+ proc.terminate()
+ print('tried regions is not collected')
+ exit(1)
+
+ nr_tried_regions = len(scheme.tried_regions)
+ if nr_tried_regions <= 0:
+ proc.terminate()
+ print('tried regions is not created')
+ exit(1)
+ proc.terminate()
+
+ if nr_tried_regions > 7:
+ print('fail online-tuned max_nr_regions: %d > 7' % nr_tried_regions)
+ exit(1)
+ print('pass online-tuned max_nr_regions')
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/testing/selftests/damon/damos_tried_regions.py b/tools/testing/selftests/damon/damos_tried_regions.py
new file mode 100644
index 000000000000..3b347eb28bd2
--- /dev/null
+++ b/tools/testing/selftests/damon/damos_tried_regions.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+import time
+
+import _damon_sysfs
+
+def main():
+ # repeatedly access even-numbered ones in 14 regions of 10 MiB size
+ sz_region = 10 * 1024 * 1024
+ proc = subprocess.Popen(['./access_memory_even', '14', '%d' % sz_region])
+
+ # stat every monitored regions
+ kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
+ contexts=[_damon_sysfs.DamonCtx(
+ ops='vaddr',
+ targets=[_damon_sysfs.DamonTarget(pid=proc.pid)],
+ schemes=[_damon_sysfs.Damos(action='stat',
+ )] # schemes
+ )] # contexts
+ )]) # kdamonds
+
+ err = kdamonds.start()
+ if err is not None:
+ proc.terminate()
+ print('kdamond start failed: %s' % err)
+ exit(1)
+
+ collected_nr_regions = []
+ while proc.poll() is None:
+ time.sleep(0.1)
+ err = kdamonds.kdamonds[0].update_schemes_tried_regions()
+ if err is not None:
+ proc.terminate()
+ print('tried regions update failed: %s' % err)
+ exit(1)
+
+ scheme = kdamonds.kdamonds[0].contexts[0].schemes[0]
+ if scheme.tried_regions is None:
+ proc.terminate()
+ print('tried regions is not collected')
+ exit(1)
+
+ nr_tried_regions = len(scheme.tried_regions)
+ if nr_tried_regions <= 0:
+ proc.terminate()
+ print('tried regions is not created')
+ exit(1)
+ collected_nr_regions.append(nr_tried_regions)
+ if len(collected_nr_regions) > 10:
+ break
+ proc.terminate()
+
+ collected_nr_regions.sort()
+ sample = collected_nr_regions[4]
+ print('50-th percentile nr_regions: %d' % sample)
+ print('expectation (>= 14) is %s' % 'met' if sample >= 14 else 'not met')
+ if collected_nr_regions[4] < 14:
+ print('full nr_regions:')
+ print('\n'.join(collected_nr_regions))
+ exit(1)
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
index 5f541522364f..5d0a809dc2df 100644
--- a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
+++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
@@ -29,9 +29,11 @@ static int check_vgem(int fd)
version.name = name;
ret = ioctl(fd, DRM_IOCTL_VERSION, &version);
- if (ret)
+ if (ret || version.name_len != 4)
return 0;
+ name[4] = '\0';
+
return !strcmp(name, "vgem");
}
diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c
index c812080e304e..6062723a172e 100644
--- a/tools/testing/selftests/drivers/dma-buf/udmabuf.c
+++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c
@@ -9,52 +9,162 @@
#include <errno.h>
#include <fcntl.h>
#include <malloc.h>
+#include <stdbool.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
+#include <sys/mman.h>
#include <linux/memfd.h>
#include <linux/udmabuf.h>
+#include "../../kselftest.h"
#define TEST_PREFIX "drivers/dma-buf/udmabuf"
#define NUM_PAGES 4
+#define NUM_ENTRIES 4
+#define MEMFD_SIZE 1024 /* in pages */
-static int memfd_create(const char *name, unsigned int flags)
+static unsigned int page_size;
+
+/*
+ * Create a memfd named "udmabuf-test" that allows sealing (MFD_HUGETLB is
+ * added when hpage is true), add F_SEAL_SHRINK and resize it to size bytes.
+ *
+ * Exits with KSFT_SKIP if the memfd cannot be created or sealed, and with
+ * KSFT_FAIL if ftruncate() fails.  Returns the memfd otherwise.
+ */
+static int create_memfd_with_seals(off64_t size, bool hpage)
+{
+	unsigned int flags = hpage ? (MFD_ALLOW_SEALING | MFD_HUGETLB) :
+				     MFD_ALLOW_SEALING;
+	int memfd;
+
+	memfd = memfd_create("udmabuf-test", flags);
+	if (memfd < 0) {
+		ksft_print_msg("%s: [skip,no-memfd]\n", TEST_PREFIX);
+		exit(KSFT_SKIP);
+	}
+
+	if (fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK) < 0) {
+		ksft_print_msg("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
+		exit(KSFT_SKIP);
+	}
+
+	if (ftruncate(memfd, size) == -1) {
+		ksft_print_msg("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
+		exit(KSFT_FAIL);
+	}
+
+	return memfd;
+}
+
+/*
+ * Issue UDMABUF_CREATE_LIST on devfd with NUM_ENTRIES items taken from memfd.
+ * Item i starts at i * (memfd_size / NUM_ENTRIES) and spans NUM_PAGES pages
+ * of getpagesize() bytes.
+ *
+ * Exits with KSFT_FAIL if the request cannot be allocated or the ioctl
+ * fails.  Returns the file descriptor produced by the ioctl otherwise.
+ */
+static int create_udmabuf_list(int devfd, int memfd, off64_t memfd_size)
+{
+	const off64_t stride = memfd_size / NUM_ENTRIES;
+	struct udmabuf_create_list *req;
+	int idx, dmabuf_fd;
+
+	req = malloc(sizeof(struct udmabuf_create_list) +
+		     sizeof(struct udmabuf_create_item) * NUM_ENTRIES);
+	if (req == NULL) {
+		ksft_print_msg("%s: [FAIL, udmabuf-malloc]\n", TEST_PREFIX);
+		exit(KSFT_FAIL);
+	}
+
+	req->count = NUM_ENTRIES;
+	req->flags = UDMABUF_FLAGS_CLOEXEC;
+	for (idx = 0; idx < NUM_ENTRIES; idx++) {
+		struct udmabuf_create_item *item = &req->list[idx];
+
+		item->memfd = memfd;
+		item->offset = idx * stride;
+		item->size = getpagesize() * NUM_PAGES;
+	}
+
+	dmabuf_fd = ioctl(devfd, UDMABUF_CREATE_LIST, req);
+	/* the request is not needed once the ioctl has returned */
+	free(req);
+	if (dmabuf_fd < 0) {
+		ksft_print_msg("%s: [FAIL, udmabuf-create]\n", TEST_PREFIX);
+		exit(KSFT_FAIL);
+	}
+
+	return dmabuf_fd;
+}
+
+/*
+ * Store chr into the first byte of every page_size-sized unit of the
+ * size-byte area starting at addr.
+ */
+static void write_to_memfd(void *addr, off64_t size, char chr)
+{
+	char *base = addr;
+	int page = 0;
+
+	while (page < size / page_size) {
+		base[page * page_size] = chr;
+		page++;
+	}
+}
+
+/*
+ * Map the first size bytes of fd as a shared, readable and writable mapping.
+ *
+ * Returns the mapped address; exits with KSFT_FAIL if mmap() fails.
+ */
+static void *mmap_fd(int fd, off64_t size)
+{
+	void *mapping = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED,
+			     fd, 0);
+
+	if (mapping != MAP_FAILED)
+		return mapping;
+
+	ksft_print_msg("%s: ubuf_fd mmap fail\n", TEST_PREFIX);
+	exit(KSFT_FAIL);
+}
+
+static int compare_chunks(void *addr1, void *addr2, off64_t memfd_size)
{
- return syscall(__NR_memfd_create, name, flags);
+ off64_t off;
+ int i = 0, j, k = 0, ret = 0;
+ char char1, char2;
+
+ while (i < NUM_ENTRIES) {
+ off = i * (memfd_size / NUM_ENTRIES);
+ for (j = 0; j < NUM_PAGES; j++, k++) {
+ char1 = *((char *)addr1 + off + (j * getpagesize()));
+ char2 = *((char *)addr2 + (k * getpagesize()));
+ if (char1 != char2) {
+ ret = -1;
+ goto err;
+ }
+ }
+ i++;
+ }
+err:
+ munmap(addr1, memfd_size);
+ munmap(addr2, NUM_ENTRIES * NUM_PAGES * getpagesize());
+ return ret;
}
int main(int argc, char *argv[])
{
struct udmabuf_create create;
int devfd, memfd, buf, ret;
- off_t size;
- void *mem;
+ off64_t size;
+ void *addr1, *addr2;
+
+ ksft_print_header();
+ ksft_set_plan(6);
devfd = open("/dev/udmabuf", O_RDWR);
if (devfd < 0) {
- printf("%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
- TEST_PREFIX);
- exit(77);
+ ksft_print_msg(
+ "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
+ TEST_PREFIX);
+ exit(KSFT_SKIP);
}
memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
if (memfd < 0) {
- printf("%s: [skip,no-memfd]\n", TEST_PREFIX);
- exit(77);
+ ksft_print_msg("%s: [skip,no-memfd]\n", TEST_PREFIX);
+ exit(KSFT_SKIP);
}
ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);
if (ret < 0) {
- printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
- exit(77);
+ ksft_print_msg("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
+ exit(KSFT_SKIP);
}
-
size = getpagesize() * NUM_PAGES;
ret = ftruncate(memfd, size);
if (ret == -1) {
- printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
- exit(1);
+ ksft_print_msg("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
+ exit(KSFT_FAIL);
}
memset(&create, 0, sizeof(create));
@@ -64,44 +174,86 @@ int main(int argc, char *argv[])
create.offset = getpagesize()/2;
create.size = getpagesize();
buf = ioctl(devfd, UDMABUF_CREATE, &create);
- if (buf >= 0) {
- printf("%s: [FAIL,test-1]\n", TEST_PREFIX);
- exit(1);
- }
+ if (buf >= 0)
+ ksft_test_result_fail("%s: [FAIL,test-1]\n", TEST_PREFIX);
+ else
+ ksft_test_result_pass("%s: [PASS,test-1]\n", TEST_PREFIX);
/* should fail (size not multiple of page) */
create.memfd = memfd;
create.offset = 0;
create.size = getpagesize()/2;
buf = ioctl(devfd, UDMABUF_CREATE, &create);
- if (buf >= 0) {
- printf("%s: [FAIL,test-2]\n", TEST_PREFIX);
- exit(1);
- }
+ if (buf >= 0)
+ ksft_test_result_fail("%s: [FAIL,test-2]\n", TEST_PREFIX);
+ else
+ ksft_test_result_pass("%s: [PASS,test-2]\n", TEST_PREFIX);
/* should fail (not memfd) */
create.memfd = 0; /* stdin */
create.offset = 0;
create.size = size;
buf = ioctl(devfd, UDMABUF_CREATE, &create);
- if (buf >= 0) {
- printf("%s: [FAIL,test-3]\n", TEST_PREFIX);
- exit(1);
- }
+ if (buf >= 0)
+ ksft_test_result_fail("%s: [FAIL,test-3]\n", TEST_PREFIX);
+ else
+ ksft_test_result_pass("%s: [PASS,test-3]\n", TEST_PREFIX);
/* should work */
+ page_size = getpagesize();
+ addr1 = mmap_fd(memfd, size);
+ write_to_memfd(addr1, size, 'a');
create.memfd = memfd;
create.offset = 0;
create.size = size;
buf = ioctl(devfd, UDMABUF_CREATE, &create);
- if (buf < 0) {
- printf("%s: [FAIL,test-4]\n", TEST_PREFIX);
- exit(1);
- }
+ if (buf < 0)
+ ksft_test_result_fail("%s: [FAIL,test-4]\n", TEST_PREFIX);
+ else
+ ksft_test_result_pass("%s: [PASS,test-4]\n", TEST_PREFIX);
+
+ munmap(addr1, size);
+ close(buf);
+ close(memfd);
+
+ /* should work (migration of 4k size pages)*/
+ size = MEMFD_SIZE * page_size;
+ memfd = create_memfd_with_seals(size, false);
+ addr1 = mmap_fd(memfd, size);
+ write_to_memfd(addr1, size, 'a');
+ buf = create_udmabuf_list(devfd, memfd, size);
+ addr2 = mmap_fd(buf, NUM_PAGES * NUM_ENTRIES * getpagesize());
+ write_to_memfd(addr1, size, 'b');
+ ret = compare_chunks(addr1, addr2, size);
+ if (ret < 0)
+ ksft_test_result_fail("%s: [FAIL,test-5]\n", TEST_PREFIX);
+ else
+ ksft_test_result_pass("%s: [PASS,test-5]\n", TEST_PREFIX);
+
+ close(buf);
+ close(memfd);
+
+ /* should work (migration of 2MB size huge pages)*/
+ page_size = getpagesize() * 512; /* 2 MB */
+ size = MEMFD_SIZE * page_size;
+ memfd = create_memfd_with_seals(size, true);
+ addr1 = mmap_fd(memfd, size);
+ write_to_memfd(addr1, size, 'a');
+ buf = create_udmabuf_list(devfd, memfd, size);
+ addr2 = mmap_fd(buf, NUM_PAGES * NUM_ENTRIES * getpagesize());
+ write_to_memfd(addr1, size, 'b');
+ ret = compare_chunks(addr1, addr2, size);
+ if (ret < 0)
+ ksft_test_result_fail("%s: [FAIL,test-6]\n", TEST_PREFIX);
+ else
+ ksft_test_result_pass("%s: [PASS,test-6]\n", TEST_PREFIX);
- fprintf(stderr, "%s: ok\n", TEST_PREFIX);
close(buf);
close(memfd);
close(devfd);
+
+ ksft_print_msg("%s: ok\n", TEST_PREFIX);
+ ksft_print_cnts();
+
return 0;
}
diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index 931dbc36ca43..011508ca604b 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -19,6 +19,15 @@ def _rss_key_rand(length):
return [random.randint(0, 255) for _ in range(length)]
+def _rss_key_check(cfg, data=None, context=0):
+ if data is None:
+ data = get_rss(cfg, context=context)
+ if 'rss-hash-key' not in data:
+ return
+ non_zero = [x for x in data['rss-hash-key'] if x != 0]
+ ksft_eq(bool(non_zero), True, comment=f"RSS key is all zero {data['rss-hash-key']}")
+
+
def get_rss(cfg, context=0):
return ethtool(f"-x {cfg.ifname} context {context}", json=True)[0]
@@ -90,8 +99,9 @@ def _send_traffic_check(cfg, port, name, params):
def test_rss_key_indir(cfg):
"""Test basics like updating the main RSS key and indirection table."""
- if len(_get_rx_cnts(cfg)) < 2:
- KsftSkipEx("Device has only one queue (or doesn't support queue stats)")
+ qcnt = len(_get_rx_cnts(cfg))
+ if qcnt < 3:
+ KsftSkipEx("Device has fewer than 3 queues (or doesn't support queue stats)")
data = get_rss(cfg)
want_keys = ['rss-hash-key', 'rss-hash-function', 'rss-indirection-table']
@@ -101,6 +111,7 @@ def test_rss_key_indir(cfg):
if not data[k]:
raise KsftFailEx(f"ethtool results empty for '{k}': {data[k]}")
+ _rss_key_check(cfg, data=data)
key_len = len(data['rss-hash-key'])
# Set the key
@@ -110,9 +121,26 @@ def test_rss_key_indir(cfg):
data = get_rss(cfg)
ksft_eq(key, data['rss-hash-key'])
+ # Set the indirection table and the key together
+ key = _rss_key_rand(key_len)
+ ethtool(f"-X {cfg.ifname} equal 3 hkey " + _rss_key_str(key))
+ reset_indir = defer(ethtool, f"-X {cfg.ifname} default")
+
+ data = get_rss(cfg)
+ _rss_key_check(cfg, data=data)
+ ksft_eq(0, min(data['rss-indirection-table']))
+ ksft_eq(2, max(data['rss-indirection-table']))
+
+ # Reset indirection table and set the key
+ key = _rss_key_rand(key_len)
+ ethtool(f"-X {cfg.ifname} default hkey " + _rss_key_str(key))
+ data = get_rss(cfg)
+ _rss_key_check(cfg, data=data)
+ ksft_eq(0, min(data['rss-indirection-table']))
+ ksft_eq(qcnt - 1, max(data['rss-indirection-table']))
+
# Set the indirection table
ethtool(f"-X {cfg.ifname} equal 2")
- reset_indir = defer(ethtool, f"-X {cfg.ifname} default")
data = get_rss(cfg)
ksft_eq(0, min(data['rss-indirection-table']))
ksft_eq(1, max(data['rss-indirection-table']))
@@ -317,8 +345,11 @@ def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None):
ctx_cnt = i
break
+ _rss_key_check(cfg, context=ctx_id)
+
if not create_with_cfg:
ethtool(f"-X {cfg.ifname} context {ctx_id} {want_cfg}")
+ _rss_key_check(cfg, context=ctx_id)
# Sanity check the context we just created
data = get_rss(cfg, ctx_id)
diff --git a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh
index 877cd6df94a1..fe905a7f34b3 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/ethtool_lanes.sh
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: GPL-2.0
lib_dir=$(dirname $0)/../../../net/forwarding
+ethtool_lib_dir=$(dirname $0)/../hw
ALL_TESTS="
autoneg
@@ -11,7 +12,7 @@ ALL_TESTS="
NUM_NETIFS=2
: ${TIMEOUT:=30000} # ms
source $lib_dir/lib.sh
-source $lib_dir/ethtool_lib.sh
+source $ethtool_lib_dir/ethtool_lib.sh
setup_prepare()
{
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
index ab67d58cfab7..ba012bc5aab9 100644
--- a/tools/testing/selftests/exec/Makefile
+++ b/tools/testing/selftests/exec/Makefile
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS = -Wall
CFLAGS += -Wno-nonnull
-CFLAGS += -D_GNU_SOURCE
ALIGNS := 0x1000 0x200000 0x1000000
ALIGN_PIES := $(patsubst %,load_address.%,$(ALIGNS))
diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
index f142a137526c..85acb4e3ef00 100644
--- a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
+++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c
@@ -13,6 +13,8 @@
#include <sys/eventfd.h>
#include "../../kselftest_harness.h"
+#define EVENTFD_TEST_ITERATIONS 100000UL
+
struct error {
int code;
char msg[512];
@@ -40,7 +42,7 @@ static inline int sys_eventfd2(unsigned int count, int flags)
return syscall(__NR_eventfd2, count, flags);
}
-TEST(eventfd01)
+TEST(eventfd_check_flag_rdwr)
{
int fd, flags;
@@ -54,7 +56,7 @@ TEST(eventfd01)
close(fd);
}
-TEST(eventfd02)
+TEST(eventfd_check_flag_cloexec)
{
int fd, flags;
@@ -68,7 +70,7 @@ TEST(eventfd02)
close(fd);
}
-TEST(eventfd03)
+TEST(eventfd_check_flag_nonblock)
{
int fd, flags;
@@ -83,7 +85,7 @@ TEST(eventfd03)
close(fd);
}
-TEST(eventfd04)
+TEST(eventfd_chek_flag_cloexec_and_nonblock)
{
int fd, flags;
@@ -161,7 +163,7 @@ static int verify_fdinfo(int fd, struct error *err, const char *prefix,
return 0;
}
-TEST(eventfd05)
+TEST(eventfd_check_flag_semaphore)
{
struct error err = {0};
int fd, ret;
@@ -183,4 +185,128 @@ TEST(eventfd05)
close(fd);
}
+/*
+ * A write(2) fails with the error EINVAL if the size of the supplied buffer
+ * is less than 8 bytes, or if an attempt is made to write the value
+ * 0xffffffffffffffff.
+ */
+TEST(eventfd_check_write)
+{
+ uint64_t value = 1;
+ ssize_t size;
+ int fd;
+
+ /* counter starts at 0, no flags */
+ fd = sys_eventfd2(0, 0);
+ ASSERT_GE(fd, 0);
+
+ /* write of only sizeof(int) bytes: expected to fail with EINVAL */
+ size = write(fd, &value, sizeof(int));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ /* write of the whole uint64_t value 1: expected to succeed */
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+
+ /* write of the all-ones value: expected to fail with EINVAL */
+ value = (uint64_t)-1;
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ close(fd);
+}
+
+/*
+ * A read(2) fails with the error EINVAL if the size of the supplied buffer is
+ * less than 8 bytes.
+ */
+TEST(eventfd_check_read)
+{
+ uint64_t value;
+ ssize_t size;
+ int fd;
+
+ /* counter starts at 1, no flags */
+ fd = sys_eventfd2(1, 0);
+ ASSERT_GE(fd, 0);
+
+ /* read of only sizeof(int) bytes: expected to fail with EINVAL */
+ size = read(fd, &value, sizeof(int));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ /* full-size read: expected to return the initial count of 1 */
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ EXPECT_EQ(value, 1);
+
+ close(fd);
+}
+
+
+/*
+ * If EFD_SEMAPHORE was not specified and the eventfd counter has a nonzero
+ * value, then a read(2) returns 8 bytes containing that value, and the
+ * counter's value is reset to zero.
+ * If the eventfd counter is zero at the time of the call to read(2), then the
+ * call fails with the error EAGAIN if the file descriptor has been made nonblocking.
+ */
+TEST(eventfd_check_read_with_nonsemaphore)
+{
+ uint64_t value;
+ ssize_t size;
+ int fd;
+ int i;
+
+ /* non-blocking so that the final read fails instead of blocking */
+ fd = sys_eventfd2(0, EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ /* write the value 1 EVENTFD_TEST_ITERATIONS times */
+ value = 1;
+ for (i = 0; i < EVENTFD_TEST_ITERATIONS; i++) {
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ }
+
+ /* one read is expected to return the accumulated count */
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(uint64_t));
+ EXPECT_EQ(value, EVENTFD_TEST_ITERATIONS);
+
+ /* a second read is expected to fail with EAGAIN */
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EAGAIN);
+
+ close(fd);
+}
+
+/*
+ * If EFD_SEMAPHORE was specified and the eventfd counter has a nonzero value,
+ * then a read(2) returns 8 bytes containing the value 1, and the counter's
+ * value is decremented by 1.
+ * If the eventfd counter is zero at the time of the call to read(2), then the
+ * call fails with the error EAGAIN if the file descriptor has been made nonblocking.
+ */
+TEST(eventfd_check_read_with_semaphore)
+{
+ uint64_t value;
+ ssize_t size;
+ int fd;
+ int i;
+
+ /* semaphore mode; non-blocking so that the final read fails instead of blocking */
+ fd = sys_eventfd2(0, EFD_SEMAPHORE|EFD_NONBLOCK);
+ ASSERT_GE(fd, 0);
+
+ /* write the value 1 EVENTFD_TEST_ITERATIONS times */
+ value = 1;
+ for (i = 0; i < EVENTFD_TEST_ITERATIONS; i++) {
+ size = write(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ }
+
+ /* each of the same number of reads is expected to return 1 */
+ for (i = 0; i < EVENTFD_TEST_ITERATIONS; i++) {
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, sizeof(value));
+ EXPECT_EQ(value, 1);
+ }
+
+ /* one more read is expected to fail with EAGAIN */
+ size = read(fd, &value, sizeof(value));
+ EXPECT_EQ(size, -1);
+ EXPECT_EQ(errno, EAGAIN);
+
+ close(fd);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
index 994fa3468f17..f79f9bac7918 100644
--- a/tools/testing/selftests/futex/functional/Makefile
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
INCLUDES := -I../include -I../../ $(KHDR_INCLUDES)
-CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE= -pthread $(INCLUDES) $(KHDR_INCLUDES)
+CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES)
LDLIBS := -lpthread -lrt
LOCAL_HDRS := \
diff --git a/tools/testing/selftests/hid/hid_bpf.c b/tools/testing/selftests/hid/hid_bpf.c
index dc0408a831d0..75b7b4ef6cfa 100644
--- a/tools/testing/selftests/hid/hid_bpf.c
+++ b/tools/testing/selftests/hid/hid_bpf.c
@@ -532,6 +532,7 @@ static void load_programs(const struct test_program programs[],
FIXTURE_DATA(hid_bpf) * self,
const FIXTURE_VARIANT(hid_bpf) * variant)
{
+ struct bpf_map *iter_map;
int err = -EINVAL;
ASSERT_LE(progs_count, ARRAY_SIZE(self->hid_links))
@@ -564,6 +565,13 @@ static void load_programs(const struct test_program programs[],
*ops_hid_id = self->hid_id;
}
+ /* we disable the auto-attach feature of all maps because we
+ * only want the tested one to be manually attached in the next
+ * call to bpf_map__attach_struct_ops()
+ */
+ bpf_object__for_each_map(iter_map, *self->skel->skeleton->obj)
+ bpf_map__set_autoattach(iter_map, false);
+
err = hid__load(self->skel);
ASSERT_OK(err) TH_LOG("hid_skel_load failed: %d", err);
@@ -687,6 +695,24 @@ TEST_F(hid_bpf, subprog_raw_event)
}
/*
+ * Attach hid_first_event to the given uhid device,
+ * attempt at re-attaching it, we should not lock and
+ * return an invalid struct bpf_link
+ */
+TEST_F(hid_bpf, multiple_attach)
+{
+ const struct test_program progs[] = {
+ { .name = "hid_first_event" },
+ };
+ struct bpf_link *link;
+
+ /* NOTE(review): presumably this already attaches hid_first_event - confirm in load_programs() */
+ LOAD_PROGRAMS(progs);
+
+ /* explicit attach of the first_event struct_ops map; a NULL link is expected */
+ link = bpf_map__attach_struct_ops(self->skel->maps.first_event);
+ ASSERT_NULL(link) TH_LOG("unexpected return value when re-attaching the struct_ops");
+}
+
+/*
* Ensures that we can attach/detach programs
*/
TEST_F(hid_bpf, test_attach_detach)
diff --git a/tools/testing/selftests/hid/progs/hid.c b/tools/testing/selftests/hid/progs/hid.c
index ee9bbbcf751b..5ecc845ef792 100644
--- a/tools/testing/selftests/hid/progs/hid.c
+++ b/tools/testing/selftests/hid/progs/hid.c
@@ -455,7 +455,7 @@ struct {
__type(value, struct elem);
} hmap SEC(".maps");
-static int wq_cb_sleepable(void *map, int *key, struct bpf_wq *work)
+static int wq_cb_sleepable(void *map, int *key, void *work)
{
__u8 buf[9] = {2, 3, 4, 5, 6, 7, 8, 9, 10};
struct hid_bpf_ctx *hid_ctx;
diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
index cfe37f491906..e5db897586bb 100644
--- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
+++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
@@ -114,7 +114,7 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx,
extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
- int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
+ int (callback_fn)(void *map, int *key, void *wq),
unsigned int flags__k, void *aux__ign) __ksym;
#define bpf_wq_set_callback(timer, cb, flags) \
bpf_wq_set_callback_impl(timer, cb, flags, NULL)
diff --git a/tools/testing/selftests/intel_pstate/Makefile b/tools/testing/selftests/intel_pstate/Makefile
index 05d66ef50c97..f45372cb00fe 100644
--- a/tools/testing/selftests/intel_pstate/Makefile
+++ b/tools/testing/selftests/intel_pstate/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS := $(CFLAGS) -Wall -D_GNU_SOURCE
+CFLAGS := $(CFLAGS) -Wall
LDLIBS += -lm
ARCH ?= $(shell uname -m 2>/dev/null || echo not)
diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile
index 32c5fdfd0eef..fd6477911f24 100644
--- a/tools/testing/selftests/iommu/Makefile
+++ b/tools/testing/selftests/iommu/Makefile
@@ -2,8 +2,6 @@
CFLAGS += -Wall -O2 -Wno-unused-function
CFLAGS += $(KHDR_INCLUDES)
-CFLAGS += -D_GNU_SOURCE
-
TEST_GEN_PROGS :=
TEST_GEN_PROGS += iommufd
TEST_GEN_PROGS += iommufd_fail_nth
diff --git a/tools/testing/selftests/kselftest/ksft.py b/tools/testing/selftests/kselftest/ksft.py
index cd89fb2bc10e..bf215790a89d 100644
--- a/tools/testing/selftests/kselftest/ksft.py
+++ b/tools/testing/selftests/kselftest/ksft.py
@@ -70,7 +70,7 @@ def test_result(condition, description=""):
def finished():
- if ksft_cnt["pass"] == ksft_num_tests:
+ if ksft_cnt["pass"] + ksft_cnt["skip"] == ksft_num_tests:
exit_code = KSFT_PASS
else:
exit_code = KSFT_FAIL
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index ac280dcba996..48d32c5aa3eb 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -112,6 +112,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
+TEST_GEN_PROGS_x86_64 += x86_64/apic_bus_clock_test
TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test
TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test
TEST_GEN_PROGS_x86_64 += x86_64/xcr0_cpuid_test
@@ -145,6 +146,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
TEST_GEN_PROGS_x86_64 += steal_time
TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
TEST_GEN_PROGS_x86_64 += system_counter_offset_test
+TEST_GEN_PROGS_x86_64 += pre_fault_memory_test
# Compiled outputs used by test targets
TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test
@@ -231,7 +233,7 @@ LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
endif
CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
- -D_GNU_SOURCE -fno-builtin-memcmp -fno-builtin-memcpy \
+ -fno-builtin-memcmp -fno-builtin-memcpy \
-fno-builtin-memset -fno-builtin-strnlen \
-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
index 709d7d721760..4abebde78187 100644
--- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
+++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
@@ -32,13 +32,13 @@ static struct feature_id_reg feat_id_regs[] = {
{
ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */
ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */
- 4,
+ 8,
1
},
{
ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */
ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */
- 4,
+ 8,
1
}
};
diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
index a7de39fa2a0a..d20981663831 100644
--- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c
@@ -219,6 +219,7 @@ static void guest_code(void)
GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1);
GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1);
GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1);
+ GUEST_REG_SYNC(SYS_CTR_EL0);
GUEST_DONE();
}
@@ -490,11 +491,25 @@ static void test_clidr(struct kvm_vcpu *vcpu)
test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr;
}
+/*
+ * Read CTR_EL0 of the vCPU, clear the bits in CTR_EL0_DIC_MASK, decrement
+ * the value by one if any bit in CTR_EL0_IminLine_MASK is set, write the
+ * result back and record it in test_reg_vals[].
+ */
+static void test_ctr(struct kvm_vcpu *vcpu)
+{
+	u64 val;
+
+	vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), &val);
+
+	val &= ~CTR_EL0_DIC_MASK;
+	if ((val & CTR_EL0_IminLine_MASK) != 0)
+		val -= 1;
+
+	vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CTR_EL0), val);
+	test_reg_vals[encoding_to_range_idx(SYS_CTR_EL0)] = val;
+}
+
static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu)
{
u64 val;
test_clidr(vcpu);
+ test_ctr(vcpu);
vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val);
val++;
@@ -524,7 +539,9 @@ static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu)
for (int i = 0; i < ARRAY_SIZE(test_regs); i++)
test_assert_id_reg_unchanged(vcpu, test_regs[i].reg);
+ test_assert_id_reg_unchanged(vcpu, SYS_MPIDR_EL1);
test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1);
+ test_assert_id_reg_unchanged(vcpu, SYS_CTR_EL0);
ksft_test_result_pass("%s\n", __func__);
}
diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h
index bed316fdecd5..0f268b55fa06 100644
--- a/tools/testing/selftests/kvm/include/x86_64/apic.h
+++ b/tools/testing/selftests/kvm/include/x86_64/apic.h
@@ -60,6 +60,14 @@
#define APIC_VECTOR_MASK 0x000FF
#define APIC_ICR2 0x310
#define SET_APIC_DEST_FIELD(x) ((x) << 24)
+#define APIC_LVTT 0x320
+#define APIC_LVT_TIMER_ONESHOT (0 << 17)
+#define APIC_LVT_TIMER_PERIODIC (1 << 17)
+#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
+#define APIC_LVT_MASKED (1 << 16)
+#define APIC_TMICT 0x380
+#define APIC_TMCCT 0x390
+#define APIC_TDCR 0x3E0
void apic_disable(void);
void xapic_enable(void);
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index c0c7c1fe93f9..a0c1440017bb 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -23,6 +23,7 @@
extern bool host_cpu_is_intel;
extern bool host_cpu_is_amd;
+extern uint64_t guest_tsc_khz;
/* Forced emulation prefix, used to invoke the emulator unconditionally. */
#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
@@ -816,6 +817,23 @@ static inline void cpu_relax(void)
asm volatile("rep; nop" ::: "memory");
}
+static inline void udelay(unsigned long usec)
+{
+ uint64_t start, now, cycles;
+
+ GUEST_ASSERT(guest_tsc_khz);
+ cycles = guest_tsc_khz / 1000 * usec;
+
+ /*
+ * Deliberately don't PAUSE, a.k.a. cpu_relax(), so that the delay is
+ * as accurate as possible, e.g. doesn't trigger PAUSE-Loop VM-Exits.
+ */
+ start = rdtsc();
+ do {
+ now = rdtsc();
+ } while (now - start < cycles);
+}
+
#define ud2() \
__asm__ __volatile__( \
"ud2\n" \
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index ad00e4761886..56b170b725b3 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -21,6 +21,7 @@
uint32_t guest_random_seed;
struct guest_random_state guest_rng;
+static uint32_t last_guest_seed;
static int vcpu_mmap_sz(void);
@@ -434,7 +435,10 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
slot0 = memslot2region(vm, 0);
ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
- pr_info("Random seed: 0x%x\n", guest_random_seed);
+ if (guest_random_seed != last_guest_seed) {
+ pr_info("Random seed: 0x%x\n", guest_random_seed);
+ last_guest_seed = guest_random_seed;
+ }
guest_rng = new_guest_random_state(guest_random_seed);
sync_global_to_guest(vm, guest_rng);
@@ -2319,7 +2323,8 @@ void __attribute((constructor)) kvm_selftest_init(void)
/* Tell stdout not to buffer its content. */
setbuf(stdout, NULL);
- guest_random_seed = random();
+ guest_random_seed = last_guest_seed = random();
+ pr_info("Random seed: 0x%x\n", guest_random_seed);
kvm_selftest_arch_init();
}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 594b061aef52..153739f2e201 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -25,6 +25,7 @@ vm_vaddr_t exception_handlers;
bool host_cpu_is_amd;
bool host_cpu_is_intel;
bool is_forced_emulation_enabled;
+uint64_t guest_tsc_khz;
static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)
{
@@ -616,6 +617,11 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
void kvm_arch_vm_post_create(struct kvm_vm *vm)
{
+ int r;
+
+ TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),
+ "Require KVM_GET_TSC_KHZ to provide udelay() to guest.");
+
vm_create_irqchip(vm);
vm_init_descriptor_tables(vm);
@@ -628,6 +634,11 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm)
vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
}
+
+ r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
+ TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
+ guest_tsc_khz = r;
+ sync_global_to_guest(vm, guest_tsc_khz);
}
void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 05fcf902e067..49f162573126 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -53,12 +53,6 @@ static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
}
}
-struct memslot_antagonist_args {
- struct kvm_vm *vm;
- useconds_t delay;
- uint64_t nr_modifications;
-};
-
static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay,
uint64_t nr_modifications)
{
diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
new file mode 100644
index 000000000000..0350a8896a2f
--- /dev/null
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024, Intel, Inc
+ *
+ * Author:
+ * Isaku Yamahata <isaku.yamahata at gmail.com>
+ */
+#include <linux/sizes.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+/* Arbitrarily chosen values */
+#define TEST_SIZE (SZ_2M + PAGE_SIZE)
+#define TEST_NPAGES (TEST_SIZE / PAGE_SIZE)
+#define TEST_SLOT 10
+
+static void guest_code(uint64_t base_gpa)
+{
+ volatile uint64_t val __used;
+ int i;
+
+ for (i = 0; i < TEST_NPAGES; i++) {
+ uint64_t *src = (uint64_t *)(base_gpa + i * PAGE_SIZE);
+
+ val = *src;
+ }
+
+ GUEST_DONE();
+}
+
+static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 gpa, u64 size,
+ u64 left)
+{
+ struct kvm_pre_fault_memory range = {
+ .gpa = gpa,
+ .size = size,
+ .flags = 0,
+ };
+ u64 prev;
+ int ret, save_errno;
+
+ do {
+ prev = range.size;
+ ret = __vcpu_ioctl(vcpu, KVM_PRE_FAULT_MEMORY, &range);
+ save_errno = errno;
+ TEST_ASSERT((range.size < prev) ^ (ret < 0),
+ "%sexpecting range.size to change on %s",
+ ret < 0 ? "not " : "",
+ ret < 0 ? "failure" : "success");
+ } while (ret >= 0 ? range.size : save_errno == EINTR);
+
+ TEST_ASSERT(range.size == left,
+ "Completed with %lld bytes left, expected %" PRId64,
+ range.size, left);
+
+ if (left == 0)
+ __TEST_ASSERT_VM_VCPU_IOCTL(!ret, "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm);
+ else
+ /* A missing memory slot causes RET_PF_EMULATE, which results in -ENOENT. */
+ __TEST_ASSERT_VM_VCPU_IOCTL(ret && save_errno == ENOENT,
+ "KVM_PRE_FAULT_MEMORY", ret, vcpu->vm);
+}
+
+static void __test_pre_fault_memory(unsigned long vm_type, bool private)
+{
+ const struct vm_shape shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = vm_type,
+ };
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct ucall uc;
+
+ uint64_t guest_test_phys_mem;
+ uint64_t guest_test_virt_mem;
+ uint64_t alignment, guest_page_size;
+
+ vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
+
+ alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+ guest_test_phys_mem = (vm->max_gfn - TEST_NPAGES) * guest_page_size;
+#ifdef __s390x__
+ alignment = max(0x100000UL, guest_page_size);
+#else
+ alignment = SZ_2M;
+#endif
+ guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
+ guest_test_virt_mem = guest_test_phys_mem & ((1ULL << (vm->va_bits - 1)) - 1);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ guest_test_phys_mem, TEST_SLOT, TEST_NPAGES,
+ private ? KVM_MEM_GUEST_MEMFD : 0);
+ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, TEST_NPAGES);
+
+ if (private)
+ vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE);
+ pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, 0);
+ pre_fault_memory(vcpu, guest_test_phys_mem + SZ_2M, PAGE_SIZE * 2, PAGE_SIZE);
+ pre_fault_memory(vcpu, guest_test_phys_mem + TEST_SIZE, PAGE_SIZE, PAGE_SIZE);
+
+ vcpu_args_set(vcpu, 1, guest_test_virt_mem);
+ vcpu_run(vcpu);
+
+ run = vcpu->run;
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Wanted KVM_EXIT_IO, got exit reason: %u (%s)",
+ run->exit_reason, exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ break;
+ }
+
+ kvm_vm_free(vm);
+}
+
+static void test_pre_fault_memory(unsigned long vm_type, bool private)
+{
+ if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) {
+ pr_info("Skipping tests for vm_type 0x%lx\n", vm_type);
+ return;
+ }
+
+ __test_pre_fault_memory(vm_type, private);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
+
+ test_pre_fault_memory(0, false);
+#ifdef __x86_64__
+ test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, false);
+ test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, true);
+#endif
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index f92c2fb23fcd..8e34f7fa44e9 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -961,10 +961,10 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zbkb, ZBKB);
KVM_ISA_EXT_SIMPLE_CONFIG(zbkc, ZBKC);
KVM_ISA_EXT_SIMPLE_CONFIG(zbkx, ZBKX);
KVM_ISA_EXT_SIMPLE_CONFIG(zbs, ZBS);
-KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA),
-KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB),
-KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD),
-KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF),
+KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA);
+KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB);
+KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD);
+KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF);
KVM_ISA_EXT_SIMPLE_CONFIG(zcmop, ZCMOP);
KVM_ISA_EXT_SIMPLE_CONFIG(zfa, ZFA);
KVM_ISA_EXT_SIMPLE_CONFIG(zfh, ZFH);
diff --git a/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c b/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c
new file mode 100644
index 000000000000..f8916bb34405
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/apic_bus_clock_test.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Intel Corporation
+ *
+ * Verify KVM correctly emulates the APIC bus frequency when the VMM configures
+ * the frequency via KVM_CAP_X86_APIC_BUS_CYCLES_NS. Start the APIC timer by
+ * programming TMICT (timer initial count) to the largest value possible (so
+ * that the timer will not expire during the test). Then, after an arbitrary
+ * amount of time has elapsed, verify TMCCT (timer current count) is within 5%
+ * of the expected value based on the time elapsed, the APIC bus frequency, and
+ * the programmed TDCR (timer divide configuration register).
+ */
+
+#include "apic.h"
+#include "test_util.h"
+
+/*
+ * Possible TDCR values with matching divide count. Used to modify APIC
+ * timer frequency.
+ */
+static const struct {
+ const uint32_t tdcr;
+ const uint32_t divide_count;
+} tdcrs[] = {
+ {0x0, 2},
+ {0x1, 4},
+ {0x2, 8},
+ {0x3, 16},
+ {0x8, 32},
+ {0x9, 64},
+ {0xa, 128},
+ {0xb, 1},
+};
+
+static bool is_x2apic;
+
+static void apic_enable(void)
+{
+ if (is_x2apic)
+ x2apic_enable();
+ else
+ xapic_enable();
+}
+
+static uint32_t apic_read_reg(unsigned int reg)
+{
+ return is_x2apic ? x2apic_read_reg(reg) : xapic_read_reg(reg);
+}
+
+static void apic_write_reg(unsigned int reg, uint32_t val)
+{
+ if (is_x2apic)
+ x2apic_write_reg(reg, val);
+ else
+ xapic_write_reg(reg, val);
+}
+
+static void apic_guest_code(uint64_t apic_hz, uint64_t delay_ms)
+{
+ uint64_t tsc_hz = guest_tsc_khz * 1000;
+ const uint32_t tmict = ~0u;
+ uint64_t tsc0, tsc1, freq;
+ uint32_t tmcct;
+ int i;
+
+ apic_enable();
+
+ /*
+ * Setup one-shot timer. The vector does not matter because the
+ * interrupt should not fire.
+ */
+ apic_write_reg(APIC_LVTT, APIC_LVT_TIMER_ONESHOT | APIC_LVT_MASKED);
+
+ for (i = 0; i < ARRAY_SIZE(tdcrs); i++) {
+ apic_write_reg(APIC_TDCR, tdcrs[i].tdcr);
+ apic_write_reg(APIC_TMICT, tmict);
+
+ tsc0 = rdtsc();
+ udelay(delay_ms * 1000);
+ tmcct = apic_read_reg(APIC_TMCCT);
+ tsc1 = rdtsc();
+
+ /*
+ * Stop the timer _after_ reading the current, final count, as
+ * writing the initial counter also modifies the current count.
+ */
+ apic_write_reg(APIC_TMICT, 0);
+
+ freq = (tmict - tmcct) * tdcrs[i].divide_count * tsc_hz / (tsc1 - tsc0);
+ /* Check if measured frequency is within 5% of configured frequency. */
+ __GUEST_ASSERT(freq < apic_hz * 105 / 100 && freq > apic_hz * 95 / 100,
+ "Frequency = %lu (wanted %lu - %lu), bus = %lu, div = %u, tsc = %lu",
+ freq, apic_hz * 95 / 100, apic_hz * 105 / 100,
+ apic_hz, tdcrs[i].divide_count, tsc_hz);
+ }
+
+ GUEST_DONE();
+}
+
+static void test_apic_bus_clock(struct kvm_vcpu *vcpu)
+{
+ bool done = false;
+ struct ucall uc;
+
+ while (!done) {
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ done = true;
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ break;
+ }
+ }
+}
+
+static void run_apic_bus_clock_test(uint64_t apic_hz, uint64_t delay_ms,
+ bool x2apic)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ int ret;
+
+ is_x2apic = x2apic;
+
+ vm = vm_create(1);
+
+ sync_global_to_guest(vm, is_x2apic);
+
+ vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
+ NSEC_PER_SEC / apic_hz);
+
+ vcpu = vm_vcpu_add(vm, 0, apic_guest_code);
+ vcpu_args_set(vcpu, 2, apic_hz, delay_ms);
+
+ ret = __vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
+ NSEC_PER_SEC / apic_hz);
+ TEST_ASSERT(ret < 0 && errno == EINVAL,
+ "Setting of APIC bus frequency after vCPU is created should fail.");
+
+ if (!is_x2apic)
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ test_apic_bus_clock(vcpu);
+ kvm_vm_free(vm);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-d delay] [-f APIC bus freq]\n", name);
+ puts("");
+ printf("-d: Delay (in msec) guest uses to measure APIC bus frequency.\n");
+ printf("-f: The APIC bus frequency (in MHz) to be configured for the guest.\n");
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ /*
+ * Arbitrarily default to 25MHz for the APIC bus frequency, which is
+ * different enough from the default 1GHz to be interesting.
+ */
+ uint64_t apic_hz = 25 * 1000 * 1000;
+ uint64_t delay_ms = 100;
+ int opt;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_APIC_BUS_CYCLES_NS));
+
+ while ((opt = getopt(argc, argv, "d:f:h")) != -1) {
+ switch (opt) {
+ case 'f':
+ apic_hz = atoi_positive("APIC bus frequency", optarg) * 1000 * 1000;
+ break;
+ case 'd':
+ delay_ms = atoi_positive("Delay in milliseconds", optarg);
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ exit(KSFT_SKIP);
+ }
+ }
+
+ run_apic_bus_clock_test(apic_hz, delay_ms, false);
+ run_apic_bus_clock_test(apic_hz, delay_ms, true);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
index 3cc4b86832fe..7e2bfb3c3f3b 100644
--- a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
+++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
@@ -26,19 +26,37 @@ int main(int argc, char *argv[])
TEST_ASSERT(ret < 0,
"Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail");
+ /* Test BOOT_CPU_ID interaction (MAX_VCPU_ID cannot be lower) */
+ if (kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)) {
+ vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)MAX_VCPU_ID);
+
+ /* Try setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID */
+ ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID - 1);
+ TEST_ASSERT(ret < 0,
+ "Setting KVM_CAP_MAX_VCPU_ID below BOOT_CPU_ID should fail");
+ }
+
/* Set KVM_CAP_MAX_VCPU_ID */
vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID);
-
/* Try to set KVM_CAP_MAX_VCPU_ID again */
ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1);
TEST_ASSERT(ret < 0,
"Setting KVM_CAP_MAX_VCPU_ID multiple times should fail");
- /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/
+ /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap */
ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID);
TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail");
+ /* Create vCPU with bits 63:32 != 0, but an otherwise valid id */
+ ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(1L << 32));
+ TEST_ASSERT(ret < 0, "Creating vCPU with ID[63:32] != 0 should fail");
+
+ /* Create vCPU with id within bounds */
+ ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)0);
+ TEST_ASSERT(ret >= 0, "Creating vCPU with ID 0 should succeed");
+
+ close(ret);
kvm_vm_free(vm);
return 0;
}
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
index 96446134c00b..698cb36989db 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
@@ -7,15 +7,28 @@
#include "pmu.h"
#include "processor.h"
-/* Number of LOOP instructions for the guest measurement payload. */
-#define NUM_BRANCHES 10
+/* Number of iterations of the loop for the guest measurement payload. */
+#define NUM_LOOPS 10
+
+/* Each iteration of the loop retires one branch instruction. */
+#define NUM_BRANCH_INSNS_RETIRED (NUM_LOOPS)
+
+/*
+ * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE,
+ * 1 LOOP.
+ */
+#define NUM_INSNS_PER_LOOP 3
+
/*
* Number of "extra" instructions that will be counted, i.e. the number of
- * instructions that are needed to set up the loop and then disabled the
- * counter. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
+ * instructions that are needed to set up the loop and then disable the
+ * counter. 2 MOV, 2 XOR, 1 WRMSR.
*/
-#define NUM_EXTRA_INSNS 7
-#define NUM_INSNS_RETIRED (NUM_BRANCHES + NUM_EXTRA_INSNS)
+#define NUM_EXTRA_INSNS 5
+
+/* Total number of instructions retired within the measured section. */
+#define NUM_INSNS_RETIRED (NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS)
+
static uint8_t kvm_pmu_version;
static bool kvm_has_perf_caps;
@@ -100,7 +113,7 @@ static void guest_assert_event_count(uint8_t idx,
GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED);
break;
case INTEL_ARCH_BRANCHES_RETIRED_INDEX:
- GUEST_ASSERT_EQ(count, NUM_BRANCHES);
+ GUEST_ASSERT_EQ(count, NUM_BRANCH_INSNS_RETIRED);
break;
case INTEL_ARCH_LLC_REFERENCES_INDEX:
case INTEL_ARCH_LLC_MISSES_INDEX:
@@ -120,7 +133,7 @@ static void guest_assert_event_count(uint8_t idx,
}
sanity_checks:
- __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS}));
GUEST_ASSERT_EQ(_rdpmc(pmc), count);
wrmsr(pmc_msr, 0xdead);
@@ -134,8 +147,8 @@ sanity_checks:
* before the end of the sequence.
*
* If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the
- * start of the loop to force LLC references and misses, i.e. to allow testing
- * that those events actually count.
+ * CLFLUSH{,OPT} instruction on each loop iteration to force LLC references and
+ * misses, i.e. to allow testing that those events actually count.
*
* If forced emulation is enabled (and specified), force emulation on a subset
* of the measured code to verify that KVM correctly emulates instructions and
@@ -145,10 +158,11 @@ sanity_checks:
#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP) \
do { \
__asm__ __volatile__("wrmsr\n\t" \
+ " mov $" __stringify(NUM_LOOPS) ", %%ecx\n\t" \
+ "1:\n\t" \
clflush "\n\t" \
"mfence\n\t" \
- "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t" \
- FEP "loop .\n\t" \
+ FEP "loop 1b\n\t" \
FEP "mov %%edi, %%ecx\n\t" \
FEP "xor %%eax, %%eax\n\t" \
FEP "xor %%edx, %%edx\n\t" \
@@ -163,9 +177,9 @@ do { \
wrmsr(pmc_msr, 0); \
\
if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) \
- GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP); \
+ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP); \
else if (this_cpu_has(X86_FEATURE_CLFLUSH)) \
- GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP); \
+ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP); \
else \
GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP); \
\
@@ -500,7 +514,7 @@ static void guest_test_fixed_counters(void)
wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
- __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_LOOPS}));
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i);
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
index 26b3e7efe5dd..c15513cd74d1 100644
--- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -32,8 +32,8 @@ struct __kvm_pmu_event_filter {
/*
* This event list comprises Intel's known architectural events, plus AMD's
- * "retired branch instructions" for Zen1-Zen3 (and* possibly other AMD CPUs).
- * Note, AMD and Intel use the same encoding for instructions retired.
+ * Branch Instructions Retired for Zen CPUs. Note, AMD and Intel use the
+ * same encoding for Instructions Retired.
*/
kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED);
@@ -353,38 +353,13 @@ static bool use_intel_pmu(void)
kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED);
}
-static bool is_zen1(uint32_t family, uint32_t model)
-{
- return family == 0x17 && model <= 0x0f;
-}
-
-static bool is_zen2(uint32_t family, uint32_t model)
-{
- return family == 0x17 && model >= 0x30 && model <= 0x3f;
-}
-
-static bool is_zen3(uint32_t family, uint32_t model)
-{
- return family == 0x19 && model <= 0x0f;
-}
-
/*
- * Determining AMD support for a PMU event requires consulting the AMD
- * PPR for the CPU or reference material derived therefrom. The AMD
- * test code herein has been verified to work on Zen1, Zen2, and Zen3.
- *
- * Feel free to add more AMD CPUs that are documented to support event
- * select 0xc2 umask 0 as "retired branch instructions."
+ * On AMD, all Family 17h+ CPUs (Zen and its successors) use event encoding
+ * 0xc2,0 for Branch Instructions Retired.
*/
static bool use_amd_pmu(void)
{
- uint32_t family = kvm_cpu_family();
- uint32_t model = kvm_cpu_model();
-
- return host_cpu_is_amd &&
- (is_zen1(family, model) ||
- is_zen2(family, model) ||
- is_zen3(family, model));
+ return host_cpu_is_amd && kvm_cpu_family() >= 0x17;
}
/*
diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
index d691d86e5bc3..49913784bc82 100644
--- a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
+++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
@@ -33,6 +33,20 @@ static void guest_not_bsp_vcpu(void *arg)
GUEST_DONE();
}
+static void test_set_invalid_bsp(struct kvm_vm *vm)
+{
+ unsigned long max_vcpu_id = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID);
+ int r;
+
+ if (max_vcpu_id) {
+ r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(max_vcpu_id + 1));
+ TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID > MAX should fail");
+ }
+
+ r = __vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(1L << 32));
+ TEST_ASSERT(r == -1 && errno == EINVAL, "BSP with ID[63:32]!=0 should fail");
+}
+
static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg)
{
int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID,
@@ -80,6 +94,8 @@ static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id,
vm = vm_create(nr_vcpus);
+ test_set_invalid_bsp(vm);
+
vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id);
for (i = 0; i < nr_vcpus; i++)
diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
index 69849acd95b0..618cd2442390 100644
--- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
@@ -184,6 +184,33 @@ static void test_apic_id(void)
kvm_vm_free(vm);
}
+static void test_x2apic_id(void)
+{
+ struct kvm_lapic_state lapic = {};
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ int i;
+
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
+ vcpu_set_msr(vcpu, MSR_IA32_APICBASE, MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+
+ /*
+ * Try stuffing a modified x2APIC ID, KVM should ignore the value and
+ * always return the vCPU's default/readonly x2APIC ID.
+ */
+ for (i = 0; i <= 0xff; i++) {
+ *(u32 *)(lapic.regs + APIC_ID) = i << 24;
+ *(u32 *)(lapic.regs + APIC_SPIV) = APIC_SPIV_APIC_ENABLED;
+ vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic);
+
+ vcpu_ioctl(vcpu, KVM_GET_LAPIC, &lapic);
+ TEST_ASSERT(*((u32 *)&lapic.regs[APIC_ID]) == vcpu->id << 24,
+ "x2APIC ID should be fully readonly");
+ }
+
+ kvm_vm_free(vm);
+}
+
int main(int argc, char *argv[])
{
struct xapic_vcpu x = {
@@ -211,4 +238,5 @@ int main(int argc, char *argv[])
kvm_vm_free(vm);
test_apic_id();
+ test_x2apic_id();
}
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 3c1e9f35b531..3b26bf3cf5b9 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -9,6 +9,7 @@
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
+#include <linux/keyctl.h>
#include <linux/landlock.h>
#include <string.h>
#include <sys/prctl.h>
@@ -326,4 +327,77 @@ TEST(ruleset_fd_transfer)
ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
}
+TEST(cred_transfer)
+{
+ struct landlock_ruleset_attr ruleset_attr = {
+ .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR,
+ };
+ int ruleset_fd, dir_fd;
+ pid_t child;
+ int status;
+
+ drop_caps(_metadata);
+
+ dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+ EXPECT_LE(0, dir_fd);
+ EXPECT_EQ(0, close(dir_fd));
+
+ /* Denies opening directories. */
+ ruleset_fd =
+ landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
+ ASSERT_LE(0, ruleset_fd);
+ EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+ ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0));
+ EXPECT_EQ(0, close(ruleset_fd));
+
+ /* Checks ruleset enforcement. */
+ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+ EXPECT_EQ(EACCES, errno);
+
+ /* Needed for KEYCTL_SESSION_TO_PARENT permission checks */
+ EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0,
+ 0, 0))
+ {
+ TH_LOG("Failed to join session keyring: %s", strerror(errno));
+ }
+
+ child = fork();
+ ASSERT_LE(0, child);
+ if (child == 0) {
+ /* Checks ruleset enforcement. */
+ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+ EXPECT_EQ(EACCES, errno);
+
+ /*
+ * KEYCTL_SESSION_TO_PARENT is a no-op unless we have a
+ * different session keyring in the child, so make that happen.
+ */
+ EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING,
+ NULL, 0, 0, 0));
+
+ /*
+ * KEYCTL_SESSION_TO_PARENT installs credentials on the parent
+ * that never go through the cred_prepare hook, this path uses
+ * cred_transfer instead.
+ */
+ EXPECT_EQ(0, syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0,
+ 0, 0, 0));
+
+ /* Re-checks ruleset enforcement. */
+ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+ EXPECT_EQ(EACCES, errno);
+
+ _exit(_metadata->exit_code);
+ return;
+ }
+
+ EXPECT_EQ(child, waitpid(child, &status, 0));
+ EXPECT_EQ(1, WIFEXITED(status));
+ EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
+
+ /* Re-checks ruleset enforcement. */
+ EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+ EXPECT_EQ(EACCES, errno);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config
index 0086efaa7b68..29af19c4e9f9 100644
--- a/tools/testing/selftests/landlock/config
+++ b/tools/testing/selftests/landlock/config
@@ -2,6 +2,7 @@ CONFIG_CGROUPS=y
CONFIG_CGROUP_SCHED=y
CONFIG_INET=y
CONFIG_IPV6=y
+CONFIG_KEYS=y
CONFIG_NET=y
CONFIG_NET_NS=y
CONFIG_OVERLAY_FS=y
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index 7b299ed5ff45..d6edcfcb5be8 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -196,6 +196,9 @@ endef
clean: $(if $(TEST_GEN_MODS_DIR),clean_mods_dir)
$(CLEAN)
+# Build with _GNU_SOURCE by default
+CFLAGS += -D_GNU_SOURCE=
+
# Enables to extend CFLAGS and LDFLAGS from command line, e.g.
# make USERCFLAGS=-Werror USERLDFLAGS=-static
CFLAGS += $(USERCFLAGS)
diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh
index e3455a6b1158..65c9c058458d 100755
--- a/tools/testing/selftests/livepatch/test-livepatch.sh
+++ b/tools/testing/selftests/livepatch/test-livepatch.sh
@@ -4,7 +4,9 @@
. $(dirname $0)/functions.sh
-MOD_LIVEPATCH=test_klp_livepatch
+MOD_LIVEPATCH1=test_klp_livepatch
+MOD_LIVEPATCH2=test_klp_syscall
+MOD_LIVEPATCH3=test_klp_callbacks_demo
MOD_REPLACE=test_klp_atomic_replace
setup_config
@@ -16,33 +18,33 @@ setup_config
start_test "basic function patching"
-load_lp $MOD_LIVEPATCH
+load_lp $MOD_LIVEPATCH1
-if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then
+if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH1: this has been live patched" ]] ; then
echo -e "FAIL\n\n"
die "livepatch kselftest(s) failed"
fi
-disable_lp $MOD_LIVEPATCH
-unload_lp $MOD_LIVEPATCH
+disable_lp $MOD_LIVEPATCH1
+unload_lp $MOD_LIVEPATCH1
-if [[ "$(cat /proc/cmdline)" == "$MOD_LIVEPATCH: this has been live patched" ]] ; then
+if [[ "$(cat /proc/cmdline)" == "$MOD_LIVEPATCH1: this has been live patched" ]] ; then
echo -e "FAIL\n\n"
die "livepatch kselftest(s) failed"
fi
-check_result "% insmod test_modules/$MOD_LIVEPATCH.ko
-livepatch: enabling patch '$MOD_LIVEPATCH'
-livepatch: '$MOD_LIVEPATCH': initializing patching transition
-livepatch: '$MOD_LIVEPATCH': starting patching transition
-livepatch: '$MOD_LIVEPATCH': completing patching transition
-livepatch: '$MOD_LIVEPATCH': patching complete
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
-livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
-livepatch: '$MOD_LIVEPATCH': starting unpatching transition
-livepatch: '$MOD_LIVEPATCH': completing unpatching transition
-livepatch: '$MOD_LIVEPATCH': unpatching complete
-% rmmod $MOD_LIVEPATCH"
+check_result "% insmod test_modules/$MOD_LIVEPATCH1.ko
+livepatch: enabling patch '$MOD_LIVEPATCH1'
+livepatch: '$MOD_LIVEPATCH1': initializing patching transition
+livepatch: '$MOD_LIVEPATCH1': starting patching transition
+livepatch: '$MOD_LIVEPATCH1': completing patching transition
+livepatch: '$MOD_LIVEPATCH1': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH1/enabled
+livepatch: '$MOD_LIVEPATCH1': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH1': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH1': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH1': unpatching complete
+% rmmod $MOD_LIVEPATCH1"
# - load a livepatch that modifies the output from /proc/cmdline and
@@ -53,7 +55,7 @@ livepatch: '$MOD_LIVEPATCH': unpatching complete
start_test "multiple livepatches"
-load_lp $MOD_LIVEPATCH
+load_lp $MOD_LIVEPATCH1
grep 'live patched' /proc/cmdline > /dev/kmsg
grep 'live patched' /proc/meminfo > /dev/kmsg
@@ -69,26 +71,26 @@ unload_lp $MOD_REPLACE
grep 'live patched' /proc/cmdline > /dev/kmsg
grep 'live patched' /proc/meminfo > /dev/kmsg
-disable_lp $MOD_LIVEPATCH
-unload_lp $MOD_LIVEPATCH
+disable_lp $MOD_LIVEPATCH1
+unload_lp $MOD_LIVEPATCH1
grep 'live patched' /proc/cmdline > /dev/kmsg
grep 'live patched' /proc/meminfo > /dev/kmsg
-check_result "% insmod test_modules/$MOD_LIVEPATCH.ko
-livepatch: enabling patch '$MOD_LIVEPATCH'
-livepatch: '$MOD_LIVEPATCH': initializing patching transition
-livepatch: '$MOD_LIVEPATCH': starting patching transition
-livepatch: '$MOD_LIVEPATCH': completing patching transition
-livepatch: '$MOD_LIVEPATCH': patching complete
-$MOD_LIVEPATCH: this has been live patched
+check_result "% insmod test_modules/$MOD_LIVEPATCH1.ko
+livepatch: enabling patch '$MOD_LIVEPATCH1'
+livepatch: '$MOD_LIVEPATCH1': initializing patching transition
+livepatch: '$MOD_LIVEPATCH1': starting patching transition
+livepatch: '$MOD_LIVEPATCH1': completing patching transition
+livepatch: '$MOD_LIVEPATCH1': patching complete
+$MOD_LIVEPATCH1: this has been live patched
% insmod test_modules/$MOD_REPLACE.ko replace=0
livepatch: enabling patch '$MOD_REPLACE'
livepatch: '$MOD_REPLACE': initializing patching transition
livepatch: '$MOD_REPLACE': starting patching transition
livepatch: '$MOD_REPLACE': completing patching transition
livepatch: '$MOD_REPLACE': patching complete
-$MOD_LIVEPATCH: this has been live patched
+$MOD_LIVEPATCH1: this has been live patched
$MOD_REPLACE: this has been live patched
% echo 0 > /sys/kernel/livepatch/$MOD_REPLACE/enabled
livepatch: '$MOD_REPLACE': initializing unpatching transition
@@ -96,35 +98,57 @@ livepatch: '$MOD_REPLACE': starting unpatching transition
livepatch: '$MOD_REPLACE': completing unpatching transition
livepatch: '$MOD_REPLACE': unpatching complete
% rmmod $MOD_REPLACE
-$MOD_LIVEPATCH: this has been live patched
-% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
-livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
-livepatch: '$MOD_LIVEPATCH': starting unpatching transition
-livepatch: '$MOD_LIVEPATCH': completing unpatching transition
-livepatch: '$MOD_LIVEPATCH': unpatching complete
-% rmmod $MOD_LIVEPATCH"
+$MOD_LIVEPATCH1: this has been live patched
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH1/enabled
+livepatch: '$MOD_LIVEPATCH1': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH1': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH1': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH1': unpatching complete
+% rmmod $MOD_LIVEPATCH1"
# - load a livepatch that modifies the output from /proc/cmdline and
# verify correct behavior
-# - load an atomic replace livepatch and verify that only the second is active
-# - remove the first livepatch and verify that the atomic replace livepatch
-# is still active
+# - load two additional livepatches and check the number of livepatch modules
+# applied
+# - load an atomic replace livepatch and check that the other three modules were
+# disabled
+# - remove all livepatches besides the atomic replace one and verify that the
+# atomic replace livepatch is still active
# - remove the atomic replace livepatch and verify that none are active
start_test "atomic replace livepatch"
-load_lp $MOD_LIVEPATCH
+load_lp $MOD_LIVEPATCH1
grep 'live patched' /proc/cmdline > /dev/kmsg
grep 'live patched' /proc/meminfo > /dev/kmsg
+for mod in $MOD_LIVEPATCH2 $MOD_LIVEPATCH3; do
+ load_lp "$mod"
+done
+
+mods=(/sys/kernel/livepatch/*)
+nmods=${#mods[@]}
+if [ "$nmods" -ne 3 ]; then
+ die "Expecting three modules listed, found $nmods"
+fi
+
load_lp $MOD_REPLACE replace=1
grep 'live patched' /proc/cmdline > /dev/kmsg
grep 'live patched' /proc/meminfo > /dev/kmsg
-unload_lp $MOD_LIVEPATCH
+# Once the atomic replace livepatch is loaded, exactly one entry is expected
+# under /sys/kernel/livepatch.
+mods=(/sys/kernel/livepatch/*)
+nmods=${#mods[@]}
+if [ "$nmods" -ne 1 ]; then
+	die "Expecting only one module listed, found $nmods"
+fi
+
+# These modules were disabled by the atomic replace
+for mod in $MOD_LIVEPATCH3 $MOD_LIVEPATCH2 $MOD_LIVEPATCH1; do
+ unload_lp "$mod"
+done
grep 'live patched' /proc/cmdline > /dev/kmsg
grep 'live patched' /proc/meminfo > /dev/kmsg
@@ -135,13 +159,27 @@ unload_lp $MOD_REPLACE
grep 'live patched' /proc/cmdline > /dev/kmsg
grep 'live patched' /proc/meminfo > /dev/kmsg
-check_result "% insmod test_modules/$MOD_LIVEPATCH.ko
-livepatch: enabling patch '$MOD_LIVEPATCH'
-livepatch: '$MOD_LIVEPATCH': initializing patching transition
-livepatch: '$MOD_LIVEPATCH': starting patching transition
-livepatch: '$MOD_LIVEPATCH': completing patching transition
-livepatch: '$MOD_LIVEPATCH': patching complete
-$MOD_LIVEPATCH: this has been live patched
+check_result "% insmod test_modules/$MOD_LIVEPATCH1.ko
+livepatch: enabling patch '$MOD_LIVEPATCH1'
+livepatch: '$MOD_LIVEPATCH1': initializing patching transition
+livepatch: '$MOD_LIVEPATCH1': starting patching transition
+livepatch: '$MOD_LIVEPATCH1': completing patching transition
+livepatch: '$MOD_LIVEPATCH1': patching complete
+$MOD_LIVEPATCH1: this has been live patched
+% insmod test_modules/$MOD_LIVEPATCH2.ko
+livepatch: enabling patch '$MOD_LIVEPATCH2'
+livepatch: '$MOD_LIVEPATCH2': initializing patching transition
+livepatch: '$MOD_LIVEPATCH2': starting patching transition
+livepatch: '$MOD_LIVEPATCH2': completing patching transition
+livepatch: '$MOD_LIVEPATCH2': patching complete
+% insmod test_modules/$MOD_LIVEPATCH3.ko
+livepatch: enabling patch '$MOD_LIVEPATCH3'
+livepatch: '$MOD_LIVEPATCH3': initializing patching transition
+$MOD_LIVEPATCH3: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH3': starting patching transition
+livepatch: '$MOD_LIVEPATCH3': completing patching transition
+$MOD_LIVEPATCH3: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH3': patching complete
% insmod test_modules/$MOD_REPLACE.ko replace=1
livepatch: enabling patch '$MOD_REPLACE'
livepatch: '$MOD_REPLACE': initializing patching transition
@@ -149,7 +187,9 @@ livepatch: '$MOD_REPLACE': starting patching transition
livepatch: '$MOD_REPLACE': completing patching transition
livepatch: '$MOD_REPLACE': patching complete
$MOD_REPLACE: this has been live patched
-% rmmod $MOD_LIVEPATCH
+% rmmod $MOD_LIVEPATCH3
+% rmmod $MOD_LIVEPATCH2
+% rmmod $MOD_LIVEPATCH1
$MOD_REPLACE: this has been live patched
% echo 0 > /sys/kernel/livepatch/$MOD_REPLACE/enabled
livepatch: '$MOD_REPLACE': initializing unpatching transition
diff --git a/tools/testing/selftests/livepatch/test-syscall.sh b/tools/testing/selftests/livepatch/test-syscall.sh
index b76a881d4013..289eb7d4c4b3 100755
--- a/tools/testing/selftests/livepatch/test-syscall.sh
+++ b/tools/testing/selftests/livepatch/test-syscall.sh
@@ -15,7 +15,10 @@ setup_config
start_test "patch getpid syscall while being heavily hammered"
-for i in $(seq 1 $(getconf _NPROCESSORS_ONLN)); do
+NPROC=$(getconf _NPROCESSORS_ONLN)
+MAXPROC=128
+
+for i in $(seq 1 $(($NPROC < $MAXPROC ? $NPROC : $MAXPROC))); do
./test_klp-call_getpid &
pids[$i]="$!"
done
diff --git a/tools/testing/selftests/livepatch/test-sysfs.sh b/tools/testing/selftests/livepatch/test-sysfs.sh
index 6c646afa7395..05a14f5a7bfb 100755
--- a/tools/testing/selftests/livepatch/test-sysfs.sh
+++ b/tools/testing/selftests/livepatch/test-sysfs.sh
@@ -18,6 +18,7 @@ check_sysfs_rights "$MOD_LIVEPATCH" "" "drwxr-xr-x"
check_sysfs_rights "$MOD_LIVEPATCH" "enabled" "-rw-r--r--"
check_sysfs_value "$MOD_LIVEPATCH" "enabled" "1"
check_sysfs_rights "$MOD_LIVEPATCH" "force" "--w-------"
+check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--"
check_sysfs_rights "$MOD_LIVEPATCH" "transition" "-r--r--r--"
check_sysfs_value "$MOD_LIVEPATCH" "transition" "0"
check_sysfs_rights "$MOD_LIVEPATCH" "vmlinux/patched" "-r--r--r--"
@@ -83,4 +84,51 @@ test_klp_callbacks_demo: post_unpatch_callback: vmlinux
livepatch: 'test_klp_callbacks_demo': unpatching complete
% rmmod test_klp_callbacks_demo"
+start_test "sysfs test replace enabled"
+
+MOD_LIVEPATCH=test_klp_atomic_replace
+load_lp $MOD_LIVEPATCH replace=1
+
+check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--"
+check_sysfs_value "$MOD_LIVEPATCH" "replace" "1"
+
+disable_lp $MOD_LIVEPATCH
+unload_lp $MOD_LIVEPATCH
+
+check_result "% insmod test_modules/$MOD_LIVEPATCH.ko replace=1
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+livepatch: '$MOD_LIVEPATCH': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+start_test "sysfs test replace disabled"
+
+load_lp $MOD_LIVEPATCH replace=0
+
+check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--"
+check_sysfs_value "$MOD_LIVEPATCH" "replace" "0"
+
+disable_lp $MOD_LIVEPATCH
+unload_lp $MOD_LIVEPATCH
+
+check_result "% insmod test_modules/$MOD_LIVEPATCH.ko replace=0
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+livepatch: '$MOD_LIVEPATCH': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
exit 0
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 0b9ab987601c..da030b43e43b 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -6,6 +6,7 @@ hugepage-shm
hugepage-vmemmap
hugetlb-madvise
hugetlb-read-hwpoison
+hugetlb-soft-offline
khugepaged
map_hugetlb
map_populate
@@ -49,3 +50,4 @@ hugetlb_fault_after_madv
hugetlb_madv_vs_map
mseal_test
seal_elf
+droppable
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 3b49bc3d0a3b..cfad627e8d94 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -2,6 +2,7 @@
# Makefile for mm selftests
LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
+LOCAL_HDRS += $(selfdir)/mm/mseal_helpers.h
include local_config.mk
@@ -42,6 +43,7 @@ TEST_GEN_FILES += gup_test
TEST_GEN_FILES += hmm-tests
TEST_GEN_FILES += hugetlb-madvise
TEST_GEN_FILES += hugetlb-read-hwpoison
+TEST_GEN_FILES += hugetlb-soft-offline
TEST_GEN_FILES += hugepage-mmap
TEST_GEN_FILES += hugepage-mremap
TEST_GEN_FILES += hugepage-shm
@@ -51,7 +53,9 @@ TEST_GEN_FILES += madv_populate
TEST_GEN_FILES += map_fixed_noreplace
TEST_GEN_FILES += map_hugetlb
TEST_GEN_FILES += map_populate
+ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64))
TEST_GEN_FILES += memfd_secret
+endif
TEST_GEN_FILES += migration
TEST_GEN_FILES += mkdirty
TEST_GEN_FILES += mlock-random-test
@@ -73,6 +77,8 @@ TEST_GEN_FILES += ksm_functional_tests
TEST_GEN_FILES += mdwe_test
TEST_GEN_FILES += hugetlb_fault_after_madv
TEST_GEN_FILES += hugetlb_madv_vs_map
+TEST_GEN_FILES += hugetlb_dio
+TEST_GEN_FILES += droppable
ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty
@@ -106,7 +112,7 @@ endif
endif
-ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64))
+ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))
TEST_GEN_FILES += va_high_addr_switch
TEST_GEN_FILES += virtual_address_range
TEST_GEN_FILES += write_to_hugetlbfs
diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
index e140558e6f53..2c3a0eb6b22d 100644
--- a/tools/testing/selftests/mm/compaction_test.c
+++ b/tools/testing/selftests/mm/compaction_test.c
@@ -89,9 +89,10 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size,
int fd, ret = -1;
int compaction_index = 0;
char nr_hugepages[20] = {0};
- char init_nr_hugepages[20] = {0};
+ char init_nr_hugepages[24] = {0};
- sprintf(init_nr_hugepages, "%lu", initial_nr_hugepages);
+ snprintf(init_nr_hugepages, sizeof(init_nr_hugepages),
+ "%lu", initial_nr_hugepages);
/* We want to test with 80% of available memory. Else, OOM killer comes
in to play */
diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c
new file mode 100644
index 000000000000..f3d9ecf96890
--- /dev/null
+++ b/tools/testing/selftests/mm/droppable.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <linux/mman.h>
+
+#include "../kselftest.h"
+
+/*
+ * MAP_DROPPABLE selftest: map 128 MiB with MAP_DROPPABLE, fill it with 'A',
+ * then wait until the first byte of some page reads back as zero while a
+ * forked child keeps allocating memory. Reports one passing result once that
+ * is observed.
+ */
+int main(int argc, char *argv[])
+{
+ size_t alloc_size = 134217728; /* 128 MiB */
+ size_t page_size = getpagesize();
+ void *alloc;
+ pid_t child;
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
+ assert(alloc != MAP_FAILED);
+ memset(alloc, 'A', alloc_size);
+ /* Every page must read back non-zero right after being filled. */
+ for (size_t i = 0; i < alloc_size; i += page_size)
+ assert(*(uint8_t *)(alloc + i));
+
+ child = fork();
+ assert(child >= 0);
+ /*
+ * The child never leaves this loop: it keeps allocating page_size-byte
+ * chunks and writing to each one.
+ * NOTE(review): presumably this is meant to create memory pressure so
+ * the kernel drops pages of the mapping above - confirm.
+ */
+ if (!child) {
+ for (;;)
+ *(char *)malloc(page_size) = 'B';
+ }
+
+ /*
+ * Rescan the mapping until the first byte of any page is zero.
+ * There is no timeout: the loop only ends once such a page is seen.
+ */
+ for (bool done = false; !done;) {
+ for (size_t i = 0; i < alloc_size; i += page_size) {
+ if (!*(uint8_t *)(alloc + i)) {
+ done = true;
+ break;
+ }
+ }
+ }
+ /* The child is signalled but not waited for before exiting. */
+ kill(child, SIGTERM);
+
+ ksft_test_result_pass("MAP_DROPPABLE: PASS\n");
+ exit(KSFT_PASS);
+}
diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
index c463d1c09c9b..ada9156cc497 100644
--- a/tools/testing/selftests/mm/hugepage-mremap.c
+++ b/tools/testing/selftests/mm/hugepage-mremap.c
@@ -15,7 +15,7 @@
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
-#include <unistd.h>
+#include <asm-generic/unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h> /* Definition of O_* constants */
diff --git a/tools/testing/selftests/mm/hugetlb-soft-offline.c b/tools/testing/selftests/mm/hugetlb-soft-offline.c
new file mode 100644
index 000000000000..f086f0e04756
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb-soft-offline.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test soft offline behavior for HugeTLB pages:
+ * - if enable_soft_offline = 0, hugepages should stay intact and soft
+ * offlining failed with EOPNOTSUPP.
+ * - if enable_soft_offline = 1, a hugepage should be dissolved and
+ * nr_hugepages/free_hugepages should be reduced by 1.
+ *
+ * Before running, make sure more than 2 hugepages of default_hugepagesz
+ * are allocated. For example, if /proc/meminfo/Hugepagesize is 2048kB:
+ * echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/magic.h>
+#include <linux/memfd.h>
+#include <sys/mman.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+
+#include "../kselftest.h"
+
+#ifndef MADV_SOFT_OFFLINE
+#define MADV_SOFT_OFFLINE 101
+#endif
+
+#define EPREFIX " !!! "
+
+/*
+ * Resize the file behind @fd to @len bytes, map and populate it, then ask the
+ * kernel to soft-offline one base page in the middle of the mapping.
+ *
+ * @expect_errno is the errno madvise(MADV_SOFT_OFFLINE) is expected to yield;
+ * 0 means the call is expected to succeed.
+ *
+ * Returns 0 when the madvise() outcome matches @expect_errno and -1 otherwise,
+ * including when the file cannot be resized or mapped. The file is truncated
+ * back to zero length before returning.
+ */
+static int do_soft_offline(int fd, size_t len, int expect_errno)
+{
+	char *filemap = NULL;
+	char *hwp_addr = NULL;
+	const unsigned long pagesize = getpagesize();
+	int madvise_errno;
+	int ret = 0;
+
+	if (ftruncate(fd, len) < 0) {
+		ksft_perror(EPREFIX "ftruncate to len failed");
+		return -1;
+	}
+
+	filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		       MAP_SHARED | MAP_POPULATE, fd, 0);
+	if (filemap == MAP_FAILED) {
+		ksft_perror(EPREFIX "mmap failed");
+		ret = -1;
+		goto untruncate;
+	}
+
+	memset(filemap, 0xab, len);
+	ksft_print_msg("Allocated %#zx bytes of hugetlb pages\n", len);
+
+	hwp_addr = filemap + len / 2;
+	ret = madvise(hwp_addr, pagesize, MADV_SOFT_OFFLINE);
+	/*
+	 * Capture errno immediately: a successful madvise() leaves whatever
+	 * value was there before, and the logging below may overwrite it.
+	 */
+	madvise_errno = ret ? errno : 0;
+	ksft_print_msg("MADV_SOFT_OFFLINE %p ret=%d, errno=%d\n",
+		       (void *)hwp_addr, ret, madvise_errno);
+	if (ret != 0) {
+		/* ksft_perror() reports errno, so hand it the madvise() value. */
+		errno = madvise_errno;
+		ksft_perror(EPREFIX "madvise failed");
+	}
+
+	if (madvise_errno == expect_errno) {
+		ret = 0;
+	} else {
+		ksft_print_msg("MADV_SOFT_OFFLINE should ret %d\n",
+			       expect_errno);
+		ret = -1;
+	}
+
+	munmap(filemap, len);
+untruncate:
+	if (ftruncate(fd, 0) < 0)
+		ksft_perror(EPREFIX "ftruncate back to 0 failed");
+
+	return ret;
+}
+
+/*
+ * Write @value (0 or 1) to /proc/sys/vm/enable_soft_offline through a shell
+ * command.
+ *
+ * Returns 0 on success, -EINVAL for any other @value, and a non-zero value
+ * when the command cannot be started or does not exit successfully.
+ */
+static int set_enable_soft_offline(int value)
+{
+	char cmd[256] = {0};
+	FILE *cmdfile = NULL;
+	int saved_errno;
+	int status;
+
+	if (value != 0 && value != 1)
+		return -EINVAL;
+
+	snprintf(cmd, sizeof(cmd),
+		 "echo %d > /proc/sys/vm/enable_soft_offline", value);
+	cmdfile = popen(cmd, "r");
+	if (!cmdfile) {
+		/* Keep the popen() errno; ksft_perror() may overwrite it. */
+		saved_errno = errno;
+		ksft_perror(EPREFIX "failed to set enable_soft_offline");
+		return saved_errno ? saved_errno : -1;
+	}
+
+	/*
+	 * popen() only starts the shell. Whether the write itself worked is
+	 * only visible in the exit status collected by pclose().
+	 */
+	status = pclose(cmdfile);
+	if (status != 0) {
+		ksft_print_msg(EPREFIX "failed to set enable_soft_offline, status=%d\n",
+			       status);
+		return -1;
+	}
+
+	ksft_print_msg("enable_soft_offline => %d\n", value);
+	return 0;
+}
+
+/*
+ * Read /sys/kernel/mm/hugepages/hugepages-<hugepage_size>kB/nr_hugepages.
+ *
+ * @hugepage_size: huge page size in kB, used to build the sysfs path.
+ * @nr_hugepages:  receives the parsed count on success; untouched on failure.
+ *
+ * Returns 0 on success and -1 if the file cannot be read or does not start
+ * with a number.
+ */
+static int read_nr_hugepages(unsigned long hugepage_size,
+			     unsigned long *nr_hugepages)
+{
+	char buffer[256] = {0};
+	char cmd[256] = {0};
+	char *end = NULL;
+	unsigned long count;
+	FILE *cmdfile;
+
+	snprintf(cmd, sizeof(cmd),
+		 "cat /sys/kernel/mm/hugepages/hugepages-%lukB/nr_hugepages",
+		 hugepage_size);
+	cmdfile = popen(cmd, "r");
+	if (cmdfile == NULL) {
+		ksft_perror(EPREFIX "failed to popen nr_hugepages");
+		return -1;
+	}
+
+	if (!fgets(buffer, sizeof(buffer), cmdfile)) {
+		ksft_perror(EPREFIX "failed to read nr_hugepages");
+		pclose(cmdfile);
+		return -1;
+	}
+	pclose(cmdfile);
+
+	/* strtoul() lets us tell "0 pages" apart from unparsable output. */
+	errno = 0;
+	count = strtoul(buffer, &end, 10);
+	if (end == buffer || errno == ERANGE) {
+		ksft_print_msg(EPREFIX "failed to parse nr_hugepages\n");
+		return -1;
+	}
+
+	*nr_hugepages = count;
+	return 0;
+}
+
+/*
+ * Create a hugetlb-backed memfd and fill @file_stat with its filesystem
+ * statistics.
+ *
+ * Returns the open file descriptor, or -1 when the memfd cannot be created,
+ * fstatfs() fails, or the reported filesystem type is not HUGETLBFS_MAGIC.
+ * The descriptor is closed on every failure path.
+ */
+static int create_hugetlbfs_file(struct statfs *file_stat)
+{
+	int memfd = memfd_create("hugetlb_tmp", MFD_HUGETLB);
+
+	if (memfd < 0) {
+		ksft_perror(EPREFIX "could not open hugetlbfs file");
+		return -1;
+	}
+
+	memset(file_stat, 0, sizeof(*file_stat));
+	if (fstatfs(memfd, file_stat) != 0) {
+		ksft_perror(EPREFIX "fstatfs failed");
+		close(memfd);
+		return -1;
+	}
+
+	if (file_stat->f_type == HUGETLBFS_MAGIC)
+		return memfd;
+
+	ksft_print_msg(EPREFIX "not hugetlbfs file\n");
+	close(memfd);
+	return -1;
+}
+
+/*
+ * Run one soft-offline scenario and report it as a single kselftest result.
+ *
+ * @enable_soft_offline: value written to /proc/sys/vm/enable_soft_offline
+ * before the attempt. With 1, madvise() is expected to succeed and
+ * nr_hugepages to drop by exactly one; with 0, madvise() is expected to fail
+ * with EOPNOTSUPP and nr_hugepages to stay unchanged.
+ *
+ * Setup failures terminate the whole program via ksft_exit_fail_msg().
+ */
+static void test_soft_offline_common(int enable_soft_offline)
+{
+	int fd;
+	int expect_errno = enable_soft_offline ? 0 : EOPNOTSUPP;
+	struct statfs file_stat;
+	unsigned long hugepagesize_kb = 0;
+	unsigned long nr_hugepages_before = 0;
+	unsigned long nr_hugepages_after = 0;
+	int ret;
+
+	ksft_print_msg("Test soft-offline when enabled_soft_offline=%d\n",
+		       enable_soft_offline);
+
+	fd = create_hugetlbfs_file(&file_stat);
+	if (fd < 0)
+		ksft_exit_fail_msg("Failed to create hugetlbfs file\n");
+
+	/* The block size reported for the file is used as the huge page size. */
+	hugepagesize_kb = file_stat.f_bsize / 1024;
+	ksft_print_msg("Hugepagesize is %lukB\n", hugepagesize_kb);
+
+	if (set_enable_soft_offline(enable_soft_offline) != 0) {
+		close(fd);
+		ksft_exit_fail_msg("Failed to set enable_soft_offline\n");
+	}
+
+	if (read_nr_hugepages(hugepagesize_kb, &nr_hugepages_before) != 0) {
+		close(fd);
+		ksft_exit_fail_msg("Failed to read nr_hugepages\n");
+	}
+
+	ksft_print_msg("Before MADV_SOFT_OFFLINE nr_hugepages=%lu\n",
+		       nr_hugepages_before);
+
+	/* Two huge pages are mapped; the page in the middle gets offlined. */
+	ret = do_soft_offline(fd, 2 * file_stat.f_bsize, expect_errno);
+
+	if (read_nr_hugepages(hugepagesize_kb, &nr_hugepages_after) != 0) {
+		close(fd);
+		ksft_exit_fail_msg("Failed to read nr_hugepages\n");
+	}
+
+	ksft_print_msg("After MADV_SOFT_OFFLINE nr_hugepages=%lu\n",
+		       nr_hugepages_after);
+
+	// No need for the hugetlbfs file from now on.
+	close(fd);
+
+	if (enable_soft_offline) {
+		if (nr_hugepages_before != nr_hugepages_after + 1) {
+			ksft_test_result_fail("MADV_SOFT_OFFLINE should reduced 1 hugepage\n");
+			return;
+		}
+	} else {
+		if (nr_hugepages_before != nr_hugepages_after) {
+			ksft_test_result_fail("MADV_SOFT_OFFLINE reduced %lu hugepages\n",
+					      nr_hugepages_before - nr_hugepages_after);
+			return;
+		}
+	}
+
+	ksft_test_result(ret == 0,
+			 "Test soft-offline when enabled_soft_offline=%d\n",
+			 enable_soft_offline);
+}
+
+/*
+ * Run the soft-offline scenario twice, first with enable_soft_offline=1 and
+ * then with enable_soft_offline=0, as a two-test kselftest plan.
+ *
+ * NOTE(review): the second run writes 0 to the sysctl last and nothing here
+ * restores the previous value afterwards.
+ */
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(2);
+
+ test_soft_offline_common(1);
+ test_soft_offline_common(0);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
new file mode 100644
index 000000000000..f9ac20c657ec
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program tests for hugepage leaks after DIO writes to a file using a
+ * hugepage as the user buffer. During DIO, the user buffer is pinned and
+ * should be properly unpinned upon completion. This patch verifies that the
+ * kernel correctly unpins the buffer at DIO completion for both aligned and
+ * unaligned user buffer offsets (w.r.t page boundary), ensuring the hugepage
+ * is freed upon unmapping.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "vm_util.h"
+#include "../kselftest.h"
+
+/*
+ * Write the byte range [start_off, end_off) of one anonymous hugetlb page to
+ * an O_DIRECT temporary file, unmap the page, and check that the number of
+ * free huge pages equals the number seen before the page was mapped.
+ *
+ * Emits exactly one kselftest result. Setup failures end the whole program
+ * through the ksft_exit_*() helpers.
+ */
+void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
+{
+	int fd;
+	int saved_errno;
+	char *buffer = NULL;
+	char *orig_buffer = NULL;
+	size_t h_pagesize = 0;
+	size_t writesize;
+	ssize_t written;
+	int free_hpage_b = 0;
+	int free_hpage_a = 0;
+	const int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+	const int mmap_prot = PROT_READ | PROT_WRITE;
+
+	writesize = end_off - start_off;
+
+	/* Get the default huge page size */
+	h_pagesize = default_huge_page_size();
+	if (!h_pagesize)
+		ksft_exit_fail_msg("Unable to determine huge page size\n");
+
+	/* Open the file to DIO */
+	fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664);
+	if (fd < 0)
+		ksft_exit_fail_perror("Error opening file");
+
+	/* Get the free huge pages before allocation */
+	free_hpage_b = get_free_hugepages();
+	if (free_hpage_b == 0) {
+		close(fd);
+		ksft_exit_skip("No free hugepage, exiting!\n");
+	}
+
+	/* Allocate a hugetlb page */
+	orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
+	if (orig_buffer == MAP_FAILED) {
+		/* Cleanup must not change the errno that gets reported. */
+		saved_errno = errno;
+		close(fd);
+		errno = saved_errno;
+		ksft_exit_fail_perror("Error mapping memory");
+	}
+	buffer = orig_buffer;
+	buffer += start_off;
+
+	memset(buffer, 'A', writesize);
+
+	/* Write the buffer to the file; a short write counts as a failure */
+	written = write(fd, buffer, writesize);
+	if (written < 0 || (size_t)written != writesize) {
+		saved_errno = errno;
+		munmap(orig_buffer, h_pagesize);
+		close(fd);
+		errno = saved_errno;
+		ksft_exit_fail_perror("Error writing to file");
+	}
+
+	/* unmap the huge page */
+	munmap(orig_buffer, h_pagesize);
+	close(fd);
+
+	/* Get the free huge pages after unmap */
+	free_hpage_a = get_free_hugepages();
+
+	ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b);
+	ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a);
+
+	/*
+	 * If the no. of free hugepages before allocation and after unmap does
+	 * not match - that means there could still be a page which is pinned.
+	 */
+	if (free_hpage_a != free_hpage_b)
+		ksft_test_result_fail(": Huge pages not freed!\n");
+	else
+		ksft_test_result_pass(": Huge pages freed successfully !\n");
+}
+
+/*
+ * Run the DIO leak check for four buffer ranges inside the huge page, covering
+ * every combination of page-aligned and unaligned start and end offsets.
+ */
+int main(void)
+{
+	size_t base = 0;
+	size_t half;
+	size_t triple;
+
+	ksft_print_header();
+	ksft_set_plan(4);
+
+	/* Get base page size */
+	base = psize();
+	half = base / 2;
+	triple = base * 3;
+
+	{
+		/* { start offset, end offset } for each of the four runs. */
+		const size_t ranges[4][2] = {
+			{ 0,    triple },        /* start aligned, end aligned */
+			{ 0,    triple - half }, /* start aligned, end unaligned */
+			{ half, triple },        /* start unaligned, end aligned */
+			{ half, triple + half }, /* start unaligned, end unaligned */
+		};
+
+		for (size_t t = 0; t < 4; t++)
+			run_dio_using_hugetlb(ranges[t][0], ranges[t][1]);
+	}
+
+	ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index b61803e36d1c..66b4e111b5a2 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -11,7 +11,7 @@
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
-#include <unistd.h>
+#include <asm-generic/unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
@@ -369,7 +369,6 @@ unmap:
munmap(map, size);
}
-#ifdef __NR_userfaultfd
static void test_unmerge_uffd_wp(void)
{
struct uffdio_writeprotect uffd_writeprotect;
@@ -430,7 +429,6 @@ close_uffd:
unmap:
munmap(map, size);
}
-#endif
/* Verify that KSM can be enabled / queried with prctl. */
static void test_prctl(void)
@@ -686,9 +684,7 @@ int main(int argc, char **argv)
exit(test_child_ksm());
}
-#ifdef __NR_userfaultfd
tests++;
-#endif
ksft_print_header();
ksft_set_plan(tests);
@@ -700,9 +696,7 @@ int main(int argc, char **argv)
test_unmerge();
test_unmerge_zero_pages();
test_unmerge_discarded();
-#ifdef __NR_userfaultfd
test_unmerge_uffd_wp();
-#endif
test_prot_none();
diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c
index 9a0597310a76..74c911aa3aea 100644
--- a/tools/testing/selftests/mm/memfd_secret.c
+++ b/tools/testing/selftests/mm/memfd_secret.c
@@ -17,7 +17,7 @@
#include <stdlib.h>
#include <string.h>
-#include <unistd.h>
+#include <asm-generic/unistd.h>
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
@@ -28,8 +28,6 @@
#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
-#ifdef __NR_memfd_secret
-
#define PATTERN 0x55
static const int prot = PROT_READ | PROT_WRITE;
@@ -334,13 +332,3 @@ int main(int argc, char *argv[])
ksft_finished();
}
-
-#else /* __NR_memfd_secret */
-
-int main(int argc, char *argv[])
-{
- printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
- return KSFT_SKIP;
-}
-
-#endif /* __NR_memfd_secret */
diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c
index b8a7efe9204e..1db134063c38 100644
--- a/tools/testing/selftests/mm/mkdirty.c
+++ b/tools/testing/selftests/mm/mkdirty.c
@@ -9,7 +9,7 @@
*/
#include <fcntl.h>
#include <signal.h>
-#include <unistd.h>
+#include <asm-generic/unistd.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
@@ -265,7 +265,6 @@ munmap:
munmap(mmap_mem, mmap_size);
}
-#ifdef __NR_userfaultfd
static void test_uffdio_copy(void)
{
struct uffdio_register uffdio_register;
@@ -322,7 +321,6 @@ munmap:
munmap(dst, pagesize);
free(src);
}
-#endif /* __NR_userfaultfd */
int main(void)
{
@@ -335,9 +333,7 @@ int main(void)
thpsize / 1024);
tests += 3;
}
-#ifdef __NR_userfaultfd
tests += 1;
-#endif /* __NR_userfaultfd */
ksft_print_header();
ksft_set_plan(tests);
@@ -367,9 +363,7 @@ int main(void)
if (thpsize)
test_pte_mapped_thp();
/* Placing a fresh page via userfaultfd may set the PTE dirty. */
-#ifdef __NR_userfaultfd
test_uffdio_copy();
-#endif /* __NR_userfaultfd */
err = ksft_get_fail_cnt();
if (err)
diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h
index 4417eaa5cfb7..1e5731bab499 100644
--- a/tools/testing/selftests/mm/mlock2.h
+++ b/tools/testing/selftests/mm/mlock2.h
@@ -3,6 +3,7 @@
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
+#include <asm-generic/unistd.h>
static int mlock2_(void *start, size_t len, int flags)
{
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index 1b03bcfaefdf..5a3a9bcba640 100644
--- a/tools/testing/selftests/mm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -22,8 +22,10 @@
#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */
#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */
+#ifndef MIN
#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
+#endif
#define SIZE_MB(m) ((size_t)m * (1024 * 1024))
#define SIZE_KB(k) ((size_t)k * 1024)
diff --git a/tools/testing/selftests/mm/mseal_helpers.h b/tools/testing/selftests/mm/mseal_helpers.h
new file mode 100644
index 000000000000..0cfce31c76d2
--- /dev/null
+++ b/tools/testing/selftests/mm/mseal_helpers.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * If test_passed is false, record a failed kselftest result tagged with the
+ * calling function's name and the line number, then return from the calling
+ * function. The bare "return;" means this is for functions returning void.
+ */
+#define FAIL_TEST_IF_FALSE(test_passed) \
+ do { \
+ if (!(test_passed)) { \
+ ksft_test_result_fail("%s: line:%d\n", \
+ __func__, __LINE__); \
+ return; \
+ } \
+ } while (0)
+
+/*
+ * Same as FAIL_TEST_IF_FALSE(), but the result is recorded as skipped
+ * instead of failed.
+ */
+#define SKIP_TEST_IF_FALSE(test_passed) \
+ do { \
+ if (!(test_passed)) { \
+ ksft_test_result_skip("%s: line:%d\n", \
+ __func__, __LINE__); \
+ return; \
+ } \
+ } while (0)
+
+/* Record a passing kselftest result named after the calling function. */
+#define REPORT_TEST_PASS() ksft_test_result_pass("%s\n", __func__)
+
+#ifndef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS 0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+#endif
+
+#ifndef PKEY_BITS_PER_PKEY
+#define PKEY_BITS_PER_PKEY 2
+#endif
+
+#ifndef PKEY_MASK
+#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+#endif
+
+#ifndef u64
+#define u64 unsigned long long
+#endif
diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c
index 41998cf1dcf5..a818f010de47 100644
--- a/tools/testing/selftests/mm/mseal_test.c
+++ b/tools/testing/selftests/mm/mseal_test.c
@@ -3,7 +3,7 @@
#include <linux/mman.h>
#include <sys/mman.h>
#include <stdint.h>
-#include <unistd.h>
+#include <asm-generic/unistd.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -17,54 +17,7 @@
#include <sys/ioctl.h>
#include <sys/vfs.h>
#include <sys/stat.h>
-
-/*
- * need those definition for manually build using gcc.
- * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 mseal_test.c -o mseal_test
- */
-#ifndef PKEY_DISABLE_ACCESS
-# define PKEY_DISABLE_ACCESS 0x1
-#endif
-
-#ifndef PKEY_DISABLE_WRITE
-# define PKEY_DISABLE_WRITE 0x2
-#endif
-
-#ifndef PKEY_BITS_PER_PKEY
-#define PKEY_BITS_PER_PKEY 2
-#endif
-
-#ifndef PKEY_MASK
-#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
-#endif
-
-#define FAIL_TEST_IF_FALSE(c) do {\
- if (!(c)) {\
- ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\
- goto test_end;\
- } \
- } \
- while (0)
-
-#define SKIP_TEST_IF_FALSE(c) do {\
- if (!(c)) {\
- ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\
- goto test_end;\
- } \
- } \
- while (0)
-
-
-#define TEST_END_CHECK() {\
- ksft_test_result_pass("%s\n", __func__);\
- return;\
-test_end:\
- return;\
-}
-
-#ifndef u64
-#define u64 unsigned long long
-#endif
+#include "mseal_helpers.h"
static unsigned long get_vma_size(void *addr, int *prot)
{
@@ -287,7 +240,7 @@ static void test_seal_addseal(void)
ret = sys_mseal(ptr, size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_unmapped_start(void)
@@ -315,7 +268,7 @@ static void test_seal_unmapped_start(void)
ret = sys_mseal(ptr + 2 * page_size, 2 * page_size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_unmapped_middle(void)
@@ -347,7 +300,7 @@ static void test_seal_unmapped_middle(void)
ret = sys_mseal(ptr + 3 * page_size, page_size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_unmapped_end(void)
@@ -376,7 +329,7 @@ static void test_seal_unmapped_end(void)
ret = sys_mseal(ptr, 2 * page_size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_multiple_vmas(void)
@@ -407,7 +360,7 @@ static void test_seal_multiple_vmas(void)
ret = sys_mseal(ptr, size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_split_start(void)
@@ -432,7 +385,7 @@ static void test_seal_split_start(void)
ret = sys_mseal(ptr + page_size, 3 * page_size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_split_end(void)
@@ -457,7 +410,7 @@ static void test_seal_split_end(void)
ret = sys_mseal(ptr, 3 * page_size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_invalid_input(void)
@@ -492,7 +445,7 @@ static void test_seal_invalid_input(void)
ret = sys_mseal(ptr - page_size, 5 * page_size);
FAIL_TEST_IF_FALSE(ret < 0);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_zero_length(void)
@@ -516,7 +469,7 @@ static void test_seal_zero_length(void)
ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_zero_address(void)
@@ -542,7 +495,7 @@ static void test_seal_zero_address(void)
ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
FAIL_TEST_IF_FALSE(ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_twice(void)
@@ -562,7 +515,7 @@ static void test_seal_twice(void)
ret = sys_mseal(ptr, size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect(bool seal)
@@ -586,7 +539,7 @@ static void test_seal_mprotect(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_start_mprotect(bool seal)
@@ -616,7 +569,7 @@ static void test_seal_start_mprotect(bool seal)
PROT_READ | PROT_WRITE);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_end_mprotect(bool seal)
@@ -646,7 +599,7 @@ static void test_seal_end_mprotect(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_unalign_len(bool seal)
@@ -675,7 +628,7 @@ static void test_seal_mprotect_unalign_len(bool seal)
PROT_READ | PROT_WRITE);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_unalign_len_variant_2(bool seal)
@@ -703,7 +656,7 @@ static void test_seal_mprotect_unalign_len_variant_2(bool seal)
PROT_READ | PROT_WRITE);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_two_vma(bool seal)
@@ -738,7 +691,7 @@ static void test_seal_mprotect_two_vma(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_two_vma_with_split(bool seal)
@@ -785,7 +738,7 @@ static void test_seal_mprotect_two_vma_with_split(bool seal)
PROT_READ | PROT_WRITE);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_partial_mprotect(bool seal)
@@ -811,7 +764,7 @@ static void test_seal_mprotect_partial_mprotect(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_two_vma_with_gap(bool seal)
@@ -854,7 +807,7 @@ static void test_seal_mprotect_two_vma_with_gap(bool seal)
ret = sys_mprotect(ptr + 3 * page_size, page_size, PROT_READ);
FAIL_TEST_IF_FALSE(ret == 0);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_split(bool seal)
@@ -891,7 +844,7 @@ static void test_seal_mprotect_split(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mprotect_merge(bool seal)
@@ -925,7 +878,7 @@ static void test_seal_mprotect_merge(bool seal)
ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ);
FAIL_TEST_IF_FALSE(ret == 0);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_munmap(bool seal)
@@ -950,7 +903,7 @@ static void test_seal_munmap(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
/*
@@ -990,7 +943,7 @@ static void test_seal_munmap_two_vma(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
/*
@@ -1028,7 +981,7 @@ static void test_seal_munmap_vma_with_gap(bool seal)
ret = sys_munmap(ptr, size);
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_munmap_start_freed(bool seal)
@@ -1068,7 +1021,7 @@ static void test_munmap_start_freed(bool seal)
FAIL_TEST_IF_FALSE(size == 0);
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_munmap_end_freed(bool seal)
@@ -1098,7 +1051,7 @@ static void test_munmap_end_freed(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_munmap_middle_freed(bool seal)
@@ -1142,7 +1095,7 @@ static void test_munmap_middle_freed(bool seal)
FAIL_TEST_IF_FALSE(size == 0);
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_shrink(bool seal)
@@ -1171,7 +1124,7 @@ static void test_seal_mremap_shrink(bool seal)
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_expand(bool seal)
@@ -1203,7 +1156,7 @@ static void test_seal_mremap_expand(bool seal)
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_move(bool seal)
@@ -1236,7 +1189,7 @@ static void test_seal_mremap_move(bool seal)
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mmap_overwrite_prot(bool seal)
@@ -1264,7 +1217,7 @@ static void test_seal_mmap_overwrite_prot(bool seal)
} else
FAIL_TEST_IF_FALSE(ret2 == ptr);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mmap_expand(bool seal)
@@ -1295,7 +1248,7 @@ static void test_seal_mmap_expand(bool seal)
} else
FAIL_TEST_IF_FALSE(ret2 == ptr);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mmap_shrink(bool seal)
@@ -1323,7 +1276,7 @@ static void test_seal_mmap_shrink(bool seal)
} else
FAIL_TEST_IF_FALSE(ret2 == ptr);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_shrink_fixed(bool seal)
@@ -1354,7 +1307,7 @@ static void test_seal_mremap_shrink_fixed(bool seal)
} else
FAIL_TEST_IF_FALSE(ret2 == newAddr);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_expand_fixed(bool seal)
@@ -1385,7 +1338,7 @@ static void test_seal_mremap_expand_fixed(bool seal)
} else
FAIL_TEST_IF_FALSE(ret2 == newAddr);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_move_fixed(bool seal)
@@ -1415,7 +1368,7 @@ static void test_seal_mremap_move_fixed(bool seal)
} else
FAIL_TEST_IF_FALSE(ret2 == newAddr);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_move_fixed_zero(bool seal)
@@ -1447,7 +1400,7 @@ static void test_seal_mremap_move_fixed_zero(bool seal)
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_move_dontunmap(bool seal)
@@ -1476,7 +1429,7 @@ static void test_seal_mremap_move_dontunmap(bool seal)
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_mremap_move_dontunmap_anyaddr(bool seal)
@@ -1510,7 +1463,7 @@ static void test_seal_mremap_move_dontunmap_anyaddr(bool seal)
}
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
@@ -1603,7 +1556,7 @@ static void test_seal_merge_and_split(void)
FAIL_TEST_IF_FALSE(size == 22 * page_size);
FAIL_TEST_IF_FALSE(prot == 0x4);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_discard_ro_anon_on_rw(bool seal)
@@ -1632,7 +1585,7 @@ static void test_seal_discard_ro_anon_on_rw(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_discard_ro_anon_on_pkey(bool seal)
@@ -1679,7 +1632,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_discard_ro_anon_on_filebacked(bool seal)
@@ -1716,7 +1669,7 @@ static void test_seal_discard_ro_anon_on_filebacked(bool seal)
FAIL_TEST_IF_FALSE(!ret);
close(fd);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_discard_ro_anon_on_shared(bool seal)
@@ -1745,7 +1698,7 @@ static void test_seal_discard_ro_anon_on_shared(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
static void test_seal_discard_ro_anon(bool seal)
@@ -1775,7 +1728,7 @@ static void test_seal_discard_ro_anon(bool seal)
else
FAIL_TEST_IF_FALSE(!ret);
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
int main(int argc, char **argv)
diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
index 2d785aca72a5..fc90af2a97b8 100644
--- a/tools/testing/selftests/mm/pagemap_ioctl.c
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -15,7 +15,7 @@
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <math.h>
-#include <asm/unistd.h>
+#include <asm-generic/unistd.h>
#include <pthread.h>
#include <sys/resource.h>
#include <assert.h>
@@ -1567,8 +1567,10 @@ int main(int argc, char *argv[])
/* 7. File Hugetlb testing */
mem_size = 2*1024*1024;
fd = memfd_create("uffd-test", MFD_HUGETLB | MFD_NOEXEC_SEAL);
+ if (fd < 0)
+ ksft_exit_fail_msg("uffd-test creation failed %d %s\n", errno, strerror(errno));
mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- if (mem) {
+ if (mem != MAP_FAILED) {
wp_init(mem, mem_size);
wp_addr_range(mem, mem_size);
diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
index 48dc151f8fca..eaa6d1fc5328 100644
--- a/tools/testing/selftests/mm/protection_keys.c
+++ b/tools/testing/selftests/mm/protection_keys.c
@@ -42,7 +42,7 @@
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
-#include <unistd.h>
+#include <asm-generic/unistd.h>
#include <sys/ptrace.h>
#include <setjmp.h>
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 3157204b9047..36045edb10de 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -265,6 +265,7 @@ CATEGORY="hugetlb" run_test ./map_hugetlb
CATEGORY="hugetlb" run_test ./hugepage-mremap
CATEGORY="hugetlb" run_test ./hugepage-vmemmap
CATEGORY="hugetlb" run_test ./hugetlb-madvise
+CATEGORY="hugetlb" run_test ./hugetlb_dio
nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages)
# For this test, we need one and just one huge page
@@ -331,6 +332,12 @@ CATEGORY="hugetlb" run_test ./thuge-gen
CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2
CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2
if $RUN_DESTRUCTIVE; then
+nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages)
+enable_soft_offline=$(cat /proc/sys/vm/enable_soft_offline)
+echo 8 > /proc/sys/vm/nr_hugepages
+CATEGORY="hugetlb" run_test ./hugetlb-soft-offline
+echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages
+echo "$enable_soft_offline" > /proc/sys/vm/enable_soft_offline
CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison
fi
@@ -367,8 +374,11 @@ CATEGORY="hmm" run_test bash ./test_hmm.sh smoke
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+if [ -x ./memfd_secret ]
+then
(echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix
CATEGORY="memfd_secret" run_test ./memfd_secret
+fi
# KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100
CATEGORY="ksm" run_test ./ksm_tests -H -s 100
diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c
index f2babec79bb6..7aa1366063e4 100644
--- a/tools/testing/selftests/mm/seal_elf.c
+++ b/tools/testing/selftests/mm/seal_elf.c
@@ -2,7 +2,7 @@
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdint.h>
-#include <unistd.h>
+#include <asm-generic/unistd.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -16,38 +16,7 @@
#include <sys/ioctl.h>
#include <sys/vfs.h>
#include <sys/stat.h>
-
-/*
- * need those definition for manually build using gcc.
- * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 seal_elf.c -o seal_elf
- */
-#define FAIL_TEST_IF_FALSE(c) do {\
- if (!(c)) {\
- ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\
- goto test_end;\
- } \
- } \
- while (0)
-
-#define SKIP_TEST_IF_FALSE(c) do {\
- if (!(c)) {\
- ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\
- goto test_end;\
- } \
- } \
- while (0)
-
-
-#define TEST_END_CHECK() {\
- ksft_test_result_pass("%s\n", __func__);\
- return;\
-test_end:\
- return;\
-}
-
-#ifndef u64
-#define u64 unsigned long long
-#endif
+#include "mseal_helpers.h"
/*
* define sys_xyx to call syscall directly.
@@ -158,7 +127,7 @@ static void test_seal_elf(void)
FAIL_TEST_IF_FALSE(ret < 0);
ksft_print_msg("somestr is sealed, mprotect is rejected\n");
- TEST_END_CHECK();
+ REPORT_TEST_PASS();
}
int main(int argc, char **argv)
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index d3c7f5fb3e7b..e5e8dafc9d94 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -300,7 +300,7 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
char **addr)
{
size_t i;
- int __attribute__((unused)) dummy = 0;
+ int dummy = 0;
srand(time(NULL));
@@ -341,6 +341,7 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
for (size_t i = 0; i < fd_size; i++)
dummy += *(*addr + i);
+ asm volatile("" : "+r" (dummy));
if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) {
ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n");
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index ea7fd8fe2876..e4370b79b62f 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -13,8 +13,9 @@
sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
(warning this will remove all if someone else uses them) */
-#define _GNU_SOURCE 1
+#define _GNU_SOURCE
#include <sys/mman.h>
+#include <linux/mman.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/ipc.h>
@@ -28,19 +29,23 @@
#include "vm_util.h"
#include "../kselftest.h"
-#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_SHIFT 26
-#define MAP_HUGE_MASK 0x3f
#if !defined(MAP_HUGETLB)
#define MAP_HUGETLB 0x40000
#endif
#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
+#ifndef SHM_HUGE_SHIFT
#define SHM_HUGE_SHIFT 26
+#endif
+#ifndef SHM_HUGE_MASK
#define SHM_HUGE_MASK 0x3f
+#endif
+#ifndef SHM_HUGE_2MB
#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
+#endif
+#ifndef SHM_HUGE_1GB
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
+#endif
#define NUM_PAGESIZES 5
#define NUM_PAGES 4
diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c
index 7ad6ba660c7d..717539eddf98 100644
--- a/tools/testing/selftests/mm/uffd-common.c
+++ b/tools/testing/selftests/mm/uffd-common.c
@@ -673,11 +673,7 @@ int uffd_open_dev(unsigned int flags)
int uffd_open_sys(unsigned int flags)
{
-#ifdef __NR_userfaultfd
return syscall(__NR_userfaultfd, flags);
-#else
- return -1;
-#endif
}
int uffd_open(unsigned int flags)
diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c
index f78bab0f3d45..a4b83280998a 100644
--- a/tools/testing/selftests/mm/uffd-stress.c
+++ b/tools/testing/selftests/mm/uffd-stress.c
@@ -33,10 +33,10 @@
* pthread_mutex_lock will also verify the atomicity of the memory
* transfer (UFFDIO_COPY).
*/
-
+#include <asm-generic/unistd.h>
#include "uffd-common.h"
-#ifdef __NR_userfaultfd
+uint64_t features;
#define BOUNCE_RANDOM (1<<0)
#define BOUNCE_RACINGFAULTS (1<<1)
@@ -247,10 +247,14 @@ static int userfaultfd_stress(void)
unsigned long nr;
struct uffd_args args[nr_cpus];
uint64_t mem_size = nr_pages * page_size;
+ int flags = 0;
memset(args, 0, sizeof(struct uffd_args) * nr_cpus);
- if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL))
+ if (features & UFFD_FEATURE_WP_UNPOPULATED && test_type == TEST_ANON)
+ flags = UFFD_FEATURE_WP_UNPOPULATED;
+
+ if (uffd_test_ctx_init(flags, NULL))
err("context init failed");
if (posix_memalign(&area, page_size, page_size))
@@ -385,8 +389,6 @@ static void set_test_type(const char *type)
static void parse_test_type_arg(const char *raw_type)
{
- uint64_t features = UFFD_API_FEATURES;
-
set_test_type(raw_type);
if (!test_type)
@@ -409,12 +411,15 @@ static void parse_test_type_arg(const char *raw_type)
* feature.
*/
- if (userfaultfd_open(&features))
- err("Userfaultfd open failed");
+ if (uffd_get_features(&features))
+ err("failed to get available features");
test_uffdio_wp = test_uffdio_wp &&
(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+ if (test_type != TEST_ANON && !(features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM))
+ test_uffdio_wp = false;
+
close(uffd);
uffd = -1;
}
@@ -466,15 +471,3 @@ int main(int argc, char **argv)
nr_pages, nr_pages_per_cpu);
return userfaultfd_stress();
}
-
-#else /* __NR_userfaultfd */
-
-#warning "missing __NR_userfaultfd definition"
-
-int main(void)
-{
- printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
- return KSFT_SKIP;
-}
-
-#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index 21ec23206ab4..b3d21eed203d 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -5,12 +5,11 @@
* Copyright (C) 2015-2023 Red Hat, Inc.
*/
+#include <asm-generic/unistd.h>
#include "uffd-common.h"
#include "../../../../mm/gup_test.h"
-#ifdef __NR_userfaultfd
-
/* The unit test doesn't need a large or random size, make it 32MB for now */
#define UFFD_TEST_MEM_SIZE (32UL << 20)
@@ -1554,14 +1553,3 @@ int main(int argc, char *argv[])
return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS;
}
-#else /* __NR_userfaultfd */
-
-#warning "missing __NR_userfaultfd definition"
-
-int main(void)
-{
- printf("Skipping %s (missing __NR_userfaultfd)\n", __file__);
- return KSFT_SKIP;
-}
-
-#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c
index cfbc501290d3..896b3f73fc53 100644
--- a/tools/testing/selftests/mm/va_high_addr_switch.c
+++ b/tools/testing/selftests/mm/va_high_addr_switch.c
@@ -9,26 +9,9 @@
#include <sys/mman.h>
#include <string.h>
+#include "vm_util.h"
#include "../kselftest.h"
-#ifdef __powerpc64__
-#define PAGE_SIZE (64 << 10)
-/*
- * This will work with 16M and 2M hugepage size
- */
-#define HUGETLB_SIZE (16 << 20)
-#elif __aarch64__
-/*
- * The default hugepage size for 64k base pagesize
- * is 512MB.
- */
-#define PAGE_SIZE (64 << 10)
-#define HUGETLB_SIZE (512 << 20)
-#else
-#define PAGE_SIZE (4 << 10)
-#define HUGETLB_SIZE (2 << 20)
-#endif
-
/*
* The hint addr value is used to allocate addresses
* beyond the high address switch boundary.
@@ -37,18 +20,8 @@
#define ADDR_MARK_128TB (1UL << 47)
#define ADDR_MARK_256TB (1UL << 48)
-#define HIGH_ADDR_128TB ((void *) (1UL << 48))
-#define HIGH_ADDR_256TB ((void *) (1UL << 49))
-
-#define LOW_ADDR ((void *) (1UL << 30))
-
-#ifdef __aarch64__
-#define ADDR_SWITCH_HINT ADDR_MARK_256TB
-#define HIGH_ADDR HIGH_ADDR_256TB
-#else
-#define ADDR_SWITCH_HINT ADDR_MARK_128TB
-#define HIGH_ADDR HIGH_ADDR_128TB
-#endif
+#define HIGH_ADDR_128TB (1UL << 48)
+#define HIGH_ADDR_256TB (1UL << 49)
struct testcase {
void *addr;
@@ -59,195 +32,230 @@ struct testcase {
unsigned int keep_mapped:1;
};
-static struct testcase testcases[] = {
- {
- /*
- * If stack is moved, we could possibly allocate
- * this at the requested address.
- */
- .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
- .size = PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
- .low_addr_required = 1,
- },
- {
- /*
- * Unless MAP_FIXED is specified, allocation based on hint
- * addr is never at requested address or above it, which is
- * beyond high address switch boundary in this case. Instead,
- * a suitable allocation is found in lower address space.
- */
- .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
- .low_addr_required = 1,
- },
- {
- /*
- * Exact mapping at high address switch boundary, should
- * be obtained even without MAP_FIXED as area is free.
- */
- .addr = ((void *)(ADDR_SWITCH_HINT)),
- .size = PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
- .keep_mapped = 1,
- },
- {
- .addr = (void *)(ADDR_SWITCH_HINT),
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
- .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
- },
- {
- .addr = NULL,
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(NULL)",
- .low_addr_required = 1,
- },
- {
- .addr = LOW_ADDR,
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(LOW_ADDR)",
- .low_addr_required = 1,
- },
- {
- .addr = HIGH_ADDR,
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(HIGH_ADDR)",
- .keep_mapped = 1,
- },
- {
- .addr = HIGH_ADDR,
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(HIGH_ADDR) again",
- .keep_mapped = 1,
- },
- {
- .addr = HIGH_ADDR,
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
- .msg = "mmap(HIGH_ADDR, MAP_FIXED)",
- },
- {
- .addr = (void *) -1,
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(-1)",
- .keep_mapped = 1,
- },
- {
- .addr = (void *) -1,
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(-1) again",
- },
- {
- .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
- .size = PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
- .low_addr_required = 1,
- },
- {
- .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
- .low_addr_required = 1,
- .keep_mapped = 1,
- },
- {
- .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
- .low_addr_required = 1,
- .keep_mapped = 1,
- },
- {
- .addr = ((void *)(ADDR_SWITCH_HINT)),
- .size = PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
- },
- {
- .addr = (void *)(ADDR_SWITCH_HINT),
- .size = 2 * PAGE_SIZE,
- .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
- .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
- },
-};
+static struct testcase *testcases;
+static struct testcase *hugetlb_testcases;
+static int sz_testcases, sz_hugetlb_testcases;
+static unsigned long switch_hint;
-static struct testcase hugetlb_testcases[] = {
- {
- .addr = NULL,
- .size = HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(NULL, MAP_HUGETLB)",
- .low_addr_required = 1,
- },
- {
- .addr = LOW_ADDR,
- .size = HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
- .low_addr_required = 1,
- },
- {
- .addr = HIGH_ADDR,
- .size = HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
- .keep_mapped = 1,
- },
- {
- .addr = HIGH_ADDR,
- .size = HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
- .keep_mapped = 1,
- },
- {
- .addr = HIGH_ADDR,
- .size = HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
- .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
- },
- {
- .addr = (void *) -1,
- .size = HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(-1, MAP_HUGETLB)",
- .keep_mapped = 1,
- },
- {
- .addr = (void *) -1,
- .size = HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(-1, MAP_HUGETLB) again",
- },
- {
- .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
- .size = 2 * HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
- .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
- .low_addr_required = 1,
- .keep_mapped = 1,
- },
- {
- .addr = (void *)(ADDR_SWITCH_HINT),
- .size = 2 * HUGETLB_SIZE,
- .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
- .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
- },
-};
+/* Initialize testcases inside a function to compute parameters at runtime */
+void testcases_init(void)
+{
+ unsigned long pagesize = getpagesize();
+ unsigned long hugepagesize = default_huge_page_size();
+ unsigned long low_addr = (1UL << 30);
+ unsigned long addr_switch_hint = ADDR_MARK_128TB;
+ unsigned long high_addr = HIGH_ADDR_128TB;
+
+#ifdef __aarch64__
+
+ /* Post LPA2, the lower userspace VA on a 16K pagesize is 47 bits. */
+ if (pagesize != (16UL << 10)) {
+ addr_switch_hint = ADDR_MARK_256TB;
+ high_addr = HIGH_ADDR_256TB;
+ }
+#endif
+
+ struct testcase t[] = {
+ {
+ /*
+ * If stack is moved, we could possibly allocate
+ * this at the requested address.
+ */
+ .addr = ((void *)(addr_switch_hint - pagesize)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, pagesize)",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * Unless MAP_FIXED is specified, allocation based on hint
+ * addr is never at requested address or above it, which is
+ * beyond high address switch boundary in this case. Instead,
+ * a suitable allocation is found in lower address space.
+ */
+ .addr = ((void *)(addr_switch_hint - pagesize)),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, (2 * pagesize))",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * Exact mapping at high address switch boundary, should
+ * be obtained even without MAP_FIXED as area is free.
+ */
+ .addr = ((void *)(addr_switch_hint)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint, pagesize)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED)",
+ },
+ {
+ .addr = NULL,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)low_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(low_addr)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(high_addr, MAP_FIXED)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1) again",
+ },
+ {
+ .addr = ((void *)(addr_switch_hint - pagesize)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, pagesize)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint - pagesize),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, 2 * pagesize)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint - pagesize / 2),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize/2 , 2 * pagesize)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = ((void *)(addr_switch_hint)),
+ .size = pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint, pagesize)",
+ },
+ {
+ .addr = (void *)(addr_switch_hint),
+ .size = 2 * pagesize,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED)",
+ },
+ };
+
+ struct testcase ht[] = {
+ {
+ .addr = NULL,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)low_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(low_addr, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(high_addr, MAP_HUGETLB) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)high_addr,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(high_addr, MAP_FIXED | MAP_HUGETLB)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB) again",
+ },
+ {
+ .addr = (void *)(addr_switch_hint - pagesize),
+ .size = 2 * hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(addr_switch_hint - pagesize, 2*hugepagesize, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(addr_switch_hint),
+ .size = 2 * hugepagesize,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB)",
+ },
+ };
+
+ testcases = malloc(sizeof(t));
+ hugetlb_testcases = malloc(sizeof(ht));
+
+ /* Copy into global arrays */
+ memcpy(testcases, t, sizeof(t));
+ memcpy(hugetlb_testcases, ht, sizeof(ht));
+
+ sz_testcases = ARRAY_SIZE(t);
+ sz_hugetlb_testcases = ARRAY_SIZE(ht);
+ switch_hint = addr_switch_hint;
+}
static int run_test(struct testcase *test, int count)
{
@@ -267,7 +275,7 @@ static int run_test(struct testcase *test, int count)
continue;
}
- if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
+ if (t->low_addr_required && p >= (void *)(switch_hint)) {
printf("FAILED\n");
ret = KSFT_FAIL;
} else {
@@ -285,6 +293,20 @@ static int run_test(struct testcase *test, int count)
return ret;
}
+#ifdef __aarch64__
+/* Check if userspace VA > 48 bits */
+static int high_address_present(void)
+{
+ void *ptr = mmap((void *)(1UL << 50), 1, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (ptr == MAP_FAILED)
+ return 0;
+
+ munmap(ptr, 1);
+ return 1;
+}
+#endif
+
static int supported_arch(void)
{
#if defined(__powerpc64__)
@@ -292,7 +314,7 @@ static int supported_arch(void)
#elif defined(__x86_64__)
return 1;
#elif defined(__aarch64__)
- return getpagesize() == PAGE_SIZE;
+ return high_address_present();
#else
return 0;
#endif
@@ -305,8 +327,10 @@ int main(int argc, char **argv)
if (!supported_arch())
return KSFT_SKIP;
- ret = run_test(testcases, ARRAY_SIZE(testcases));
+ testcases_init();
+
+ ret = run_test(testcases, sz_testcases);
if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
- ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
+ ret = run_test(hugetlb_testcases, sz_hugetlb_testcases);
return ret;
}
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index a0a75f302904..2c725773cd79 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -57,8 +57,4 @@ check_test_requirements()
}
check_test_requirements
-./va_high_addr_switch
-
-# In order to run hugetlb testcases, "--run-hugetlb" must be appended
-# to the binary.
./va_high_addr_switch --run-hugetlb
diff --git a/tools/testing/selftests/mqueue/mq_perf_tests.c b/tools/testing/selftests/mqueue/mq_perf_tests.c
index 5c16159d0bcd..fb898850867c 100644
--- a/tools/testing/selftests/mqueue/mq_perf_tests.c
+++ b/tools/testing/selftests/mqueue/mq_perf_tests.c
@@ -323,7 +323,8 @@ void *fake_cont_thread(void *arg)
void *cont_thread(void *arg)
{
char buff[MSG_SIZE];
- int i, priority;
+ int i;
+ unsigned int priority;
for (i = 0; i < num_cpus_to_pin; i++)
if (cpu_threads[i] == pthread_self())
@@ -425,7 +426,8 @@ struct test test2[] = {
void *perf_test_thread(void *arg)
{
char buff[MSG_SIZE];
- int prio_out, prio_in;
+ int prio_out;
+ unsigned int prio_in;
int i;
clockid_t clock;
pthread_t *t;
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index bc3925200637..8eaffd7a641c 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for net selftests
-CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
+CFLAGS += -Wall -Wl,--no-as-needed -O2 -g
CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES)
# Additional include paths needed by kselftest.h
CFLAGS += -I../
diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c
index 16d0c172eaeb..535eb2c3d7d1 100644
--- a/tools/testing/selftests/net/af_unix/msg_oob.c
+++ b/tools/testing/selftests/net/af_unix/msg_oob.c
@@ -209,7 +209,7 @@ static void __sendpair(struct __test_metadata *_metadata,
static void __recvpair(struct __test_metadata *_metadata,
FIXTURE_DATA(msg_oob) *self,
- const void *expected_buf, int expected_len,
+ const char *expected_buf, int expected_len,
int buf_len, int flags)
{
int i, ret[2], recv_errno[2], expected_errno = 0;
diff --git a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh
index 0760a34b7114..a21b7085da2e 100755
--- a/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_fdb_learning_limit.sh
@@ -178,6 +178,22 @@ fdb_del()
check_err $? "Failed to remove a FDB entry of type ${type}"
}
+check_fdb_n_learned_support()
+{
+ if ! ip link help bridge 2>&1 | grep -q "fdb_max_learned"; then
+ echo "SKIP: iproute2 too old, missing bridge max learned support"
+ exit $ksft_skip
+ fi
+
+ ip link add dev br0 type bridge
+ local learned=$(fdb_get_n_learned)
+ ip link del dev br0
+ if [ "$learned" == "null" ]; then
+ echo "SKIP: kernel too old; bridge fdb_n_learned feature not supported."
+ exit $ksft_skip
+ fi
+}
+
check_accounting_one_type()
{
local type=$1 is_counted=$2 overrides_learned=$3
@@ -274,6 +290,8 @@ check_limit()
done
}
+check_fdb_n_learned_support
+
trap cleanup EXIT
setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
index 64bd00fe9a4f..90f8a244ea90 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
@@ -1,7 +1,7 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
-ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn"
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid"
NUM_NETIFS=4
CHECK_TC="yes"
source lib.sh
@@ -142,6 +142,58 @@ extern_learn()
bridge fdb del de:ad:be:ef:13:37 dev $swp1 master vlan 1 &> /dev/null
}
+other_tpid()
+{
+ local mac=de:ad:be:ef:13:37
+
+ # Test that packets with TPID 802.1ad VID 3 + TPID 802.1Q VID 5 are
+ # classified as untagged by a bridge with vlan_protocol 802.1Q, and
+ # are processed in the PVID of the ingress port (here 1). Not VID 3,
+ # and not VID 5.
+ RET=0
+
+ tc qdisc add dev $h2 clsact
+ tc filter add dev $h2 ingress protocol all pref 1 handle 101 \
+ flower dst_mac $mac action drop
+ ip link set $h2 promisc on
+ ethtool -K $h2 rx-vlan-filter off rx-vlan-stag-filter off
+
+ $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa"
+ sleep 1
+
+ # Match on 'self' addresses as well, for those drivers which
+ # do not push their learned addresses to the bridge software
+ # database
+ bridge -j fdb show $swp1 | \
+ jq -e ".[] | select(.mac == \"$(mac_get $h1)\") | select(.vlan == 1)" &> /dev/null
+ check_err $? "FDB entry was not learned when it should"
+
+ log_test "FDB entry in PVID for VLAN-tagged with other TPID"
+
+ RET=0
+ tc -j -s filter show dev $h2 ingress \
+ | jq -e ".[] | select(.options.handle == 101) \
+ | select(.options.actions[0].stats.packets == 1)" &> /dev/null
+ check_err $? "Packet was not forwarded when it should"
+ log_test "Reception of VLAN with other TPID as untagged"
+
+ bridge vlan del dev $swp1 vid 1
+
+ $MZ -q $h1 -c 1 -b $mac -a own "88:a8 00:03 81:00 00:05 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa"
+ sleep 1
+
+ RET=0
+ tc -j -s filter show dev $h2 ingress \
+ | jq -e ".[] | select(.options.handle == 101) \
+ | select(.options.actions[0].stats.packets == 1)" &> /dev/null
+ check_err $? "Packet was forwarded when should not"
+ log_test "Reception of VLAN with other TPID as untagged (no PVID)"
+
+ bridge vlan add dev $swp1 vid 1 pvid untagged
+ ip link set $h2 promisc off
+ tc qdisc del dev $h2 clsact
+}
+
trap cleanup EXIT
setup_prepare
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index ff96bb7535ff..718d04a4f72d 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -500,6 +500,11 @@ check_err_fail()
fi
}
+xfail()
+{
+ FAIL_TO_XFAIL=yes "$@"
+}
+
xfail_on_slow()
{
if [[ $KSFT_MACHINE_SLOW = yes ]]; then
@@ -1113,6 +1118,39 @@ mac_get()
ip -j link show dev $if_name | jq -r '.[]["address"]'
}
+ether_addr_to_u64()
+{
+ local addr="$1"
+ local order="$((1 << 40))"
+ local val=0
+ local byte
+
+ addr="${addr//:/ }"
+
+ for byte in $addr; do
+ byte="0x$byte"
+ val=$((val + order * byte))
+ order=$((order >> 8))
+ done
+
+ printf "0x%x" $val
+}
+
+u64_to_ether_addr()
+{
+ local val=$1
+ local byte
+ local i
+
+ for ((i = 40; i >= 0; i -= 8)); do
+ byte=$(((val & (0xff << i)) >> i))
+ printf "%02x" $byte
+ if [ $i -ne 0 ]; then
+ printf ":"
+ fi
+ done
+}
+
ipv6_lladdr_get()
{
local if_name=$1
@@ -2229,3 +2267,22 @@ absval()
echo $((v > 0 ? v : -v))
}
+
+has_unicast_flt()
+{
+ local dev=$1; shift
+ local mac_addr=$(mac_get $dev)
+ local tmp=$(ether_addr_to_u64 $mac_addr)
+ local promisc
+
+ ip link set $dev up
+ ip link add link $dev name macvlan-tmp type macvlan mode private
+ ip link set macvlan-tmp address $(u64_to_ether_addr $((tmp + 1)))
+ ip link set macvlan-tmp up
+
+ promisc=$(ip -j -d link show dev $dev | jq -r '.[].promiscuity')
+
+ ip link del macvlan-tmp
+
+ [[ $promisc == 1 ]] && echo "no" || echo "yes"
+}
diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh
index 4b364cdf3ef0..648868f74604 100755
--- a/tools/testing/selftests/net/forwarding/local_termination.sh
+++ b/tools/testing/selftests/net/forwarding/local_termination.sh
@@ -1,7 +1,9 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
-ALL_TESTS="standalone bridge"
+ALL_TESTS="standalone vlan_unaware_bridge vlan_aware_bridge test_vlan \
+ vlan_over_vlan_unaware_bridged_port vlan_over_vlan_aware_bridged_port \
+ vlan_over_vlan_unaware_bridge vlan_over_vlan_aware_bridge"
NUM_NETIFS=2
PING_COUNT=1
REQUIRE_MTOOLS=yes
@@ -37,9 +39,68 @@ UNKNOWN_MACV6_MC_ADDR1="33:33:01:02:03:05"
UNKNOWN_MACV6_MC_ADDR2="33:33:01:02:03:06"
UNKNOWN_MACV6_MC_ADDR3="33:33:01:02:03:07"
-NON_IP_MC="01:02:03:04:05:06"
-NON_IP_PKT="00:04 48:45:4c:4f"
-BC="ff:ff:ff:ff:ff:ff"
+PTP_1588_L2_SYNC=" \
+01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 00 02 \
+00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \
+00 00 00 00 00 00 00 00 00 00"
+PTP_1588_L2_FOLLOW_UP=" \
+01:1b:19:00:00:00 00:00:de:ad:be:ef 88:f7 08 02 \
+00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \
+00 00 66 83 c5 f1 17 97 ed f0"
+PTP_1588_L2_PDELAY_REQ=" \
+01:80:c2:00:00:0e 00:00:de:ad:be:ef 88:f7 02 02 \
+00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 06 05 7f \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 00 00"
+PTP_1588_IPV4_SYNC=" \
+01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \
+00 48 0a 9a 40 00 01 11 cb 88 c0 00 02 01 e0 00 \
+01 81 01 3f 01 3f 00 34 a3 c8 00 02 00 2c 00 00 \
+02 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \
+63 ff fe cf 17 0e 00 01 00 00 00 00 00 00 00 00 \
+00 00 00 00 00 00"
+PTP_1588_IPV4_FOLLOW_UP="
+01:00:5e:00:01:81 00:00:de:ad:be:ef 08:00 45 00 \
+00 48 0a 9b 40 00 01 11 cb 87 c0 00 02 01 e0 00 \
+01 81 01 40 01 40 00 34 a3 c8 08 02 00 2c 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \
+63 ff fe cf 17 0e 00 01 00 00 02 00 00 00 66 83 \
+c6 0f 1d 9a 61 87"
+PTP_1588_IPV4_PDELAY_REQ=" \
+01:00:5e:00:00:6b 00:00:de:ad:be:ef 08:00 45 00 \
+00 52 35 a9 40 00 01 11 a1 85 c0 00 02 01 e0 00 \
+00 6b 01 3f 01 3f 00 3e a2 bc 02 02 00 36 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 3e 37 \
+63 ff fe cf 17 0e 00 01 00 01 05 7f 00 00 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00"
+PTP_1588_IPV6_SYNC=" \
+33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 06 \
+7c 2f 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \
+00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \
+00 00 00 00 01 81 01 3f 01 3f 00 36 2e 92 00 02 \
+00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 00 00 \
+00 00 00 00 00 00 00 00 00 00 00 00"
+PTP_1588_IPV6_FOLLOW_UP=" \
+33:33:00:00:01:81 00:00:de:ad:be:ef 86:dd 60 0a \
+00 bc 00 36 11 01 20 01 0d b8 00 01 00 00 00 00 \
+00 00 00 00 00 01 ff 0e 00 00 00 00 00 00 00 00 \
+00 00 00 00 01 81 01 40 01 40 00 36 2e 92 08 02 \
+00 2c 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 00 02 00 \
+00 00 66 83 c6 2a 32 09 bd 74 00 00"
+PTP_1588_IPV6_PDELAY_REQ=" \
+33:33:00:00:00:6b 00:00:de:ad:be:ef 86:dd 60 0c \
+5c fd 00 40 11 01 fe 80 00 00 00 00 00 00 3c 37 \
+63 ff fe cf 17 0e ff 02 00 00 00 00 00 00 00 00 \
+00 00 00 00 00 6b 01 3f 01 3f 00 40 b4 54 02 02 \
+00 36 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 3e 37 63 ff fe cf 17 0e 00 01 00 01 05 7f \
+00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \
+00 00 00 00 00 00"
# Disable promisc to ensure we don't receive unknown MAC DA packets
export TCPDUMP_EXTRA_FLAGS="-pl"
@@ -47,13 +108,15 @@ export TCPDUMP_EXTRA_FLAGS="-pl"
h1=${NETIFS[p1]}
h2=${NETIFS[p2]}
-send_non_ip()
+send_raw()
{
- local if_name=$1
- local smac=$2
- local dmac=$3
+ local if_name=$1; shift
+ local pkt="$1"; shift
+ local smac=$(mac_get $if_name)
+
+ pkt="${pkt/00:00:de:ad:be:ef/$smac}"
- $MZ -q $if_name "$dmac $smac $NON_IP_PKT"
+ $MZ -q $if_name "$pkt"
}
send_uc_ipv4()
@@ -68,10 +131,11 @@ send_uc_ipv4()
check_rcv()
{
- local if_name=$1
- local type=$2
- local pattern=$3
- local should_receive=$4
+ local if_name=$1; shift
+ local type=$1; shift
+ local pattern=$1; shift
+ local should_receive=$1; shift
+ local test_name="$1"; shift
local should_fail=
[ $should_receive = true ] && should_fail=0 || should_fail=1
@@ -81,7 +145,7 @@ check_rcv()
check_err_fail "$should_fail" "$?" "reception"
- log_test "$if_name: $type"
+ log_test "$test_name: $type"
}
mc_route_prepare()
@@ -104,44 +168,78 @@ mc_route_destroy()
run_test()
{
- local rcv_if_name=$1
- local smac=$(mac_get $h1)
+ local send_if_name=$1; shift
+ local rcv_if_name=$1; shift
+ local skip_ptp=$1; shift
+ local no_unicast_flt=$1; shift
+ local test_name="$1"; shift
+ local smac=$(mac_get $send_if_name)
local rcv_dmac=$(mac_get $rcv_if_name)
+ local should_receive
tcpdump_start $rcv_if_name
- mc_route_prepare $h1
+ mc_route_prepare $send_if_name
mc_route_prepare $rcv_if_name
- send_uc_ipv4 $h1 $rcv_dmac
- send_uc_ipv4 $h1 $MACVLAN_ADDR
- send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR1
+ send_uc_ipv4 $send_if_name $rcv_dmac
+ send_uc_ipv4 $send_if_name $MACVLAN_ADDR
+ send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR1
ip link set dev $rcv_if_name promisc on
- send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR2
- mc_send $h1 $UNKNOWN_IPV4_MC_ADDR2
- mc_send $h1 $UNKNOWN_IPV6_MC_ADDR2
+ send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR2
+ mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR2
+ mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR2
ip link set dev $rcv_if_name promisc off
mc_join $rcv_if_name $JOINED_IPV4_MC_ADDR
- mc_send $h1 $JOINED_IPV4_MC_ADDR
+ mc_send $send_if_name $JOINED_IPV4_MC_ADDR
mc_leave
mc_join $rcv_if_name $JOINED_IPV6_MC_ADDR
- mc_send $h1 $JOINED_IPV6_MC_ADDR
+ mc_send $send_if_name $JOINED_IPV6_MC_ADDR
mc_leave
- mc_send $h1 $UNKNOWN_IPV4_MC_ADDR1
- mc_send $h1 $UNKNOWN_IPV6_MC_ADDR1
+ mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR1
+ mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR1
ip link set dev $rcv_if_name allmulticast on
- send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR3
- mc_send $h1 $UNKNOWN_IPV4_MC_ADDR3
- mc_send $h1 $UNKNOWN_IPV6_MC_ADDR3
+ send_uc_ipv4 $send_if_name $UNKNOWN_UC_ADDR3
+ mc_send $send_if_name $UNKNOWN_IPV4_MC_ADDR3
+ mc_send $send_if_name $UNKNOWN_IPV6_MC_ADDR3
ip link set dev $rcv_if_name allmulticast off
mc_route_destroy $rcv_if_name
- mc_route_destroy $h1
+ mc_route_destroy $send_if_name
+
+ if [ $skip_ptp = false ]; then
+ ip maddress add 01:1b:19:00:00:00 dev $rcv_if_name
+ send_raw $send_if_name "$PTP_1588_L2_SYNC"
+ send_raw $send_if_name "$PTP_1588_L2_FOLLOW_UP"
+ ip maddress del 01:1b:19:00:00:00 dev $rcv_if_name
+
+ ip maddress add 01:80:c2:00:00:0e dev $rcv_if_name
+ send_raw $send_if_name "$PTP_1588_L2_PDELAY_REQ"
+ ip maddress del 01:80:c2:00:00:0e dev $rcv_if_name
+
+ mc_join $rcv_if_name 224.0.1.129
+ send_raw $send_if_name "$PTP_1588_IPV4_SYNC"
+ send_raw $send_if_name "$PTP_1588_IPV4_FOLLOW_UP"
+ mc_leave
+
+ mc_join $rcv_if_name 224.0.0.107
+ send_raw $send_if_name "$PTP_1588_IPV4_PDELAY_REQ"
+ mc_leave
+
+ mc_join $rcv_if_name ff0e::181
+ send_raw $send_if_name "$PTP_1588_IPV6_SYNC"
+ send_raw $send_if_name "$PTP_1588_IPV6_FOLLOW_UP"
+ mc_leave
+
+ mc_join $rcv_if_name ff02::6b
+ send_raw $send_if_name "$PTP_1588_IPV6_PDELAY_REQ"
+ mc_leave
+ fi
sleep 1
@@ -149,61 +247,99 @@ run_test()
check_rcv $rcv_if_name "Unicast IPv4 to primary MAC address" \
"$smac > $rcv_dmac, ethertype IPv4 (0x0800)" \
- true
+ true "$test_name"
check_rcv $rcv_if_name "Unicast IPv4 to macvlan MAC address" \
"$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \
- true
+ true "$test_name"
- xfail_on_veth $h1 \
- check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \
- "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \
- false
+ [ $no_unicast_flt = true ] && should_receive=true || should_receive=false
+ check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \
+ "$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \
+ $should_receive "$test_name"
check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \
"$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \
- true
+ true "$test_name"
- xfail_on_veth $h1 \
- check_rcv $rcv_if_name \
- "Unicast IPv4 to unknown MAC address, allmulti" \
- "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \
- false
+ [ $no_unicast_flt = true ] && should_receive=true || should_receive=false
+ check_rcv $rcv_if_name \
+ "Unicast IPv4 to unknown MAC address, allmulti" \
+ "$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \
+ $should_receive "$test_name"
check_rcv $rcv_if_name "Multicast IPv4 to joined group" \
"$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \
- true
+ true "$test_name"
- xfail_on_veth $h1 \
+ xfail \
check_rcv $rcv_if_name \
"Multicast IPv4 to unknown group" \
"$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \
- false
+ false "$test_name"
check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \
"$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \
- true
+ true "$test_name"
check_rcv $rcv_if_name "Multicast IPv4 to unknown group, allmulti" \
"$smac > $UNKNOWN_MACV4_MC_ADDR3, ethertype IPv4 (0x0800)" \
- true
+ true "$test_name"
check_rcv $rcv_if_name "Multicast IPv6 to joined group" \
"$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \
- true
+ true "$test_name"
- xfail_on_veth $h1 \
+ xfail \
check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \
"$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \
- false
+ false "$test_name"
check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \
"$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \
- true
+ true "$test_name"
check_rcv $rcv_if_name "Multicast IPv6 to unknown group, allmulti" \
"$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \
- true
+ true "$test_name"
+
+ if [ $skip_ptp = false ]; then
+ check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \
+ "ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \
+ "ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \
+ "ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \
+ "ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \
+ "ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \
+ "ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \
+ "ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \
+ "ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \
+ true "$test_name"
+
+ check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \
+ "ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \
+ true "$test_name"
+ fi
tcpdump_cleanup $rcv_if_name
}
@@ -228,57 +364,208 @@ h2_destroy()
simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64
}
+h1_vlan_create()
+{
+ simple_if_init $h1
+ vlan_create $h1 100 v$h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h1_vlan_destroy()
+{
+ vlan_destroy $h1 100
+ simple_if_fini $h1
+}
+
+h2_vlan_create()
+{
+ simple_if_init $h2
+ vlan_create $h2 100 v$h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+h2_vlan_destroy()
+{
+ vlan_destroy $h2 100
+ simple_if_fini $h2
+}
+
bridge_create()
{
- ip link add br0 type bridge
+ local vlan_filtering=$1
+
+ ip link add br0 type bridge vlan_filtering $vlan_filtering
ip link set br0 address $BRIDGE_ADDR
ip link set br0 up
ip link set $h2 master br0
ip link set $h2 up
-
- simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64
}
bridge_destroy()
{
- simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64
-
ip link del br0
}
-standalone()
+macvlan_create()
{
- h1_create
- h2_create
+ local lower=$1
- ip link add link $h2 name macvlan0 type macvlan mode private
+ ip link add link $lower name macvlan0 type macvlan mode private
ip link set macvlan0 address $MACVLAN_ADDR
ip link set macvlan0 up
+}
- run_test $h2
-
+macvlan_destroy()
+{
ip link del macvlan0
+}
+
+standalone()
+{
+ local no_unicast_flt=true
+ local skip_ptp=false
+
+ if [ $(has_unicast_flt $h2) = yes ]; then
+ no_unicast_flt=false
+ fi
+
+ h1_create
+ h2_create
+ macvlan_create $h2
+
+ run_test $h1 $h2 $skip_ptp $no_unicast_flt "$h2"
+ macvlan_destroy
h2_destroy
h1_destroy
}
-bridge()
+test_bridge()
{
+ local no_unicast_flt=true
+ local vlan_filtering=$1
+ local skip_ptp=true
+
h1_create
- bridge_create
+ bridge_create $vlan_filtering
+ simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64
+ macvlan_create br0
- ip link add link br0 name macvlan0 type macvlan mode private
- ip link set macvlan0 address $MACVLAN_ADDR
- ip link set macvlan0 up
+ run_test $h1 br0 $skip_ptp $no_unicast_flt \
+ "vlan_filtering=$vlan_filtering bridge"
- run_test br0
+ macvlan_destroy
+ simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64
+ bridge_destroy
+ h1_destroy
+}
- ip link del macvlan0
+vlan_unaware_bridge()
+{
+ test_bridge 0
+}
+
+vlan_aware_bridge()
+{
+ test_bridge 1
+}
+
+test_vlan()
+{
+ local no_unicast_flt=true
+ local skip_ptp=false
+
+ if [ $(has_unicast_flt $h2) = yes ]; then
+ no_unicast_flt=false
+ fi
+ h1_vlan_create
+ h2_vlan_create
+ macvlan_create $h2.100
+
+ run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt "VLAN upper"
+
+ macvlan_destroy
+ h2_vlan_destroy
+ h1_vlan_destroy
+}
+
+vlan_over_bridged_port()
+{
+ local no_unicast_flt=true
+ local vlan_filtering=$1
+ local skip_ptp=false
+
+ # br_manage_promisc() will not force a single vlan_filtering port to
+ # promiscuous mode, so we should still expect unicast filtering to take
+ # place if the device can do it.
+ if [ $(has_unicast_flt $h2) = yes ] && [ $vlan_filtering = 1 ]; then
+ no_unicast_flt=false
+ fi
+
+ h1_vlan_create
+ h2_vlan_create
+ bridge_create $vlan_filtering
+ macvlan_create $h2.100
+
+ run_test $h1.100 $h2.100 $skip_ptp $no_unicast_flt \
+ "VLAN over vlan_filtering=$vlan_filtering bridged port"
+
+ macvlan_destroy
bridge_destroy
- h1_destroy
+ h2_vlan_destroy
+ h1_vlan_destroy
+}
+
+vlan_over_vlan_unaware_bridged_port()
+{
+ vlan_over_bridged_port 0
+}
+
+vlan_over_vlan_aware_bridged_port()
+{
+ vlan_over_bridged_port 1
+}
+
+vlan_over_bridge()
+{
+ local no_unicast_flt=true
+ local vlan_filtering=$1
+ local skip_ptp=true
+
+ h1_vlan_create
+ bridge_create $vlan_filtering
+ simple_if_init br0
+ vlan_create br0 100 vbr0 $H2_IPV4/24 $H2_IPV6/64
+ macvlan_create br0.100
+
+ if [ $vlan_filtering = 1 ]; then
+ bridge vlan add dev $h2 vid 100 master
+ bridge vlan add dev br0 vid 100 self
+ fi
+
+ run_test $h1.100 br0.100 $skip_ptp $no_unicast_flt \
+ "VLAN over vlan_filtering=$vlan_filtering bridge"
+
+ if [ $vlan_filtering = 1 ]; then
+ bridge vlan del dev br0 vid 100 self
+ bridge vlan del dev $h2 vid 100 master
+ fi
+
+ macvlan_destroy
+ vlan_destroy br0 100
+ simple_if_fini br0
+ bridge_destroy
+ h1_vlan_destroy
+}
+
+vlan_over_vlan_unaware_bridge()
+{
+ vlan_over_bridge 0
+}
+
+vlan_over_vlan_aware_bridge()
+{
+ vlan_over_bridge 1
}
cleanup()
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index d0219032f773..8ee4489238ca 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -146,6 +146,7 @@ cleanup_ns()
for ns in "$@"; do
[ -z "${ns}" ] && continue
+ ip netns pids "${ns}" 2> /dev/null | xargs -r kill || true
ip netns delete "${ns}" &> /dev/null || true
if ! busywait $BUSYWAIT_TIMEOUT ip netns list \| grep -vq "^$ns$" &> /dev/null; then
echo "Warn: Failed to remove namespace $ns"
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index d2043ec3bf6d..4209b9569039 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -1115,11 +1115,11 @@ again:
return 1;
}
- if (--cfg_repeat > 0) {
- if (cfg_input)
- close(fd);
+ if (cfg_input)
+ close(fd);
+
+ if (--cfg_repeat > 0)
goto again;
- }
return 0;
}
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 108aeeb84ef1..89e553e0e0c2 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -436,9 +436,10 @@ reset_with_tcp_filter()
local ns="${!1}"
local src="${2}"
local target="${3}"
+ local chain="${4:-INPUT}"
if ! ip netns exec "${ns}" ${iptables} \
- -A INPUT \
+ -A "${chain}" \
-s "${src}" \
-p tcp \
-j "${target}"; then
@@ -661,7 +662,7 @@ pm_nl_check_endpoint()
done
if [ -z "${id}" ]; then
- test_fail "bad test - missing endpoint id"
+ fail_test "bad test - missing endpoint id"
return
fi
@@ -1415,18 +1416,28 @@ chk_add_nr()
local add_nr=$1
local echo_nr=$2
local port_nr=${3:-0}
- local syn_nr=${4:-$port_nr}
- local syn_ack_nr=${5:-$port_nr}
- local ack_nr=${6:-$port_nr}
- local mis_syn_nr=${7:-0}
- local mis_ack_nr=${8:-0}
+ local ns_invert=${4:-""}
+ local syn_nr=$port_nr
+ local syn_ack_nr=$port_nr
+ local ack_nr=$port_nr
+ local mis_syn_nr=0
+ local mis_ack_nr=0
+ local ns_tx=$ns1
+ local ns_rx=$ns2
+ local extra_msg=""
local count
local timeout
- timeout=$(ip netns exec $ns1 sysctl -n net.mptcp.add_addr_timeout)
+ if [[ $ns_invert = "invert" ]]; then
+ ns_tx=$ns2
+ ns_rx=$ns1
+ extra_msg="invert"
+ fi
+
+ timeout=$(ip netns exec ${ns_tx} sysctl -n net.mptcp.add_addr_timeout)
print_check "add"
- count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtAddAddr")
+ count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtAddAddr")
if [ -z "$count" ]; then
print_skip
# if the test configured a short timeout tolerate greater then expected
@@ -1438,7 +1449,7 @@ chk_add_nr()
fi
print_check "echo"
- count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtEchoAdd")
+ count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtEchoAdd")
if [ -z "$count" ]; then
print_skip
elif [ "$count" != "$echo_nr" ]; then
@@ -1449,7 +1460,7 @@ chk_add_nr()
if [ $port_nr -gt 0 ]; then
print_check "pt"
- count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtPortAdd")
+ count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtPortAdd")
if [ -z "$count" ]; then
print_skip
elif [ "$count" != "$port_nr" ]; then
@@ -1459,7 +1470,7 @@ chk_add_nr()
fi
print_check "syn"
- count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortSynRx")
+ count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortSynRx")
if [ -z "$count" ]; then
print_skip
elif [ "$count" != "$syn_nr" ]; then
@@ -1470,7 +1481,7 @@ chk_add_nr()
fi
print_check "synack"
- count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinPortSynAckRx")
+ count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtMPJoinPortSynAckRx")
if [ -z "$count" ]; then
print_skip
elif [ "$count" != "$syn_ack_nr" ]; then
@@ -1481,7 +1492,7 @@ chk_add_nr()
fi
print_check "ack"
- count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinPortAckRx")
+ count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMPJoinPortAckRx")
if [ -z "$count" ]; then
print_skip
elif [ "$count" != "$ack_nr" ]; then
@@ -1492,7 +1503,7 @@ chk_add_nr()
fi
print_check "syn"
- count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortSynRx")
+ count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortSynRx")
if [ -z "$count" ]; then
print_skip
elif [ "$count" != "$mis_syn_nr" ]; then
@@ -1503,7 +1514,7 @@ chk_add_nr()
fi
print_check "ack"
- count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMismatchPortAckRx")
+ count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtMismatchPortAckRx")
if [ -z "$count" ]; then
print_skip
elif [ "$count" != "$mis_ack_nr" ]; then
@@ -1513,6 +1524,8 @@ chk_add_nr()
print_ok
fi
fi
+
+ print_info "$extra_msg"
}
chk_add_tx_nr()
@@ -1634,6 +1647,8 @@ chk_prio_nr()
{
local mp_prio_nr_tx=$1
local mp_prio_nr_rx=$2
+ local mpj_syn=$3
+ local mpj_syn_ack=$4
local count
print_check "ptx"
@@ -1655,6 +1670,26 @@ chk_prio_nr()
else
print_ok
fi
+
+ print_check "syn backup"
+ count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinSynBackupRx")
+ if [ -z "$count" ]; then
+ print_skip
+ elif [ "$count" != "$mpj_syn" ]; then
+ fail_test "got $count JOIN[s] syn with Backup expected $mpj_syn"
+ else
+ print_ok
+ fi
+
+ print_check "synack backup"
+ count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckBackupRx")
+ if [ -z "$count" ]; then
+ print_skip
+ elif [ "$count" != "$mpj_syn_ack" ]; then
+ fail_test "got $count JOIN[s] synack with Backup expected $mpj_syn_ack"
+ else
+ print_ok
+ fi
}
chk_subflow_nr()
@@ -1955,6 +1990,21 @@ signal_address_tests()
chk_add_nr 1 1
fi
+ # uncommon: subflow and signal flags on the same endpoint
+ # or because the user wrongly picked both, but still expects the client
+ # to create additional subflows
+ if reset "subflow and signal together"; then
+ pm_nl_set_limits $ns1 0 2
+ pm_nl_set_limits $ns2 0 2
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags signal,subflow
+ run_tests $ns1 $ns2 10.0.1.1
+ chk_join_nr 1 1 1
+ chk_add_nr 1 1 0 invert # only initiated by ns2
+ chk_add_nr 0 0 0 # none initiated by ns1
+ chk_rst_nr 0 0 invert # no RST sent by the client
+ chk_rst_nr 0 0 # no RST sent by the server
+ fi
+
# accept and use add_addr with additional subflows
if reset "multiple subflows and signal"; then
pm_nl_set_limits $ns1 0 3
@@ -2612,33 +2662,46 @@ backup_tests()
sflags=nobackup speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 1
- chk_prio_nr 0 1
+ chk_prio_nr 0 1 1 0
fi
# single address, backup
if reset "single address, backup" &&
continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
pm_nl_set_limits $ns1 0 1
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
+ pm_nl_set_limits $ns2 1 1
+ sflags=nobackup speed=slow \
+ run_tests $ns1 $ns2 10.0.1.1
+ chk_join_nr 1 1 1
+ chk_add_nr 1 1
+ chk_prio_nr 1 0 0 1
+ fi
+
+ # single address, switch to backup
+ if reset "single address, switch to backup" &&
+ continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
+ pm_nl_set_limits $ns1 0 1
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
pm_nl_set_limits $ns2 1 1
sflags=backup speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 1
chk_add_nr 1 1
- chk_prio_nr 1 1
+ chk_prio_nr 1 1 0 0
fi
# single address with port, backup
if reset "single address with port, backup" &&
continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
pm_nl_set_limits $ns1 0 1
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup port 10100
pm_nl_set_limits $ns2 1 1
- sflags=backup speed=slow \
+ sflags=nobackup speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 1
chk_add_nr 1 1
- chk_prio_nr 1 1
+ chk_prio_nr 1 0 0 1
fi
if reset "mpc backup" &&
@@ -2647,17 +2710,26 @@ backup_tests()
speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
- chk_prio_nr 0 1
+ chk_prio_nr 0 1 0 0
fi
if reset "mpc backup both sides" &&
continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then
- pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow,backup
+ pm_nl_set_limits $ns1 0 2
+ pm_nl_set_limits $ns2 1 2
+ pm_nl_add_endpoint $ns1 10.0.1.1 flags signal,backup
pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup
+
+ # 10.0.2.2 (non-backup) -> 10.0.1.1 (backup)
+ pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
+ # 10.0.1.2 (backup) -> 10.0.2.1 (non-backup)
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ ip -net "$ns2" route add 10.0.2.1 via 10.0.1.1 dev ns2eth1 # force this path
+
speed=slow \
run_tests $ns1 $ns2 10.0.1.1
- chk_join_nr 0 0 0
- chk_prio_nr 1 1
+ chk_join_nr 2 2 2
+ chk_prio_nr 1 1 1 1
fi
if reset "mpc switch to backup" &&
@@ -2666,7 +2738,7 @@ backup_tests()
sflags=backup speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
- chk_prio_nr 0 1
+ chk_prio_nr 0 1 0 0
fi
if reset "mpc switch to backup both sides" &&
@@ -2676,7 +2748,7 @@ backup_tests()
sflags=backup speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
- chk_prio_nr 1 1
+ chk_prio_nr 1 1 0 0
fi
}
@@ -2987,6 +3059,7 @@ fullmesh_tests()
pm_nl_set_limits $ns1 1 3
pm_nl_set_limits $ns2 1 3
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,fullmesh
fullmesh=1 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 3 3 3
@@ -3053,7 +3126,7 @@ fullmesh_tests()
addr_nr_ns2=1 sflags=backup,fullmesh speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 2 2 2
- chk_prio_nr 0 1
+ chk_prio_nr 0 1 1 0
chk_rm_nr 0 1
fi
@@ -3066,7 +3139,7 @@ fullmesh_tests()
sflags=nobackup,nofullmesh speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 2 2 2
- chk_prio_nr 0 1
+ chk_prio_nr 0 1 1 0
chk_rm_nr 0 1
fi
}
@@ -3318,7 +3391,7 @@ userspace_tests()
sflags=backup speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 0
- chk_prio_nr 0 0
+ chk_prio_nr 0 0 0 0
fi
# userspace pm type prevents rm_addr
@@ -3500,10 +3573,10 @@ endpoint_tests()
mptcp_lib_kill_wait $tests_pid
fi
- if reset "delete and re-add" &&
+ if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT &&
mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
- pm_nl_set_limits $ns1 1 1
- pm_nl_set_limits $ns2 1 1
+ pm_nl_set_limits $ns1 0 2
+ pm_nl_set_limits $ns2 0 2
pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
test_linkfail=4 speed=20 \
run_tests $ns1 $ns2 10.0.1.1 &
@@ -3520,11 +3593,94 @@ endpoint_tests()
chk_subflow_nr "after delete" 1
chk_mptcp_info subflows 0 subflows 0
- pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
wait_mpj $ns2
chk_subflow_nr "after re-add" 2
chk_mptcp_info subflows 1 subflows 1
+
+ pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+ wait_attempt_fail $ns2
+ chk_subflow_nr "after new reject" 2
+ chk_mptcp_info subflows 1 subflows 1
+
+ ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT
+ pm_nl_del_endpoint $ns2 3 10.0.3.2
+ pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+ wait_mpj $ns2
+ chk_subflow_nr "after no reject" 3
+ chk_mptcp_info subflows 2 subflows 2
+
mptcp_lib_kill_wait $tests_pid
+
+ chk_join_nr 3 3 3
+ chk_rm_nr 1 1
+ fi
+
+ # remove and re-add
+ if reset "delete re-add signal" &&
+ mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
+ pm_nl_set_limits $ns1 0 2
+ pm_nl_set_limits $ns2 2 2
+ pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
+ # multicast IP: no packet for this address will be received on ns1
+ pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
+ test_linkfail=4 speed=20 \
+ run_tests $ns1 $ns2 10.0.1.1 &
+ local tests_pid=$!
+
+ wait_mpj $ns2
+ pm_nl_check_endpoint "creation" \
+ $ns1 10.0.2.1 id 1 flags signal
+ chk_subflow_nr "before delete" 2
+ chk_mptcp_info subflows 1 subflows 1
+
+ pm_nl_del_endpoint $ns1 1 10.0.2.1
+ pm_nl_del_endpoint $ns1 2 224.0.0.1
+ sleep 0.5
+ chk_subflow_nr "after delete" 1
+ chk_mptcp_info subflows 0 subflows 0
+
+ pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
+ wait_mpj $ns2
+ chk_subflow_nr "after re-add" 3
+ chk_mptcp_info subflows 2 subflows 2
+ mptcp_lib_kill_wait $tests_pid
+
+ chk_join_nr 3 3 3
+ chk_add_nr 4 4
+ chk_rm_nr 2 1 invert
+ fi
+
+ # flush and re-add
+ if reset_with_tcp_filter "flush re-add" ns2 10.0.3.2 REJECT OUTPUT &&
+ mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
+ pm_nl_set_limits $ns1 0 2
+ pm_nl_set_limits $ns2 1 2
+ # multicast IP: no packet for this address will be received on ns1
+ pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
+ pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+ test_linkfail=4 speed=20 \
+ run_tests $ns1 $ns2 10.0.1.1 &
+ local tests_pid=$!
+
+ wait_attempt_fail $ns2
+ chk_subflow_nr "before flush" 1
+ chk_mptcp_info subflows 0 subflows 0
+
+ pm_nl_flush_endpoint $ns2
+ pm_nl_flush_endpoint $ns1
+ wait_rm_addr $ns2 0
+ ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT
+ pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
+ wait_mpj $ns2
+ pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
+ wait_mpj $ns2
+ mptcp_lib_kill_wait $tests_pid
+
+ chk_join_nr 2 2 2
+ chk_add_nr 2 2
+ chk_rm_nr 1 0 invert
fi
}
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index 47945b2b3f92..d13fb5ea3e89 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -7,6 +7,7 @@ MNL_CFLAGS := $(shell $(HOSTPKG_CONFIG) --cflags libmnl 2>/dev/null)
MNL_LDLIBS := $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
TEST_PROGS := br_netfilter.sh bridge_brouter.sh
+TEST_PROGS += br_netfilter_queue.sh
TEST_PROGS += conntrack_icmp_related.sh
TEST_PROGS += conntrack_ipip_mtu.sh
TEST_PROGS += conntrack_tcp_unreplied.sh
diff --git a/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh
new file mode 100755
index 000000000000..6a764d70ab06
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/br_netfilter_queue.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+source lib.sh
+
+checktool "nft --version" "run test without nft tool"
+
+# EXIT trap handler: delegates all teardown to cleanup_all_ns, which is not
+# defined in this script (presumably provided by the sourced lib.sh).
+cleanup() {
+ cleanup_all_ns
+}
+
+setup_ns c1 c2 c3 sender
+
+trap cleanup EXIT
+
+# Succeed once /proc/self/net/netfilter/nfnetlink_queue contains a line that
+# begins (after optional spaces) with "$1 ".
+# $1: value expected in the first column -- presumably the queue number.
+# Callers must pass it; with an empty $1 the pattern is just "^ * ".
+nf_queue_wait()
+{
+ grep -q "^ *$1 " "/proc/self/net/netfilter/nfnetlink_queue"
+}
+
+# Attach one namespace to bridge br0 through a veth pair.
+# $1: target network namespace
+# $2: interface name, used for BOTH ends of the veth pair
+# $3: last octet of the 192.168.1.0/24 address given to the namespace side
+# Note: ns, dev and a are not declared local, so they are set as globals.
+port_add() {
+ ns="$1"
+ dev="$2"
+ a="$3"
+
+ # One end stays in the current namespace, the peer is created inside $ns.
+ ip link add name "$dev" type veth peer name "$dev" netns "$ns"
+
+ ip -net "$ns" addr add 192.168.1."$a"/24 dev "$dev"
+ ip -net "$ns" link set "$dev" up
+
+ # Enslave the local end to br0 and bring it up.
+ ip link set "$dev" master br0
+ ip link set "$dev" up
+}
+
+[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
+
+ip link add br0 type bridge
+ip addr add 192.168.1.254/24 dev br0
+
+port_add "$c1" "c1" 1
+port_add "$c2" "c2" 2
+port_add "$c3" "c3" 3
+port_add "$sender" "sender" 253
+
+ip link set br0 up
+
+modprobe -q br_netfilter
+
+sysctl net.bridge.bridge-nf-call-iptables=1 || exit 1
+
+ip netns exec "$sender" ping -I sender -c1 192.168.1.1 || exit 1
+ip netns exec "$sender" ping -I sender -c1 192.168.1.2 || exit 2
+ip netns exec "$sender" ping -I sender -c1 192.168.1.3 || exit 3
+
+nft -f /dev/stdin <<EOF
+table ip filter {
+ chain forward {
+ type filter hook forward priority 0; policy accept;
+ ct state new counter
+ ip protocol icmp counter queue num 0 bypass
+ }
+}
+EOF
+./nf_queue -t 5 > /dev/null &
+
+busywait 5000 nf_queue_wait
+
+for i in $(seq 1 5); do conntrack -F > /dev/null 2> /dev/null; sleep 0.1 ; done &
+ip netns exec "$sender" ping -I sender -f -c 50 -b 192.168.1.255
+
+read t < /proc/sys/kernel/tainted
+if [ "$t" -eq 0 ];then
+ echo PASS: kernel not tainted
+else
+ echo ERROR: kernel is tainted
+ exit 1
+fi
+
+exit 0
diff --git a/tools/testing/selftests/net/tcp_ao/Makefile b/tools/testing/selftests/net/tcp_ao/Makefile
index 522d991e310e..bd88b90b902b 100644
--- a/tools/testing/selftests/net/tcp_ao/Makefile
+++ b/tools/testing/selftests/net/tcp_ao/Makefile
@@ -26,7 +26,7 @@ LIB := $(LIBDIR)/libaotst.a
LDLIBS += $(LIB) -pthread
LIBDEPS := lib/aolib.h Makefile
-CFLAGS := -Wall -O2 -g -D_GNU_SOURCE -fno-strict-aliasing
+CFLAGS += -Wall -O2 -g -fno-strict-aliasing
CFLAGS += $(KHDR_INCLUDES)
CFLAGS += -iquote ./lib/ -I ../../../../include/
diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
index 11a1ebda564f..d5ffd8c9172e 100755
--- a/tools/testing/selftests/net/udpgro.sh
+++ b/tools/testing/selftests/net/udpgro.sh
@@ -7,8 +7,6 @@ source net_helper.sh
readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
-BPF_FILE="xdp_dummy.bpf.o"
-
# set global exit status, but never reset nonzero one.
check_err()
{
@@ -38,7 +36,7 @@ cfg_veth() {
ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24
ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad
ip -netns "${PEER_NS}" link set dev veth1 up
- ip -n "${PEER_NS}" link set veth1 xdp object ${BPF_FILE} section xdp
+ ip netns exec "${PEER_NS}" ethtool -K veth1 gro on
}
run_one() {
@@ -46,17 +44,19 @@ run_one() {
local -r all="$@"
local -r tx_args=${all%rx*}
local -r rx_args=${all#*rx}
+ local ret=0
cfg_veth
- ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \
- echo "ok" || \
- echo "failed" &
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} &
+ local PID1=$!
wait_local_port_listen ${PEER_NS} 8000 udp
./udpgso_bench_tx ${tx_args}
- ret=$?
- wait $(jobs -p)
+ check_err $?
+ wait ${PID1}
+ check_err $?
+ [ "$ret" -eq 0 ] && echo "ok" || echo "failed"
return $ret
}
@@ -73,6 +73,7 @@ run_one_nat() {
local -r all="$@"
local -r tx_args=${all%rx*}
local -r rx_args=${all#*rx}
+ local ret=0
if [[ ${tx_args} = *-4* ]]; then
ipt_cmd=iptables
@@ -93,16 +94,17 @@ run_one_nat() {
# ... so that GRO will match the UDP_GRO enabled socket, but packets
# will land on the 'plain' one
ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 &
- pid=$!
- ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \
- echo "ok" || \
- echo "failed"&
+ local PID1=$!
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} &
+ local PID2=$!
wait_local_port_listen "${PEER_NS}" 8000 udp
./udpgso_bench_tx ${tx_args}
- ret=$?
- kill -INT $pid
- wait $(jobs -p)
+ check_err $?
+ kill -INT ${PID1}
+ wait ${PID2}
+ check_err $?
+ [ "$ret" -eq 0 ] && echo "ok" || echo "failed"
return $ret
}
@@ -111,20 +113,26 @@ run_one_2sock() {
local -r all="$@"
local -r tx_args=${all%rx*}
local -r rx_args=${all#*rx}
+ local ret=0
cfg_veth
ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 &
- ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \
- echo "ok" || \
- echo "failed" &
+ local PID1=$!
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} &
+ local PID2=$!
wait_local_port_listen "${PEER_NS}" 12345 udp
./udpgso_bench_tx ${tx_args} -p 12345
+ check_err $?
wait_local_port_listen "${PEER_NS}" 8000 udp
./udpgso_bench_tx ${tx_args}
- ret=$?
- wait $(jobs -p)
+ check_err $?
+ wait ${PID1}
+ check_err $?
+ wait ${PID2}
+ check_err $?
+ [ "$ret" -eq 0 ] && echo "ok" || echo "failed"
return $ret
}
@@ -196,11 +204,6 @@ run_all() {
return $ret
}
-if [ ! -f ${BPF_FILE} ]; then
- echo "Missing ${BPF_FILE}. Run 'make' first"
- exit -1
-fi
-
if [[ $# -eq 0 ]]; then
run_all
elif [[ $1 == "__subprocess" ]]; then
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 3e74cfa1a2bf..3f2fca02fec5 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -67,6 +67,7 @@ struct testcase {
int gso_len; /* mss after applying gso */
int r_num_mss; /* recv(): number of calls of full mss */
int r_len_last; /* recv(): size of last non-mss dgram, if any */
+ bool v6_ext_hdr; /* send() dgrams with IPv6 extension headers */
};
const struct in6_addr addr6 = {
@@ -77,6 +78,8 @@ const struct in_addr addr4 = {
__constant_htonl(0x0a000001), /* 10.0.0.1 */
};
+static const char ipv6_hopopts_pad1[8] = { 0 };
+
struct testcase testcases_v4[] = {
{
/* no GSO: send a single byte */
@@ -256,6 +259,13 @@ struct testcase testcases_v6[] = {
.r_num_mss = 2,
},
{
+ /* send 2 1B segments with extension headers */
+ .tlen = 2,
+ .gso_len = 1,
+ .r_num_mss = 2,
+ .v6_ext_hdr = true,
+ },
+ {
/* send 2B + 2B + 1B segments */
.tlen = 5,
.gso_len = 2,
@@ -396,11 +406,18 @@ static void run_one(struct testcase *test, int fdt, int fdr,
int i, ret, val, mss;
bool sent;
- fprintf(stderr, "ipv%d tx:%d gso:%d %s\n",
+ fprintf(stderr, "ipv%d tx:%d gso:%d %s%s\n",
addr->sa_family == AF_INET ? 4 : 6,
test->tlen, test->gso_len,
+ test->v6_ext_hdr ? "ext-hdr " : "",
test->tfail ? "(fail)" : "");
+ if (test->v6_ext_hdr) {
+ if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS,
+ ipv6_hopopts_pad1, sizeof(ipv6_hopopts_pad1)))
+ error(1, errno, "setsockopt ipv6 hopopts");
+ }
+
val = test->gso_len;
if (cfg_do_setsockopt) {
if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val)))
@@ -412,6 +429,12 @@ static void run_one(struct testcase *test, int fdt, int fdr,
error(1, 0, "send succeeded while expecting failure");
if (!sent && !test->tfail)
error(1, 0, "send failed while expecting success");
+
+ if (test->v6_ext_hdr) {
+ if (setsockopt(fdt, IPPROTO_IPV6, IPV6_HOPOPTS, NULL, 0))
+ error(1, errno, "setsockopt ipv6 hopopts clear");
+ }
+
if (!sent)
return;
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
index 47746b0c6acd..7c2a4349170a 100644
--- a/tools/testing/selftests/pidfd/pidfd_setns_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -16,11 +16,56 @@
#include <unistd.h>
#include <sys/socket.h>
#include <sys/stat.h>
+#include <linux/ioctl.h>
#include "pidfd.h"
#include "../clone3/clone3_selftests.h"
#include "../kselftest_harness.h"
+#ifndef PIDFS_IOCTL_MAGIC
+#define PIDFS_IOCTL_MAGIC 0xFF
+#endif
+
+#ifndef PIDFD_GET_CGROUP_NAMESPACE
+#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1)
+#endif
+
+#ifndef PIDFD_GET_IPC_NAMESPACE
+#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2)
+#endif
+
+#ifndef PIDFD_GET_MNT_NAMESPACE
+#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3)
+#endif
+
+#ifndef PIDFD_GET_NET_NAMESPACE
+#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4)
+#endif
+
+#ifndef PIDFD_GET_PID_NAMESPACE
+#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5)
+#endif
+
+#ifndef PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE
+#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6)
+#endif
+
+#ifndef PIDFD_GET_TIME_NAMESPACE
+#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7)
+#endif
+
+#ifndef PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE
+#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8)
+#endif
+
+#ifndef PIDFD_GET_USER_NAMESPACE
+#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9)
+#endif
+
+#ifndef PIDFD_GET_UTS_NAMESPACE
+#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10)
+#endif
+
enum {
PIDFD_NS_USER,
PIDFD_NS_MNT,
@@ -31,22 +76,25 @@ enum {
PIDFD_NS_CGROUP,
PIDFD_NS_PIDCLD,
PIDFD_NS_TIME,
+ PIDFD_NS_TIMECLD,
PIDFD_NS_MAX
};
const struct ns_info {
const char *name;
int flag;
+ unsigned int pidfd_ioctl;
} ns_info[] = {
- [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, },
- [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, },
- [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, },
- [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, },
- [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, },
- [PIDFD_NS_NET] = { "net", CLONE_NEWNET, },
- [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, },
- [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, },
- [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, },
+ [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, PIDFD_GET_USER_NAMESPACE, },
+ [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, PIDFD_GET_MNT_NAMESPACE, },
+ [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, PIDFD_GET_PID_NAMESPACE, },
+ [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, PIDFD_GET_UTS_NAMESPACE, },
+ [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, PIDFD_GET_IPC_NAMESPACE, },
+ [PIDFD_NS_NET] = { "net", CLONE_NEWNET, PIDFD_GET_NET_NAMESPACE, },
+ [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, PIDFD_GET_CGROUP_NAMESPACE, },
+ [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, PIDFD_GET_TIME_NAMESPACE, },
+ [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE, },
+ [PIDFD_NS_TIMECLD] = { "time_for_children", 0, PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE, },
};
FIXTURE(current_nsset)
@@ -54,6 +102,7 @@ FIXTURE(current_nsset)
pid_t pid;
int pidfd;
int nsfds[PIDFD_NS_MAX];
+ int child_pidfd_derived_nsfds[PIDFD_NS_MAX];
pid_t child_pid_exited;
int child_pidfd_exited;
@@ -61,10 +110,12 @@ FIXTURE(current_nsset)
pid_t child_pid1;
int child_pidfd1;
int child_nsfds1[PIDFD_NS_MAX];
+ int child_pidfd_derived_nsfds1[PIDFD_NS_MAX];
pid_t child_pid2;
int child_pidfd2;
int child_nsfds2[PIDFD_NS_MAX];
+ int child_pidfd_derived_nsfds2[PIDFD_NS_MAX];
};
static int sys_waitid(int which, pid_t pid, int options)
@@ -128,9 +179,12 @@ FIXTURE_SETUP(current_nsset)
char c;
for (i = 0; i < PIDFD_NS_MAX; i++) {
- self->nsfds[i] = -EBADF;
- self->child_nsfds1[i] = -EBADF;
- self->child_nsfds2[i] = -EBADF;
+ self->nsfds[i] = -EBADF;
+ self->child_nsfds1[i] = -EBADF;
+ self->child_nsfds2[i] = -EBADF;
+ self->child_pidfd_derived_nsfds[i] = -EBADF;
+ self->child_pidfd_derived_nsfds1[i] = -EBADF;
+ self->child_pidfd_derived_nsfds2[i] = -EBADF;
}
proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
@@ -139,6 +193,11 @@ FIXTURE_SETUP(current_nsset)
}
self->pid = getpid();
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ EXPECT_GT(self->pidfd, 0) {
+ TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+ }
+
for (i = 0; i < PIDFD_NS_MAX; i++) {
const struct ns_info *info = &ns_info[i];
self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
@@ -148,20 +207,27 @@ FIXTURE_SETUP(current_nsset)
info->name, self->pid);
}
}
- }
- self->pidfd = sys_pidfd_open(self->pid, 0);
- EXPECT_GT(self->pidfd, 0) {
- TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+ self->child_pidfd_derived_nsfds[i] = ioctl(self->pidfd, info->pidfd_ioctl, 0);
+ if (self->child_pidfd_derived_nsfds[i] < 0) {
+ EXPECT_EQ(errno, EOPNOTSUPP) {
+ TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d",
+ info->name, self->pid);
+ }
+ }
}
/* Create task that exits right away. */
- self->child_pid_exited = create_child(&self->child_pidfd_exited,
- CLONE_NEWUSER | CLONE_NEWNET);
+ self->child_pid_exited = create_child(&self->child_pidfd_exited, 0);
EXPECT_GE(self->child_pid_exited, 0);
- if (self->child_pid_exited == 0)
+ if (self->child_pid_exited == 0) {
+ if (self->nsfds[PIDFD_NS_USER] >= 0 && unshare(CLONE_NEWUSER) < 0)
+ _exit(EXIT_FAILURE);
+ if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0)
+ _exit(EXIT_FAILURE);
_exit(EXIT_SUCCESS);
+ }
ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
@@ -174,18 +240,43 @@ FIXTURE_SETUP(current_nsset)
EXPECT_EQ(ret, 0);
/* Create tasks that will be stopped. */
- self->child_pid1 = create_child(&self->child_pidfd1,
- CLONE_NEWUSER | CLONE_NEWNS |
- CLONE_NEWCGROUP | CLONE_NEWIPC |
- CLONE_NEWUTS | CLONE_NEWPID |
- CLONE_NEWNET);
+ if (self->nsfds[PIDFD_NS_USER] >= 0 && self->nsfds[PIDFD_NS_PID] >= 0)
+ self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER | CLONE_NEWPID);
+ else if (self->nsfds[PIDFD_NS_PID] >= 0)
+ self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWPID);
+ else if (self->nsfds[PIDFD_NS_USER] >= 0)
+ self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER);
+ else
+ self->child_pid1 = create_child(&self->child_pidfd1, 0);
EXPECT_GE(self->child_pid1, 0);
if (self->child_pid1 == 0) {
close(ipc_sockets[0]);
- if (!switch_timens())
+ if (self->nsfds[PIDFD_NS_MNT] >= 0 && unshare(CLONE_NEWNS) < 0) {
+ TH_LOG("%m - Failed to unshare mount namespace for process %d", self->pid);
_exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_CGROUP] >= 0 && unshare(CLONE_NEWCGROUP) < 0) {
+ TH_LOG("%m - Failed to unshare cgroup namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_IPC] >= 0 && unshare(CLONE_NEWIPC) < 0) {
+ TH_LOG("%m - Failed to unshare ipc namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_UTS] >= 0 && unshare(CLONE_NEWUTS) < 0) {
+ TH_LOG("%m - Failed to unshare uts namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) {
+ TH_LOG("%m - Failed to unshare net namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_TIME] >= 0 && !switch_timens()) {
+ TH_LOG("%m - Failed to unshare time namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
_exit(EXIT_FAILURE);
@@ -203,18 +294,43 @@ FIXTURE_SETUP(current_nsset)
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
EXPECT_EQ(ret, 0);
- self->child_pid2 = create_child(&self->child_pidfd2,
- CLONE_NEWUSER | CLONE_NEWNS |
- CLONE_NEWCGROUP | CLONE_NEWIPC |
- CLONE_NEWUTS | CLONE_NEWPID |
- CLONE_NEWNET);
+ if (self->nsfds[PIDFD_NS_USER] >= 0 && self->nsfds[PIDFD_NS_PID] >= 0)
+ self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID);
+ else if (self->nsfds[PIDFD_NS_PID] >= 0)
+ self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWPID);
+ else if (self->nsfds[PIDFD_NS_USER] >= 0)
+ self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER);
+ else
+ self->child_pid2 = create_child(&self->child_pidfd2, 0);
EXPECT_GE(self->child_pid2, 0);
if (self->child_pid2 == 0) {
close(ipc_sockets[0]);
- if (!switch_timens())
+ if (self->nsfds[PIDFD_NS_MNT] >= 0 && unshare(CLONE_NEWNS) < 0) {
+ TH_LOG("%m - Failed to unshare mount namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_CGROUP] >= 0 && unshare(CLONE_NEWCGROUP) < 0) {
+ TH_LOG("%m - Failed to unshare cgroup namespace for process %d", self->pid);
_exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_IPC] >= 0 && unshare(CLONE_NEWIPC) < 0) {
+ TH_LOG("%m - Failed to unshare ipc namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_UTS] >= 0 && unshare(CLONE_NEWUTS) < 0) {
+ TH_LOG("%m - Failed to unshare uts namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) {
+ TH_LOG("%m - Failed to unshare net namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
+ if (self->nsfds[PIDFD_NS_TIME] >= 0 && !switch_timens()) {
+ TH_LOG("%m - Failed to unshare time namespace for process %d", self->pid);
+ _exit(EXIT_FAILURE);
+ }
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
_exit(EXIT_FAILURE);
@@ -267,6 +383,22 @@ FIXTURE_SETUP(current_nsset)
info->name, self->child_pid1);
}
}
+
+ self->child_pidfd_derived_nsfds1[i] = ioctl(self->child_pidfd1, info->pidfd_ioctl, 0);
+ if (self->child_pidfd_derived_nsfds1[i] < 0) {
+ EXPECT_EQ(errno, EOPNOTSUPP) {
+ TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d",
+ info->name, self->child_pid1);
+ }
+ }
+
+ self->child_pidfd_derived_nsfds2[i] = ioctl(self->child_pidfd2, info->pidfd_ioctl, 0);
+ if (self->child_pidfd_derived_nsfds2[i] < 0) {
+ EXPECT_EQ(errno, EOPNOTSUPP) {
+ TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d",
+ info->name, self->child_pid2);
+ }
+ }
}
close(proc_fd);
@@ -288,6 +420,12 @@ FIXTURE_TEARDOWN(current_nsset)
close(self->child_nsfds1[i]);
if (self->child_nsfds2[i] >= 0)
close(self->child_nsfds2[i]);
+ if (self->child_pidfd_derived_nsfds[i] >= 0)
+ close(self->child_pidfd_derived_nsfds[i]);
+ if (self->child_pidfd_derived_nsfds1[i] >= 0)
+ close(self->child_pidfd_derived_nsfds1[i]);
+ if (self->child_pidfd_derived_nsfds2[i] >= 0)
+ close(self->child_pidfd_derived_nsfds2[i]);
}
if (self->child_pidfd1 >= 0)
@@ -446,6 +584,42 @@ TEST_F(current_nsset, nsfd_incremental_setns)
}
}
+/*
+ * Walk every namespace fd that the fixture derived from child 1's pidfd via
+ * ioctl (child_pidfd_derived_nsfds1) and, one namespace at a time:
+ *  - skip entries the ioctl could not provide (fd < 0);
+ *  - setns() into it when the namespace type has a clone flag;
+ *  - check with in_same_namespace() that this process now sits in the
+ *    expected namespace.
+ */
+TEST_F(current_nsset, pidfd_derived_nsfd_incremental_setns)
+{
+ int i;
+ pid_t pid;
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ if (self->child_pidfd_derived_nsfds1[i] < 0)
+ continue;
+
+ if (info->flag) {
+ ASSERT_EQ(setns(self->child_pidfd_derived_nsfds1[i], info->flag), 0) {
+ TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1,
+ self->child_pidfd_derived_nsfds1[i]);
+ }
+ }
+
+ /* Verify that we have changed to the correct namespaces. */
+ /*
+ * For the pid namespace the comparison is made against the fd derived
+ * from our own pidfd rather than the child's -- presumably because
+ * setns(CLONE_NEWPID) only affects future children (see setns(2)).
+ */
+ if (info->flag == CLONE_NEWPID)
+ nsfd = self->child_pidfd_derived_nsfds[i];
+ else
+ nsfd = self->child_pidfd_derived_nsfds1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1,
+ self->child_pidfd_derived_nsfds1[i]);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1, self->child_pidfd_derived_nsfds1[i]);
+ }
+}
+
TEST_F(current_nsset, pidfd_one_shot_setns)
{
unsigned flags = 0;
@@ -542,6 +716,28 @@ TEST_F(current_nsset, no_foul_play)
info->name, self->child_pid2,
self->child_nsfds2[i]);
}
+
+ /*
+ * Can't setns to a user namespace outside of our hierarchy since we
+ * don't have caps in there and didn't create it. That means that under
+ * no circumstances should we be able to setns to any of the other
+ * ones since they aren't owned by our user namespace.
+ */
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_pidfd_derived_nsfds2[i] < 0 || !info->flag)
+ continue;
+
+ ASSERT_NE(setns(self->child_pidfd_derived_nsfds2[i], info->flag), 0) {
+ TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid2,
+ self->child_pidfd_derived_nsfds2[i]);
+ }
+ TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid2,
+ self->child_pidfd_derived_nsfds2[i]);
+ }
}
TEST(setns_einval)
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index a156ac5dd2c6..973968f45bba 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -2,6 +2,7 @@
/fd-001-lookup
/fd-002-posix-eq
/fd-003-kthread
+/proc-2-is-kthread
/proc-fsconfig-hidepid
/proc-loadavg-001
/proc-multiple-procfs
@@ -9,6 +10,7 @@
/proc-pid-vm
/proc-self-map-files-001
/proc-self-map-files-002
+/proc-self-isnt-kthread
/proc-self-syscall
/proc-self-wchan
/proc-subset-pid
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index cd95369254c0..b12921b9794b 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -1,17 +1,19 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O2 -Wno-unused-function
-CFLAGS += -D_GNU_SOURCE
+CFLAGS += $(TOOLS_INCLUDES)
LDFLAGS += -pthread
TEST_GEN_PROGS :=
TEST_GEN_PROGS += fd-001-lookup
TEST_GEN_PROGS += fd-002-posix-eq
TEST_GEN_PROGS += fd-003-kthread
+TEST_GEN_PROGS += proc-2-is-kthread
TEST_GEN_PROGS += proc-loadavg-001
TEST_GEN_PROGS += proc-empty-vm
TEST_GEN_PROGS += proc-pid-vm
TEST_GEN_PROGS += proc-self-map-files-001
TEST_GEN_PROGS += proc-self-map-files-002
+TEST_GEN_PROGS += proc-self-isnt-kthread
TEST_GEN_PROGS += proc-self-syscall
TEST_GEN_PROGS += proc-self-wchan
TEST_GEN_PROGS += proc-subset-pid
diff --git a/tools/testing/selftests/proc/proc-2-is-kthread.c b/tools/testing/selftests/proc/proc-2-is-kthread.c
new file mode 100644
index 000000000000..f13668fb482e
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-2-is-kthread.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test that a kernel thread is reported as such. */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+int main(void)
+{
+ /*
+ * The following solutions don't really work:
+ *
+ * 1) jit kernel module which creates kernel thread:
+ * test becomes arch-specific,
+ * problems with mandatory module signing,
+ * problems with lockdown mode,
+ * doesn't work with CONFIG_MODULES=n at all,
+ * kthread creation API is formally unstable internal kernel API,
+ * need a mechanism to report test kernel thread's PID back,
+ *
+ * 2) ksoftirqd/0 and kswapd0 look like stable enough kernel threads,
+ * but their PIDs are unstable.
+ *
+ * Check against kthreadd, which always seems to exist under pid 2.
+ */
+ int fd = open("/proc/2/status", O_RDONLY);
+ assert(fd >= 0);
+
+ char buf[4096];
+ ssize_t rv = read(fd, buf, sizeof(buf));
+ /* Strictly less than sizeof(buf): keeps one byte free for the NUL below. */
+ assert(0 <= rv && rv < sizeof(buf));
+ buf[rv] = '\0';
+
+ /* The status text must contain "Kthread:<TAB>1" followed by a newline. */
+ assert(strstr(buf, "Kthread:\t1\n"));
+
+ return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c
index 56198d4ca2bf..b3f898aab4ab 100644
--- a/tools/testing/selftests/proc/proc-empty-vm.c
+++ b/tools/testing/selftests/proc/proc-empty-vm.c
@@ -381,9 +381,6 @@ static int test_proc_pid_statm(pid_t pid)
assert(rv >= 0);
assert(rv <= sizeof(buf));
- if (0) {
- write(1, buf, rv);
- }
const char *p = buf;
const char *const end = p + rv;
diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c
index cacbd2a4aec9..d04685771952 100644
--- a/tools/testing/selftests/proc/proc-pid-vm.c
+++ b/tools/testing/selftests/proc/proc-pid-vm.c
@@ -45,6 +45,7 @@
#include <linux/kdev_t.h>
#include <sys/time.h>
#include <sys/resource.h>
+#include <linux/fs.h>
#include "../kselftest.h"
@@ -492,6 +493,91 @@ int main(void)
assert(buf[13] == '\n');
}
+ /* Test PROCMAP_QUERY ioctl() for /proc/$PID/maps */
+ {
+ char path_buf[256], exp_path_buf[256];
+ struct procmap_query q;
+ int fd, err;
+
+ snprintf(path_buf, sizeof(path_buf), "/proc/%u/maps", pid);
+ fd = open(path_buf, O_RDONLY);
+ if (fd == -1)
+ return 1;
+
+ /* CASE 1: exact MATCH at VADDR */
+ memset(&q, 0, sizeof(q));
+ q.size = sizeof(q);
+ q.query_addr = VADDR;
+ q.query_flags = 0;
+ q.vma_name_addr = (__u64)(unsigned long)path_buf;
+ q.vma_name_size = sizeof(path_buf);
+
+ err = ioctl(fd, PROCMAP_QUERY, &q);
+ assert(err == 0);
+
+ assert(q.query_addr == VADDR);
+ assert(q.query_flags == 0);
+
+ assert(q.vma_flags == (PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_EXECUTABLE));
+ assert(q.vma_start == VADDR);
+ assert(q.vma_end == VADDR + PAGE_SIZE);
+ assert(q.vma_page_size == PAGE_SIZE);
+
+ assert(q.vma_offset == 0);
+ assert(q.inode == st.st_ino);
+ assert(q.dev_major == MAJOR(st.st_dev));
+ assert(q.dev_minor == MINOR(st.st_dev));
+
+ snprintf(exp_path_buf, sizeof(exp_path_buf),
+ "/tmp/#%llu (deleted)", (unsigned long long)st.st_ino);
+ assert(q.vma_name_size == strlen(exp_path_buf) + 1);
+ assert(strcmp(path_buf, exp_path_buf) == 0);
+
+ /* CASE 2: NO MATCH at VADDR-1 */
+ memset(&q, 0, sizeof(q));
+ q.size = sizeof(q);
+ q.query_addr = VADDR - 1;
+ q.query_flags = 0; /* exact match */
+
+ err = ioctl(fd, PROCMAP_QUERY, &q);
+ err = err < 0 ? -errno : 0;
+ assert(err == -ENOENT);
+
+ /* CASE 3: MATCH COVERING_OR_NEXT_VMA at VADDR - 1 */
+ memset(&q, 0, sizeof(q));
+ q.size = sizeof(q);
+ q.query_addr = VADDR - 1;
+ q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;
+
+ err = ioctl(fd, PROCMAP_QUERY, &q);
+ assert(err == 0);
+
+ assert(q.query_addr == VADDR - 1);
+ assert(q.query_flags == PROCMAP_QUERY_COVERING_OR_NEXT_VMA);
+ assert(q.vma_start == VADDR);
+ assert(q.vma_end == VADDR + PAGE_SIZE);
+
+ /* CASE 4: NO MATCH at VADDR + PAGE_SIZE */
+ memset(&q, 0, sizeof(q));
+ q.size = sizeof(q);
+ q.query_addr = VADDR + PAGE_SIZE; /* point right after the VMA */
+ q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;
+
+ err = ioctl(fd, PROCMAP_QUERY, &q);
+ err = err < 0 ? -errno : 0;
+ assert(err == -ENOENT);
+
+ /* CASE 5: NO MATCH WRITABLE at VADDR */
+ memset(&q, 0, sizeof(q));
+ q.size = sizeof(q);
+ q.query_addr = VADDR;
+ q.query_flags = PROCMAP_QUERY_VMA_WRITABLE;
+
+ err = ioctl(fd, PROCMAP_QUERY, &q);
+ err = err < 0 ? -errno : 0;
+ assert(err == -ENOENT);
+ }
+
return 0;
}
#else
diff --git a/tools/testing/selftests/proc/proc-self-isnt-kthread.c b/tools/testing/selftests/proc/proc-self-isnt-kthread.c
new file mode 100644
index 000000000000..e01f4e0a91b4
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-self-isnt-kthread.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test that a userspace program is not reported as a kernel thread. */
+#undef NDEBUG
+#include <assert.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+int main(void)
+{
+ /* Read this process's own status file in a single read(). */
+ int fd = open("/proc/self/status", O_RDONLY);
+ assert(fd >= 0);
+
+ char buf[4096];
+ ssize_t rv = read(fd, buf, sizeof(buf));
+ /* Strictly less than sizeof(buf): keeps one byte free for the NUL below. */
+ assert(0 <= rv && rv < sizeof(buf));
+ buf[rv] = '\0';
+
+ /* This test is very much not a kernel thread. */
+ assert(strstr(buf, "Kthread:\t0\n"));
+
+ return 0;
+}
diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile
index 021863f86053..f408bd6bfc3d 100644
--- a/tools/testing/selftests/resctrl/Makefile
+++ b/tools/testing/selftests/resctrl/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2 -D_GNU_SOURCE
+CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2
CFLAGS += $(KHDR_INCLUDES)
TEST_GEN_PROGS := resctrl_tests
diff --git a/tools/testing/selftests/ring-buffer/Makefile b/tools/testing/selftests/ring-buffer/Makefile
index 627c5fa6d1ab..23605782639e 100644
--- a/tools/testing/selftests/ring-buffer/Makefile
+++ b/tools/testing/selftests/ring-buffer/Makefile
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -Wl,-no-as-needed -Wall
CFLAGS += $(KHDR_INCLUDES)
-CFLAGS += -D_GNU_SOURCE
TEST_GEN_PROGS = map_test
diff --git a/tools/testing/selftests/riscv/mm/Makefile b/tools/testing/selftests/riscv/mm/Makefile
index c333263f2b27..4664ed79e20b 100644
--- a/tools/testing/selftests/riscv/mm/Makefile
+++ b/tools/testing/selftests/riscv/mm/Makefile
@@ -3,7 +3,7 @@
# Originally tools/testing/arm64/abi/Makefile
# Additional include paths needed by kselftest.h and local headers
-CFLAGS += -D_GNU_SOURCE -std=gnu99 -I.
+CFLAGS += -std=gnu99 -I.
TEST_GEN_FILES := mmap_default mmap_bottomup
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index e3f97f90d8db..8c3a73461475 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -60,7 +60,9 @@
#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
#endif
+#ifndef MIN
#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
+#endif
#ifndef PR_SET_PTRACER
# define PR_SET_PTRACER 0x59616d61
diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile
index 867f88ce2570..03b5e13b872b 100644
--- a/tools/testing/selftests/sgx/Makefile
+++ b/tools/testing/selftests/sgx/Makefile
@@ -12,7 +12,7 @@ OBJCOPY := $(CROSS_COMPILE)objcopy
endif
INCLUDES := -I$(top_srcdir)/tools/include
-HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC
+HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC $(CFLAGS)
HOST_LDFLAGS := -z noexecstack -lcrypto
ENCL_CFLAGS += -Wall -Werror -static-pie -nostdlib -ffreestanding -fPIE \
-fno-stack-protector -mrdrnd $(INCLUDES)
diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py
index ee349187636f..4f255cec0c22 100755
--- a/tools/testing/selftests/tc-testing/tdc.py
+++ b/tools/testing/selftests/tc-testing/tdc.py
@@ -143,7 +143,6 @@ class PluginMgr:
except Exception as ee:
print('exception {} in call to pre_case for {} plugin'.
format(ee, pgn_inst.__class__))
- print('test_ordinal is {}'.format(test_ordinal))
print('testid is {}'.format(caseinfo['id']))
raise
diff --git a/tools/testing/selftests/tmpfs/Makefile b/tools/testing/selftests/tmpfs/Makefile
index aa11ccc92e5b..3be931e1193f 100644
--- a/tools/testing/selftests/tmpfs/Makefile
+++ b/tools/testing/selftests/tmpfs/Makefile
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O2
-CFLAGS += -D_GNU_SOURCE
TEST_GEN_PROGS :=
TEST_GEN_PROGS += bug-link-o-tmpfile
diff --git a/tools/testing/selftests/turbostat/added_perf_counters.py b/tools/testing/selftests/turbostat/added_perf_counters.py
new file mode 100755
index 000000000000..9ab4aaf45fb8
--- /dev/null
+++ b/tools/testing/selftests/turbostat/added_perf_counters.py
@@ -0,0 +1,178 @@
+#!/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+from shutil import which
+from os import pread
+
+class PerfCounterInfo:
+ def __init__(self, subsys, event):
+ self.subsys = subsys
+ self.event = event
+
+ def get_perf_event_name(self):
+ return f'{self.subsys}/{self.event}/'
+
+ def get_turbostat_perf_id(self, counter_scope, counter_type, column_name):
+ return f'perf/{self.subsys}/{self.event},{counter_scope},{counter_type},{column_name}'
+
+PERF_COUNTERS_CANDIDATES = [
+ PerfCounterInfo('msr', 'mperf'),
+ PerfCounterInfo('msr', 'aperf'),
+ PerfCounterInfo('msr', 'tsc'),
+ PerfCounterInfo('cstate_core', 'c1-residency'),
+ PerfCounterInfo('cstate_core', 'c6-residency'),
+ PerfCounterInfo('cstate_core', 'c7-residency'),
+ PerfCounterInfo('cstate_pkg', 'c2-residency'),
+ PerfCounterInfo('cstate_pkg', 'c3-residency'),
+ PerfCounterInfo('cstate_pkg', 'c6-residency'),
+ PerfCounterInfo('cstate_pkg', 'c7-residency'),
+ PerfCounterInfo('cstate_pkg', 'c8-residency'),
+ PerfCounterInfo('cstate_pkg', 'c9-residency'),
+ PerfCounterInfo('cstate_pkg', 'c10-residency'),
+]
+present_perf_counters = []
+
+def check_perf_access():
+ perf = which('perf')
+ if perf is None:
+ print('SKIP: Could not find perf binary, thus could not determine perf access.')
+ return False
+
+ def has_perf_counter_access(counter_name):
+ proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'],
+ capture_output = True)
+
+ if proc_perf.returncode != 0:
+ print(f'SKIP: Could not read {counter_name} perf counter.')
+ return False
+
+ if b'<not supported>' in proc_perf.stderr:
+ print(f'SKIP: Could not read {counter_name} perf counter.')
+ return False
+
+ return True
+
+ for counter in PERF_COUNTERS_CANDIDATES:
+ if has_perf_counter_access(counter.get_perf_event_name()):
+ present_perf_counters.append(counter)
+
+ if len(present_perf_counters) == 0:
+ print('SKIP: Could not read any perf counter.')
+ return False
+
+ if len(present_perf_counters) != len(PERF_COUNTERS_CANDIDATES):
+ print(f'WARN: Could not access all of the counters - some will be left untested')
+
+ return True
+
+if not check_perf_access():
+ exit(0)
+
+turbostat_counter_source_opts = ['']
+
+turbostat = which('turbostat')
+if turbostat is None:
+ print('Could not find turbostat binary')
+ exit(1)
+
+timeout = which('timeout')
+if timeout is None:
+ print('Could not find timeout binary')
+ exit(1)
+
+proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True)
+if proc_turbostat.returncode != 0:
+ print(f'turbostat failed with {proc_turbostat.returncode}')
+ exit(1)
+
+EXPECTED_COLUMNS_DEBUG_DEFAULT = [b'usec', b'Time_Of_Day_Seconds', b'APIC', b'X2APIC']
+
+expected_columns = [b'CPU']
+counters_argv = []
+for counter in present_perf_counters:
+ if counter.subsys == 'cstate_core':
+ counter_scope = 'core'
+ elif counter.subsys == 'cstate_pkg':
+ counter_scope = 'package'
+ else:
+ counter_scope = 'cpu'
+
+ counter_type = 'delta'
+ column_name = counter.event
+
+ cparams = counter.get_turbostat_perf_id(
+ counter_scope = counter_scope,
+ counter_type = counter_type,
+ column_name = column_name
+ )
+ expected_columns.append(column_name.encode())
+ counters_argv.extend(['--add', cparams])
+
+expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + expected_columns
+
+def gen_user_friendly_cmdline(argv_):
+ argv = argv_[:]
+ ret = ''
+
+ while len(argv) != 0:
+ arg = argv.pop(0)
+ arg_next = ''
+
+ if arg in ('-i', '--show', '--add'):
+ arg_next = argv.pop(0) if len(argv) > 0 else ''
+
+ ret += f'{arg} {arg_next} \\\n\t'
+
+ # Remove the last separator and return
+ return ret[:-4]
+
+#
+# Run turbostat for some time and send SIGINT
+#
+timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s']
+turbostat_argv = [turbostat, '-i', '0.50', '--show', 'CPU'] + counters_argv
+
+def check_columns_or_fail(expected_columns: list, actual_columns: list):
+ if len(actual_columns) != len(expected_columns):
+ print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}')
+ exit(1)
+
+ failed = False
+ for expected_column in expected_columns:
+ if expected_column not in actual_columns:
+ print(f'turbostat column check failed: missing column {expected_column.decode()}')
+ failed = True
+
+ if failed:
+ exit(1)
+
+cmdline = gen_user_friendly_cmdline(turbostat_argv)
+print(f'Running turbostat with:\n\t{cmdline}\n... ', end = '', flush = True)
+proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+if proc_turbostat.returncode != 0:
+ print(f'turbostat failed with {proc_turbostat.returncode}')
+ exit(1)
+
+actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t')
+check_columns_or_fail(expected_columns, actual_columns)
+print('OK')
+
+#
+# Same, but with --debug
+#
+# We explicitly specify '--show CPU' to make sure turbostat
+# doesn't show a bunch of default counters instead.
+#
+turbostat_argv.append('--debug')
+
+cmdline = gen_user_friendly_cmdline(turbostat_argv)
+print(f'Running turbostat (in debug mode) with:\n\t{cmdline}\n... ', end = '', flush = True)
+proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+if proc_turbostat.returncode != 0:
+ print(f'turbostat failed with {proc_turbostat.returncode}')
+ exit(1)
+
+actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t')
+check_columns_or_fail(expected_columns_debug, actual_columns)
+print('OK')
diff --git a/tools/testing/selftests/turbostat/smi_aperf_mperf.py b/tools/testing/selftests/turbostat/smi_aperf_mperf.py
new file mode 100755
index 000000000000..6289cc47d5f0
--- /dev/null
+++ b/tools/testing/selftests/turbostat/smi_aperf_mperf.py
@@ -0,0 +1,157 @@
+#!/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+from shutil import which
+from os import pread
+
+# CDLL calls dlopen underneath.
+# Calling it with None (null), we get a handle to our own image (the python interpreter).
+# We hope to find sched_getcpu() inside ;]
+# This is a bit ugly, but helps shipping working software, so..
+try:
+ import ctypes
+
+ this_image = ctypes.CDLL(None)
+ BASE_CPU = this_image.sched_getcpu()
+except:
+ BASE_CPU = 0 # If we fail, set to 0 and pray it's not offline.
+
+MSR_IA32_MPERF = 0x000000e7
+MSR_IA32_APERF = 0x000000e8
+
+def check_perf_access():
+ perf = which('perf')
+ if perf is None:
+ print('SKIP: Could not find perf binary, thus could not determine perf access.')
+ return False
+
+ def has_perf_counter_access(counter_name):
+ proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'],
+ capture_output = True)
+
+ if proc_perf.returncode != 0:
+ print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.')
+ return False
+
+ if b'<not supported>' in proc_perf.stderr:
+ print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.')
+ return False
+
+ return True
+
+ if not has_perf_counter_access('msr/mperf/'):
+ return False
+ if not has_perf_counter_access('msr/aperf/'):
+ return False
+ if not has_perf_counter_access('msr/smi/'):
+ return False
+
+ return True
+
+def check_msr_access():
+ try:
+ file_msr = open(f'/dev/cpu/{BASE_CPU}/msr', 'rb')
+ except:
+ return False
+
+ if len(pread(file_msr.fileno(), 8, MSR_IA32_MPERF)) != 8:
+ return False
+
+ if len(pread(file_msr.fileno(), 8, MSR_IA32_APERF)) != 8:
+ return False
+
+ return True
+
+has_perf_access = check_perf_access()
+has_msr_access = check_msr_access()
+
+turbostat_counter_source_opts = ['']
+
+if has_msr_access:
+ turbostat_counter_source_opts.append('--no-perf')
+else:
+ print('SKIP: doesn\'t have MSR access, skipping run with --no-perf')
+
+if has_perf_access:
+ turbostat_counter_source_opts.append('--no-msr')
+else:
+ print('SKIP: doesn\'t have perf access, skipping run with --no-msr')
+
+if not has_msr_access and not has_perf_access:
+ print('SKIP: No MSR nor perf access detected. Skipping the tests entirely')
+ exit(0)
+
+turbostat = which('turbostat')
+if turbostat is None:
+ print('Could not find turbostat binary')
+ exit(1)
+
+timeout = which('timeout')
+if timeout is None:
+ print('Could not find timeout binary')
+ exit(1)
+
+proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True)
+if proc_turbostat.returncode != 0:
+ print(f'turbostat failed with {proc_turbostat.returncode}')
+ exit(1)
+
+EXPECTED_COLUMNS_DEBUG_DEFAULT = b'usec\tTime_Of_Day_Seconds\tAPIC\tX2APIC'
+
+SMI_APERF_MPERF_DEPENDENT_BICS = [
+ 'SMI',
+ 'Avg_MHz',
+ 'Busy%',
+ 'Bzy_MHz',
+]
+if has_perf_access:
+ SMI_APERF_MPERF_DEPENDENT_BICS.append('IPC')
+
+for bic in SMI_APERF_MPERF_DEPENDENT_BICS:
+ for counter_source_opt in turbostat_counter_source_opts:
+
+ # Ugly special case, but it is what it is..
+ if counter_source_opt == '--no-perf' and bic == 'IPC':
+ continue
+
+ expected_columns = bic.encode()
+ expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + f'\t{bic}'.encode()
+
+ #
+ # Run turbostat for some time and send SIGINT
+ #
+ timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s']
+ turbostat_argv = [turbostat, '-i', '0.50', '--show', bic]
+
+ if counter_source_opt:
+ turbostat_argv.append(counter_source_opt)
+
+ print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True)
+ proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+ if proc_turbostat.returncode != 0:
+ print(f'turbostat failed with {proc_turbostat.returncode}')
+ exit(1)
+
+ actual_columns = proc_turbostat.stdout.split(b'\n')[0]
+ if expected_columns != actual_columns:
+ print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}')
+ exit(1)
+ print('OK')
+
+ #
+ # Same, but with --debug
+ #
+ turbostat_argv.append('--debug')
+
+ print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True)
+ proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+ if proc_turbostat.returncode != 0:
+ print(f'turbostat failed with {proc_turbostat.returncode}')
+ exit(1)
+
+ actual_columns = proc_turbostat.stdout.split(b'\n')[0]
+ if expected_columns_debug != actual_columns:
+ print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}')
+ exit(1)
+ print('OK')
diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore
index a8dc51af5a9c..30d5c8f0e5c7 100644
--- a/tools/testing/selftests/vDSO/.gitignore
+++ b/tools/testing/selftests/vDSO/.gitignore
@@ -6,3 +6,5 @@ vdso_test_correctness
vdso_test_gettimeofday
vdso_test_getcpu
vdso_standalone_test_x86
+vdso_test_getrandom
+vdso_test_chacha
diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
index 98d8ba2afa00..3de8e7e052ae 100644
--- a/tools/testing/selftests/vDSO/Makefile
+++ b/tools/testing/selftests/vDSO/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
uname_M := $(shell uname -m 2>/dev/null || echo not)
ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+SODIUM := $(shell pkg-config --libs libsodium 2>/dev/null)
TEST_GEN_PROGS := vdso_test_gettimeofday
TEST_GEN_PROGS += vdso_test_getcpu
@@ -10,6 +11,12 @@ ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64))
TEST_GEN_PROGS += vdso_standalone_test_x86
endif
TEST_GEN_PROGS += vdso_test_correctness
+ifeq ($(uname_M),x86_64)
+TEST_GEN_PROGS += vdso_test_getrandom
+ifneq ($(SODIUM),)
+TEST_GEN_PROGS += vdso_test_chacha
+endif
+endif
CFLAGS := -std=gnu99
@@ -28,3 +35,14 @@ $(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind-
$(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c
$(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl
+
+$(OUTPUT)/vdso_test_getrandom: parse_vdso.c
+$(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \
+ -isystem $(top_srcdir)/include/uapi
+
+$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/arch/$(ARCH)/entry/vdso/vgetrandom-chacha.S
+$(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \
+                                      -isystem $(top_srcdir)/arch/$(ARCH)/include \
+                                      -isystem $(top_srcdir)/include \
+                                      -D__ASSEMBLY__ -DBUILD_VDSO -DCONFIG_FUNCTION_ALIGNMENT=0 \
+                                      -Wa,--noexecstack $(SODIUM)
diff --git a/tools/testing/selftests/vDSO/vdso_test_chacha.c b/tools/testing/selftests/vDSO/vdso_test_chacha.c
new file mode 100644
index 000000000000..e38f44e5f803
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_test_chacha.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <sodium/crypto_stream_chacha20.h>
+#include <sys/random.h>
+#include <string.h>
+#include <stdint.h>
+#include "../kselftest.h"
+
+extern void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, const uint8_t *key, uint32_t *counter, size_t nblocks);
+
+int main(int argc, char *argv[])
+{
+ enum { TRIALS = 1000, BLOCKS = 128, BLOCK_SIZE = 64 };
+ static const uint8_t nonce[8] = { 0 };
+ uint32_t counter[2];
+ uint8_t key[32];
+ uint8_t output1[BLOCK_SIZE * BLOCKS], output2[BLOCK_SIZE * BLOCKS];
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ for (unsigned int trial = 0; trial < TRIALS; ++trial) {
+ if (getrandom(key, sizeof(key), 0) != sizeof(key)) {
+ printf("getrandom() failed!\n");
+ return KSFT_SKIP;
+ }
+ crypto_stream_chacha20(output1, sizeof(output1), nonce, key);
+ for (unsigned int split = 0; split < BLOCKS; ++split) {
+ memset(output2, 'X', sizeof(output2));
+ memset(counter, 0, sizeof(counter));
+ if (split)
+ __arch_chacha20_blocks_nostack(output2, key, counter, split);
+ __arch_chacha20_blocks_nostack(output2 + split * BLOCK_SIZE, key, counter, BLOCKS - split);
+ if (memcmp(output1, output2, sizeof(output1)))
+ return KSFT_FAIL;
+ }
+ }
+ ksft_test_result_pass("chacha: PASS\n");
+ return KSFT_PASS;
+}
diff --git a/tools/testing/selftests/vDSO/vdso_test_getrandom.c b/tools/testing/selftests/vDSO/vdso_test_getrandom.c
new file mode 100644
index 000000000000..05122425a873
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_test_getrandom.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/random.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/random.h>
+
+#include "../kselftest.h"
+#include "parse_vdso.h"
+
+#ifndef timespecsub
+#define timespecsub(tsp, usp, vsp) \
+ do { \
+ (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \
+ (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \
+ if ((vsp)->tv_nsec < 0) { \
+ (vsp)->tv_sec--; \
+ (vsp)->tv_nsec += 1000000000L; \
+ } \
+ } while (0)
+#endif
+
+static struct {
+ pthread_mutex_t lock;
+ void **states;
+ size_t len, cap;
+} grnd_allocator = {
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct {
+ ssize_t(*fn)(void *, size_t, unsigned long, void *, size_t);
+ pthread_key_t key;
+ pthread_once_t initialized;
+ struct vgetrandom_opaque_params params;
+} grnd_ctx = {
+ .initialized = PTHREAD_ONCE_INIT
+};
+
+static void *vgetrandom_get_state(void)
+{
+ void *state = NULL;
+
+ pthread_mutex_lock(&grnd_allocator.lock);
+ if (!grnd_allocator.len) {
+ size_t page_size = getpagesize();
+ size_t new_cap;
+ size_t alloc_size, num = sysconf(_SC_NPROCESSORS_ONLN); /* Just a decent heuristic. */
+ void *new_block, *new_states;
+
+ alloc_size = (num * grnd_ctx.params.size_of_opaque_state + page_size - 1) & (~(page_size - 1));
+ num = (page_size / grnd_ctx.params.size_of_opaque_state) * (alloc_size / page_size);
+ new_block = mmap(0, alloc_size, grnd_ctx.params.mmap_prot, grnd_ctx.params.mmap_flags, -1, 0);
+ if (new_block == MAP_FAILED)
+ goto out;
+
+ new_cap = grnd_allocator.cap + num;
+ new_states = reallocarray(grnd_allocator.states, new_cap, sizeof(*grnd_allocator.states));
+ if (!new_states)
+ goto unmap;
+ grnd_allocator.cap = new_cap;
+ grnd_allocator.states = new_states;
+
+ for (size_t i = 0; i < num; ++i) {
+ if (((uintptr_t)new_block & (page_size - 1)) + grnd_ctx.params.size_of_opaque_state > page_size)
+ new_block = (void *)(((uintptr_t)new_block + page_size - 1) & (~(page_size - 1)));
+ grnd_allocator.states[i] = new_block;
+ new_block += grnd_ctx.params.size_of_opaque_state;
+ }
+ grnd_allocator.len = num;
+ goto success;
+
+ unmap:
+ munmap(new_block, alloc_size);
+ goto out;
+ }
+success:
+ state = grnd_allocator.states[--grnd_allocator.len];
+
+out:
+ pthread_mutex_unlock(&grnd_allocator.lock);
+ return state;
+}
+
+static void vgetrandom_put_state(void *state)
+{
+ if (!state)
+ return;
+ pthread_mutex_lock(&grnd_allocator.lock);
+ grnd_allocator.states[grnd_allocator.len++] = state;
+ pthread_mutex_unlock(&grnd_allocator.lock);
+}
+
+static void vgetrandom_init(void) /* pthread_once() callback: create TLS key, find vDSO symbol, fetch params */
+{
+	if (pthread_key_create(&grnd_ctx.key, vgetrandom_put_state) != 0)
+		exit(KSFT_FAIL); /* a bare return would leave grnd_ctx.fn NULL for vgetrandom() to call */
+	unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR);
+	if (!sysinfo_ehdr) {
+		printf("AT_SYSINFO_EHDR is not present!\n");
+		exit(KSFT_SKIP);
+	}
+	vdso_init_from_sysinfo_ehdr(sysinfo_ehdr);
+	grnd_ctx.fn = (__typeof__(grnd_ctx.fn))vdso_sym("LINUX_2.6", "__vdso_getrandom");
+	if (!grnd_ctx.fn) {
+		printf("__vdso_getrandom is missing!\n");
+		exit(KSFT_FAIL);
+	}
+	if (grnd_ctx.fn(NULL, 0, 0, &grnd_ctx.params, ~0UL) != 0) { /* params query, no random bytes requested */
+		printf("failed to fetch vgetrandom params!\n");
+		exit(KSFT_FAIL);
+	}
+}
+
+static ssize_t vgetrandom(void *buf, size_t len, unsigned long flags)
+{
+ void *state;
+
+ pthread_once(&grnd_ctx.initialized, vgetrandom_init);
+ state = pthread_getspecific(grnd_ctx.key);
+ if (!state) {
+ state = vgetrandom_get_state();
+ if (pthread_setspecific(grnd_ctx.key, state) != 0) {
+ vgetrandom_put_state(state);
+ state = NULL;
+ }
+ if (!state) {
+ printf("vgetrandom_get_state failed!\n");
+ exit(KSFT_FAIL);
+ }
+ }
+ return grnd_ctx.fn(buf, len, flags, state, grnd_ctx.params.size_of_opaque_state);
+}
+
+enum { TRIALS = 25000000, THREADS = 256 };
+
+static void *test_vdso_getrandom(void *ctx) /* thread-start compatible; ctx is unused, always returns NULL */
+{
+	for (size_t i = 0; i < TRIALS; ++i) { /* TRIALS draws through the vDSO wrapper */
+		unsigned int val;
+		ssize_t ret = vgetrandom(&val, sizeof(val), 0);
+		assert(ret == sizeof(val)); /* a short read counts as failure */
+	}
+	return NULL;
+}
+
+static void *test_libc_getrandom(void *ctx) /* thread-start compatible; ctx is unused, always returns NULL */
+{
+	for (size_t i = 0; i < TRIALS; ++i) { /* TRIALS draws through libc's getrandom() */
+		unsigned int val;
+		ssize_t ret = getrandom(&val, sizeof(val), 0);
+		assert(ret == sizeof(val)); /* a short read counts as failure */
+	}
+	return NULL;
+}
+
+static void *test_syscall_getrandom(void *ctx) /* thread-start compatible; ctx is unused, always returns NULL */
+{
+	for (size_t i = 0; i < TRIALS; ++i) { /* TRIALS draws through the raw getrandom syscall */
+		unsigned int val;
+		ssize_t ret = syscall(__NR_getrandom, &val, sizeof(val), 0);
+		assert(ret == sizeof(val)); /* a short read counts as failure */
+	}
+	return NULL;
+}
+
+static void bench_single(void)
+{
+ struct timespec start, end, diff;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ test_vdso_getrandom(NULL);
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ timespecsub(&end, &start, &diff);
+ printf(" vdso: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ test_libc_getrandom(NULL);
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ timespecsub(&end, &start, &diff);
+ printf(" libc: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ test_syscall_getrandom(NULL);
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ timespecsub(&end, &start, &diff);
+ printf("syscall: %u times in %lu.%09lu seconds\n", TRIALS, diff.tv_sec, diff.tv_nsec);
+}
+
+static void bench_multi(void)
+{
+ struct timespec start, end, diff;
+ pthread_t threads[THREADS];
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ for (size_t i = 0; i < THREADS; ++i)
+ assert(pthread_create(&threads[i], NULL, test_vdso_getrandom, NULL) == 0);
+ for (size_t i = 0; i < THREADS; ++i)
+ pthread_join(threads[i], NULL);
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ timespecsub(&end, &start, &diff);
+ printf(" vdso: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ for (size_t i = 0; i < THREADS; ++i)
+ assert(pthread_create(&threads[i], NULL, test_libc_getrandom, NULL) == 0);
+ for (size_t i = 0; i < THREADS; ++i)
+ pthread_join(threads[i], NULL);
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ timespecsub(&end, &start, &diff);
+ printf(" libc: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ for (size_t i = 0; i < THREADS; ++i)
+ assert(pthread_create(&threads[i], NULL, test_syscall_getrandom, NULL) == 0);
+ for (size_t i = 0; i < THREADS; ++i)
+ pthread_join(threads[i], NULL);
+ clock_gettime(CLOCK_MONOTONIC, &end);
+ timespecsub(&end, &start, &diff);
+ printf(" syscall: %u x %u times in %lu.%09lu seconds\n", TRIALS, THREADS, diff.tv_sec, diff.tv_nsec);
+}
+
+static void fill(void)
+{
+ uint8_t weird_size[323929];
+ for (;;)
+ vgetrandom(weird_size, sizeof(weird_size), 0);
+}
+
+static void kselftest(void)
+{
+ uint8_t weird_size[1263];
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ for (size_t i = 0; i < 1000; ++i) {
+ ssize_t ret = vgetrandom(weird_size, sizeof(weird_size), 0);
+ if (ret != sizeof(weird_size))
+ exit(KSFT_FAIL);
+ }
+
+ ksft_test_result_pass("getrandom: PASS\n");
+ exit(KSFT_PASS);
+}
+
+static void usage(const char *argv0)
+{
+ fprintf(stderr, "Usage: %s [bench-single|bench-multi|fill]\n", argv0);
+}
+
+int main(int argc, char *argv[])
+{
+ if (argc == 1) {
+ kselftest();
+ return 0;
+ }
+
+ if (argc != 2) {
+ usage(argv[0]);
+ return 1;
+ }
+ if (!strcmp(argv[1], "bench-single"))
+ bench_single();
+ else if (!strcmp(argv[1], "bench-multi"))
+ bench_multi();
+ else if (!strcmp(argv[1], "fill"))
+ fill();
+ else {
+ usage(argv[0]);
+ return 1;
+ }
+ return 0;
+}
diff --git a/tools/tracing/latency/Makefile.config b/tools/tracing/latency/Makefile.config
index b25e531a1f95..0fe6b50f029b 100644
--- a/tools/tracing/latency/Makefile.config
+++ b/tools/tracing/latency/Makefile.config
@@ -3,8 +3,9 @@
STOP_ERROR :=
define lib_setup
- $(eval EXTLIBS += -l$(1))
$(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
+ $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
+ $(eval EXTLIBS += $(shell sh -c "$(PKG_CONFIG) --libs-only-l lib$(1)"))
endef
$(call feature_check,libtraceevent)
diff --git a/tools/tracing/rtla/Makefile.config b/tools/tracing/rtla/Makefile.config
index 0b7ecfb30d19..5f8c286712d4 100644
--- a/tools/tracing/rtla/Makefile.config
+++ b/tools/tracing/rtla/Makefile.config
@@ -7,7 +7,8 @@ LIBTRACEFS_MIN_VERSION = 1.6
define lib_setup
$(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
- $(eval EXTLIBS += -l$(1))
+ $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
+ $(eval EXTLIBS += $(shell sh -c "$(PKG_CONFIG) --libs-only-l lib$(1)"))
endef
$(call feature_check,libtraceevent)
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index f594a44df840..2f756628613d 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -651,8 +651,10 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params)
return NULL;
tool->data = osnoise_alloc_top(nr_cpus);
- if (!tool->data)
- goto out_err;
+ if (!tool->data) {
+ osnoise_destroy_tool(tool);
+ return NULL;
+ }
tool->params = params;
@@ -660,11 +662,6 @@ struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params)
osnoise_top_handler, NULL);
return tool;
-
-out_err:
- osnoise_free_top(tool->data);
- osnoise_destroy_tool(tool);
- return NULL;
}
static int stop_tracing;
diff --git a/tools/verification/rv/Makefile.config b/tools/verification/rv/Makefile.config
index 6d4ba77847b6..066302230eb2 100644
--- a/tools/verification/rv/Makefile.config
+++ b/tools/verification/rv/Makefile.config
@@ -7,7 +7,8 @@ LIBTRACEFS_MIN_VERSION = 1.3
define lib_setup
$(eval LIB_INCLUDES += $(shell sh -c "$(PKG_CONFIG) --cflags lib$(1)"))
- $(eval EXTLIBS += -l$(1))
+ $(eval LDFLAGS += $(shell sh -c "$(PKG_CONFIG) --libs-only-L lib$(1)"))
+ $(eval EXTLIBS += $(shell sh -c "$(PKG_CONFIG) --libs-only-l lib$(1)"))
endef
$(call feature_check,libtraceevent)