diff options
Diffstat (limited to 'tools')
162 files changed, 4452 insertions, 803 deletions
diff --git a/tools/arch/s390/include/uapi/asm/ptrace.h b/tools/arch/s390/include/uapi/asm/ptrace.h index 543dd70e12c8..ad64d673b5e6 100644 --- a/tools/arch/s390/include/uapi/asm/ptrace.h +++ b/tools/arch/s390/include/uapi/asm/ptrace.h @@ -179,8 +179,9 @@ #define ACR_SIZE 4 -#define PTRACE_OLDSETOPTIONS 21 - +#define PTRACE_OLDSETOPTIONS 21 +#define PTRACE_SYSEMU 31 +#define PTRACE_SYSEMU_SINGLESTEP 32 #ifndef __ASSEMBLY__ #include <linux/stddef.h> #include <linux/types.h> diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 84b887825f12..cc96e26d69f7 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ +/* FREE! ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +/* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ +/* FREE! ( 7*32+20) */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -236,8 +236,6 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -294,6 +292,7 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ @@ -337,6 +336,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ @@ -385,6 +385,13 @@ #define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ +#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + /* * BUG word(s) */ diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 52c6262e6bfd..cc777c185212 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -7,9 +7,12 @@ * Copyright (C) IBM Corporation, 2009 */ +#include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ #include "inat.h" +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) + struct insn_field { union { insn_value_t value; @@ -20,6 +23,48 @@ struct insn_field { unsigned char nbytes; }; +static inline void insn_field_set(struct insn_field *p, insn_value_t v, + unsigned char n) +{ + p->value = v; + p->nbytes = n; +} + +static inline void insn_set_byte(struct insn_field *p, unsigned char n, + insn_byte_t v) +{ + p->bytes[n] = v; +} + +#else + +struct insn_field { + insn_value_t value; + union { + insn_value_t little; + insn_byte_t bytes[4]; + }; + /* !0 if we've run insn_get_xxx() for this field */ + unsigned char got; + unsigned char nbytes; +}; + +static inline void insn_field_set(struct insn_field *p, insn_value_t v, + unsigned char n) +{ + p->value = v; + p->little = __cpu_to_le32(v); + p->nbytes = n; +} + +static inline void insn_set_byte(struct insn_field *p, unsigned char n, + insn_byte_t v) +{ + p->bytes[n] = v; + p->value = __le32_to_cpu(p->little); +} +#endif + struct insn { struct insn_field prefixes; /* * Prefixes diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index fdbffec4cfde..5a2baf28a1dc 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -40,6 +40,8 @@ #define ORC_REG_MAX 15 #ifndef __ASSEMBLY__ +#include <asm/byteorder.h> + /* * This struct is more or less a vastly simplified version of the DWARF Call * Frame Information standard. It contains only the necessary parts of DWARF @@ -51,10 +53,18 @@ struct orc_entry { s16 sp_offset; s16 bp_offset; +#if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; unsigned end:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned bp_reg:4; + unsigned sp_reg:4; + unsigned unused:5; + unsigned end:1; + unsigned type:2; +#endif } __packed; #endif /* __ASSEMBLY__ */ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 8e76d3701db3..5a3022c8af82 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -112,6 +112,7 @@ struct kvm_ioapic_state { #define KVM_NR_IRQCHIPS 3 #define KVM_RUN_X86_SMM (1 << 0) +#define KVM_RUN_X86_BUS_LOCK (1 << 1) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index ada955c5ebb6..b8e650a985e3 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -89,6 +89,7 @@ #define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 +#define EXIT_REASON_BUS_LOCK 74 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -150,7 +151,8 @@ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ - { EXIT_REASON_TPAUSE, "TPAUSE" } + { EXIT_REASON_TPAUSE, "TPAUSE" }, \ + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c index 0151dfc6da61..3d9355ed1246 100644 --- a/tools/arch/x86/lib/insn.c +++ b/tools/arch/x86/lib/insn.c @@ -5,6 +5,7 @@ * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ +#include <linux/kernel.h> #ifdef __KERNEL__ #include <linux/string.h> #else @@ -15,15 +16,28 @@ #include "../include/asm/emulate_prefix.h" +#define leXX_to_cpu(t, r) \ +({ \ + __typeof__(t) v; \ + switch (sizeof(t)) { \ + case 4: v = le32_to_cpu(r); break; \ + case 2: v = le16_to_cpu(r); break; \ + case 1: v = r; break; \ + default: \ + BUILD_BUG(); break; \ + } \ + v; \ +}) + /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r = *(t*)((insn)->next_byte + n); r; }) + ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) @@ -147,9 +161,9 @@ found: b = insn->prefixes.bytes[3]; for (i = 0; i < nb; i++) if (prefixes->bytes[i] == lb) - prefixes->bytes[i] = b; + insn_set_byte(prefixes, i, b); } - insn->prefixes.bytes[3] = lb; + insn_set_byte(&insn->prefixes, 3, lb); } /* Decode REX prefix */ @@ -157,8 +171,7 @@ found: b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_rex_prefix(attr)) { - insn->rex_prefix.value = b; - insn->rex_prefix.nbytes = 1; + insn_field_set(&insn->rex_prefix, b, 1); insn->next_byte++; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ @@ -181,13 +194,13 @@ found: if (X86_MODRM_MOD(b2) != 3) goto vex_end; } - insn->vex_prefix.bytes[0] = b; - insn->vex_prefix.bytes[1] = b2; + insn_set_byte(&insn->vex_prefix, 0, b); + insn_set_byte(&insn->vex_prefix, 1, b2); if (inat_is_evex_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); - insn->vex_prefix.bytes[2] = b2; + insn_set_byte(&insn->vex_prefix, 2, b2); b2 = peek_nbyte_next(insn_byte_t, insn, 3); - insn->vex_prefix.bytes[3] = b2; + insn_set_byte(&insn->vex_prefix, 3, b2); insn->vex_prefix.nbytes = 4; insn->next_byte += 4; if (insn->x86_64 && X86_VEX_W(b2)) @@ -195,7 +208,7 @@ found: insn->opnd_bytes = 8; } else if (inat_is_vex3_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); - insn->vex_prefix.bytes[2] = b2; + insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) @@ -207,7 +220,7 @@ found: * Makes it easier to decode vex.W, vex.vvvv, * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. */ - insn->vex_prefix.bytes[2] = b2 & 0x7f; + insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f); insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } @@ -243,7 +256,7 @@ void insn_get_opcode(struct insn *insn) /* Get first opcode */ op = get_next(insn_byte_t, insn); - opcode->bytes[0] = op; + insn_set_byte(opcode, 0, op); opcode->nbytes = 1; /* Check if there is VEX prefix or not */ @@ -295,8 +308,7 @@ void insn_get_modrm(struct insn *insn) if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); - modrm->value = mod; - modrm->nbytes = 1; + insn_field_set(modrm, mod, 1); if (inat_is_group(insn->attr)) { pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, @@ -334,7 +346,7 @@ int insn_rip_relative(struct insn *insn) * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ - return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); + return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5); } /** @@ -353,11 +365,11 @@ void insn_get_sib(struct insn *insn) if (!insn->modrm.got) insn_get_modrm(insn); if (insn->modrm.nbytes) { - modrm = (insn_byte_t)insn->modrm.value; + modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { - insn->sib.value = get_next(insn_byte_t, insn); - insn->sib.nbytes = 1; + insn_field_set(&insn->sib, + get_next(insn_byte_t, insn), 1); } } insn->sib.got = 1; @@ -407,19 +419,18 @@ void insn_get_displacement(struct insn *insn) if (mod == 3) goto out; if (mod == 1) { - insn->displacement.value = get_next(signed char, insn); - insn->displacement.nbytes = 1; + insn_field_set(&insn->displacement, + get_next(signed char, insn), 1); } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { - insn->displacement.value = - get_next(short, insn); - insn->displacement.nbytes = 2; + insn_field_set(&insn->displacement, + get_next(short, insn), 2); } } else { if ((mod == 0 && rm == 5) || mod == 2 || (mod == 0 && base == 5)) { - insn->displacement.value = get_next(int, insn); - insn->displacement.nbytes = 4; + insn_field_set(&insn->displacement, + get_next(int, insn), 4); } } } @@ -435,18 +446,14 @@ static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: - insn->moffset1.value = get_next(short, insn); - insn->moffset1.nbytes = 2; + insn_field_set(&insn->moffset1, get_next(short, insn), 2); break; case 4: - insn->moffset1.value = get_next(int, insn); - insn->moffset1.nbytes = 4; + insn_field_set(&insn->moffset1, get_next(int, insn), 4); break; case 8: - insn->moffset1.value = get_next(int, insn); - insn->moffset1.nbytes = 4; - insn->moffset2.value = get_next(int, insn); - insn->moffset2.nbytes = 4; + insn_field_set(&insn->moffset1, get_next(int, insn), 4); + insn_field_set(&insn->moffset2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; @@ -464,13 +471,11 @@ static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: - insn->immediate.value = get_next(short, insn); - insn->immediate.nbytes = 2; + insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case 4: case 8: - insn->immediate.value = get_next(int, insn); - insn->immediate.nbytes = 4; + insn_field_set(&insn->immediate, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; @@ -487,18 +492,15 @@ static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: - insn->immediate1.value = get_next(short, insn); - insn->immediate1.nbytes = 2; + insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: - insn->immediate1.value = get_next(int, insn); + insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn->immediate1.nbytes = 4; break; case 8: - insn->immediate1.value = get_next(int, insn); - insn->immediate1.nbytes = 4; - insn->immediate2.value = get_next(int, insn); - insn->immediate2.nbytes = 4; + insn_field_set(&insn->immediate1, get_next(int, insn), 4); + insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; @@ -515,12 +517,10 @@ static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: - insn->immediate1.value = get_next(short, insn); - insn->immediate1.nbytes = 2; + insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: - insn->immediate1.value = get_next(int, insn); - insn->immediate1.nbytes = 4; + insn_field_set(&insn->immediate1, get_next(int, insn), 4); break; case 8: /* ptr16:64 is not exist (no segment) */ @@ -528,8 +528,7 @@ static int __get_immptr(struct insn *insn) default: /* opnd_bytes must be modified manually */ goto err_out; } - insn->immediate2.value = get_next(unsigned short, insn); - insn->immediate2.nbytes = 2; + insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2); insn->immediate1.got = insn->immediate2.got = 1; return 1; @@ -565,22 +564,17 @@ void insn_get_immediate(struct insn *insn) switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: - insn->immediate.value = get_next(signed char, insn); - insn->immediate.nbytes = 1; + insn_field_set(&insn->immediate, get_next(signed char, insn), 1); break; case INAT_IMM_WORD: - insn->immediate.value = get_next(short, insn); - insn->immediate.nbytes = 2; + insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case INAT_IMM_DWORD: - insn->immediate.value = get_next(int, insn); - insn->immediate.nbytes = 4; + insn_field_set(&insn->immediate, get_next(int, insn), 4); break; case INAT_IMM_QWORD: - insn->immediate1.value = get_next(int, insn); - insn->immediate1.nbytes = 4; - insn->immediate2.value = get_next(int, insn); - insn->immediate2.nbytes = 4; + insn_field_set(&insn->immediate1, get_next(int, insn), 4); + insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; case INAT_IMM_PTR: if (!__get_immptr(insn)) @@ -599,8 +593,7 @@ void insn_get_immediate(struct insn *insn) goto err_out; } if (inat_has_second_immediate(insn->attr)) { - insn->immediate2.value = get_next(signed char, insn); - insn->immediate2.nbytes = 1; + insn_field_set(&insn->immediate2, get_next(signed char, insn), 1); } done: insn->immediate.got = 1; diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 7409d7860aa6..80d966cfcaa1 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -260,6 +260,11 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return btf_id__add(root, id, false); } +/* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ +#ifndef SHF_COMPRESSED +#define SHF_COMPRESSED (1 << 11) /* Section with compressed data. */ +#endif + /* * The data of compressed section should be aligned to 4 * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld diff --git a/tools/build/Makefile b/tools/build/Makefile index bae48e6fa995..5ed41b96fcde 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -30,12 +30,18 @@ build := -f $(srctree)/tools/build/Makefile.build dir=. obj all: $(OUTPUT)fixdep +# Make sure there's anything to clean, +# feature contains check for existing OUTPUT +TMP_O := $(if $(OUTPUT),$(OUTPUT)/feature,./) + clean: $(call QUIET_CLEAN, fixdep) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)rm -f $(OUTPUT)fixdep $(call QUIET_CLEAN, feature-detect) - $(Q)$(MAKE) -C feature/ clean >/dev/null +ifneq ($(wildcard $(TMP_O)),) + $(Q)$(MAKE) -C feature OUTPUT=$(TMP_O) clean >/dev/null +endif $(OUTPUT)fixdep-in.o: FORCE $(Q)$(MAKE) $(build)=fixdep diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index b0e35eec6499..4ac5c081af93 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -10,17 +10,27 @@ #define CORESIGHT_ETM_PMU_NAME "cs_etm" #define CORESIGHT_ETM_PMU_SEED 0x10 -/* ETMv3.5/PTM's ETMCR config bit */ -#define ETM_OPT_CYCACC 12 -#define ETM_OPT_CTXTID 14 -#define ETM_OPT_TS 28 -#define ETM_OPT_RETSTK 29 +/* + * Below are the definition of bit offsets for perf option, and works as + * arbitrary values for all ETM versions. + * + * Most of them are orignally from ETMv3.5/PTM's ETMCR config, therefore, + * ETMv3.5/PTM doesn't define ETMCR config bits with prefix "ETM3_" and + * directly use below macros as config bits. + */ +#define ETM_OPT_CYCACC 12 +#define ETM_OPT_CTXTID 14 +#define ETM_OPT_CTXTID2 15 +#define ETM_OPT_TS 28 +#define ETM_OPT_RETSTK 29 /* ETMv4 CONFIGR programming bits for the ETM OPTs */ #define ETM4_CFG_BIT_CYCACC 4 #define ETM4_CFG_BIT_CTXTID 6 +#define ETM4_CFG_BIT_VMID 7 #define ETM4_CFG_BIT_TS 11 #define ETM4_CFG_BIT_RETSTK 12 +#define ETM4_CFG_BIT_VMID_OPT 15 static inline int coresight_get_trace_id(int cpu) { diff --git a/tools/include/linux/export.h b/tools/include/linux/export.h index d07e586b9ba0..acb6f4daa2f0 100644 --- a/tools/include/linux/export.h +++ b/tools/include/linux/export.h @@ -3,8 +3,5 @@ #define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL_GPL(sym) -#define EXPORT_SYMBOL_GPL_FUTURE(sym) -#define EXPORT_UNUSED_SYMBOL(sym) -#define EXPORT_UNUSED_SYMBOL_GPL(sym) #endif diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 577f51436cf9..7e72d975cb76 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -29,11 +29,14 @@ struct unwind_hint { * * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that * sp_reg+sp_offset points to the iret return frame. + * + * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function. + * Useful for code which doesn't have an ELF function annotation. */ #define UNWIND_HINT_TYPE_CALL 0 #define UNWIND_HINT_TYPE_REGS 1 #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 -#define UNWIND_HINT_TYPE_RET_OFFSET 3 +#define UNWIND_HINT_TYPE_FUNC 3 #ifdef CONFIG_STACK_VALIDATION @@ -109,6 +112,12 @@ struct unwind_hint { .popsection .endm +.macro STACK_FRAME_NON_STANDARD func:req + .pushsection .discard.func_stack_frame_non_standard, "aw" + .long \func - . + .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_STACK_VALIDATION */ @@ -122,6 +131,8 @@ struct unwind_hint { #define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 .endm +.macro STACK_FRAME_NON_STANDARD func:req +.endm #endif #endif /* CONFIG_STACK_VALIDATION */ diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 728752917785..ce58cff99b66 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -861,9 +861,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) __SYSCALL(__NR_process_madvise, sys_process_madvise) #define __NR_epoll_pwait2 441 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) +#define __NR_mount_setattr 442 +__SYSCALL(__NR_mount_setattr, sys_mount_setattr) #undef __NR_syscalls -#define __NR_syscalls 442 +#define __NR_syscalls 443 /* * 32 bit systems traditionally used different diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 808b48a93330..0827037c5484 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -1,11 +1,10 @@ -/** - * \file drm.h +/* * Header for the Direct Rendering Manager * - * \author Rickard E. (Rik) Faith <faith@valinux.com> + * Author: Rickard E. (Rik) Faith <faith@valinux.com> * - * \par Acknowledgments: - * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic \c cmpxchg. + * Acknowledgments: + * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic cmpxchg. */ /* @@ -85,7 +84,7 @@ typedef unsigned int drm_context_t; typedef unsigned int drm_drawable_t; typedef unsigned int drm_magic_t; -/** +/* * Cliprect. * * \warning: If you change this structure, make sure you change @@ -101,7 +100,7 @@ struct drm_clip_rect { unsigned short y2; }; -/** +/* * Drawable information. */ struct drm_drawable_info { @@ -109,7 +108,7 @@ struct drm_drawable_info { struct drm_clip_rect *rects; }; -/** +/* * Texture region, */ struct drm_tex_region { @@ -120,7 +119,7 @@ struct drm_tex_region { unsigned int age; }; -/** +/* * Hardware lock. * * The lock structure is a simple cache-line aligned integer. To avoid @@ -132,7 +131,7 @@ struct drm_hw_lock { char padding[60]; /**< Pad to cache line */ }; -/** +/* * DRM_IOCTL_VERSION ioctl argument type. * * \sa drmGetVersion(). @@ -149,7 +148,7 @@ struct drm_version { char __user *desc; /**< User-space buffer to hold desc */ }; -/** +/* * DRM_IOCTL_GET_UNIQUE ioctl argument type. * * \sa drmGetBusid() and drmSetBusId(). @@ -168,7 +167,7 @@ struct drm_block { int unused; }; -/** +/* * DRM_IOCTL_CONTROL ioctl argument type. * * \sa drmCtlInstHandler() and drmCtlUninstHandler(). @@ -183,7 +182,7 @@ struct drm_control { int irq; }; -/** +/* * Type of memory to map. */ enum drm_map_type { @@ -195,7 +194,7 @@ enum drm_map_type { _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ }; -/** +/* * Memory mapping flags. */ enum drm_map_flags { @@ -214,7 +213,7 @@ struct drm_ctx_priv_map { void *handle; /**< Handle of map */ }; -/** +/* * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls * argument type. * @@ -231,7 +230,7 @@ struct drm_map { /* Private data */ }; -/** +/* * DRM_IOCTL_GET_CLIENT ioctl argument type. */ struct drm_client { @@ -263,7 +262,7 @@ enum drm_stat_type { /* Add to the *END* of the list */ }; -/** +/* * DRM_IOCTL_GET_STATS ioctl argument type. */ struct drm_stats { @@ -274,7 +273,7 @@ struct drm_stats { } data[15]; }; -/** +/* * Hardware locking flags. */ enum drm_lock_flags { @@ -289,7 +288,7 @@ enum drm_lock_flags { _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ }; -/** +/* * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. * * \sa drmGetLock() and drmUnlock(). @@ -299,7 +298,7 @@ struct drm_lock { enum drm_lock_flags flags; }; -/** +/* * DMA flags * * \warning @@ -328,7 +327,7 @@ enum drm_dma_flags { _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ }; -/** +/* * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. * * \sa drmAddBufs(). @@ -351,7 +350,7 @@ struct drm_buf_desc { */ }; -/** +/* * DRM_IOCTL_INFO_BUFS ioctl argument type. */ struct drm_buf_info { @@ -359,7 +358,7 @@ struct drm_buf_info { struct drm_buf_desc __user *list; }; -/** +/* * DRM_IOCTL_FREE_BUFS ioctl argument type. */ struct drm_buf_free { @@ -367,7 +366,7 @@ struct drm_buf_free { int __user *list; }; -/** +/* * Buffer information * * \sa drm_buf_map. @@ -379,7 +378,7 @@ struct drm_buf_pub { void __user *address; /**< Address of buffer */ }; -/** +/* * DRM_IOCTL_MAP_BUFS ioctl argument type. */ struct drm_buf_map { @@ -392,7 +391,7 @@ struct drm_buf_map { struct drm_buf_pub __user *list; /**< Buffer information */ }; -/** +/* * DRM_IOCTL_DMA ioctl argument type. * * Indices here refer to the offset into the buffer list in drm_buf_get. @@ -417,7 +416,7 @@ enum drm_ctx_flags { _DRM_CONTEXT_2DONLY = 0x02 }; -/** +/* * DRM_IOCTL_ADD_CTX ioctl argument type. * * \sa drmCreateContext() and drmDestroyContext(). @@ -427,7 +426,7 @@ struct drm_ctx { enum drm_ctx_flags flags; }; -/** +/* * DRM_IOCTL_RES_CTX ioctl argument type. */ struct drm_ctx_res { @@ -435,14 +434,14 @@ struct drm_ctx_res { struct drm_ctx __user *contexts; }; -/** +/* * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. */ struct drm_draw { drm_drawable_t handle; }; -/** +/* * DRM_IOCTL_UPDATE_DRAW ioctl argument type. */ typedef enum { @@ -456,14 +455,14 @@ struct drm_update_draw { unsigned long long data; }; -/** +/* * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. */ struct drm_auth { drm_magic_t magic; }; -/** +/* * DRM_IOCTL_IRQ_BUSID ioctl argument type. * * \sa drmGetInterruptFromBusID(). @@ -505,7 +504,7 @@ struct drm_wait_vblank_reply { long tval_usec; }; -/** +/* * DRM_IOCTL_WAIT_VBLANK ioctl argument type. * * \sa drmWaitVBlank(). @@ -518,7 +517,7 @@ union drm_wait_vblank { #define _DRM_PRE_MODESET 1 #define _DRM_POST_MODESET 2 -/** +/* * DRM_IOCTL_MODESET_CTL ioctl argument type * * \sa drmModesetCtl(). @@ -528,7 +527,7 @@ struct drm_modeset_ctl { __u32 cmd; }; -/** +/* * DRM_IOCTL_AGP_ENABLE ioctl argument type. * * \sa drmAgpEnable(). @@ -537,7 +536,7 @@ struct drm_agp_mode { unsigned long mode; /**< AGP mode */ }; -/** +/* * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. * * \sa drmAgpAlloc() and drmAgpFree(). @@ -549,7 +548,7 @@ struct drm_agp_buffer { unsigned long physical; /**< Physical used by i810 */ }; -/** +/* * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. * * \sa drmAgpBind() and drmAgpUnbind(). @@ -559,7 +558,7 @@ struct drm_agp_binding { unsigned long offset; /**< In bytes -- will round to page boundary */ }; -/** +/* * DRM_IOCTL_AGP_INFO ioctl argument type. * * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), @@ -580,7 +579,7 @@ struct drm_agp_info { unsigned short id_device; }; -/** +/* * DRM_IOCTL_SG_ALLOC ioctl argument type. */ struct drm_scatter_gather { @@ -588,7 +587,7 @@ struct drm_scatter_gather { unsigned long handle; /**< Used for mapping / unmapping */ }; -/** +/* * DRM_IOCTL_SET_VERSION ioctl argument type. */ struct drm_set_version { @@ -598,14 +597,14 @@ struct drm_set_version { int drm_dd_minor; }; -/** DRM_IOCTL_GEM_CLOSE ioctl argument type */ +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ struct drm_gem_close { /** Handle of the object to be closed. */ __u32 handle; __u32 pad; }; -/** DRM_IOCTL_GEM_FLINK ioctl argument type */ +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ struct drm_gem_flink { /** Handle for the object being named */ __u32 handle; @@ -614,7 +613,7 @@ struct drm_gem_flink { __u32 name; }; -/** DRM_IOCTL_GEM_OPEN ioctl argument type */ +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ struct drm_gem_open { /** Name of object being opened */ __u32 name; @@ -652,7 +651,7 @@ struct drm_gem_open { #define DRM_CAP_SYNCOBJ 0x13 #define DRM_CAP_SYNCOBJ_TIMELINE 0x14 -/** DRM_IOCTL_GET_CAP ioctl argument type */ +/* DRM_IOCTL_GET_CAP ioctl argument type */ struct drm_get_cap { __u64 capability; __u64 value; @@ -678,7 +677,9 @@ struct drm_get_cap { /** * DRM_CLIENT_CAP_ATOMIC * - * If set to 1, the DRM core will expose atomic properties to userspace + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. */ #define DRM_CLIENT_CAP_ATOMIC 3 @@ -698,7 +699,7 @@ struct drm_get_cap { */ #define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 -/** DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ struct drm_set_client_cap { __u64 capability; __u64 value; @@ -950,7 +951,7 @@ extern "C" { #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) -/** +/* * Device specific ioctls should only be in their respective headers * The device specific ioctl range is from 0x40 to 0x9f. * Generic IOCTLS restart at 0xA0. @@ -961,7 +962,7 @@ extern "C" { #define DRM_COMMAND_BASE 0x40 #define DRM_COMMAND_END 0xA0 -/** +/* * Header for events written back to userspace on the drm fd. The * type defines the type of event, the length specifies the total * length of the event (including the header), and user_data is diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index fa1f3d62f9a6..1987e2ea79a3 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -177,8 +177,9 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) +#define I915_PMU_SOFTWARE_GT_AWAKE_TIME __I915_PMU_OTHER(4) -#define I915_PMU_LAST I915_PMU_RC6_RESIDENCY +#define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY /* Each region is a minimum of 16k, and there are at most 255 of them. */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index abb89bbe5635..f6afee209620 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -216,6 +216,20 @@ struct kvm_hyperv_exit { } u; }; +struct kvm_xen_exit { +#define KVM_EXIT_XEN_HCALL 1 + __u32 type; + union { + struct { + __u32 longmode; + __u32 cpl; + __u64 input; + __u64 result; + __u64 params[6]; + } hcall; + } u; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -252,6 +266,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_X86_WRMSR 30 #define KVM_EXIT_DIRTY_RING_FULL 31 #define KVM_EXIT_AP_RESET_HOLD 32 +#define KVM_EXIT_X86_BUS_LOCK 33 +#define KVM_EXIT_XEN 34 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -428,6 +444,8 @@ struct kvm_run { __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ } msr; + /* KVM_EXIT_XEN */ + struct kvm_xen_exit xen; /* Fix the size of the union. */ char padding[256]; }; @@ -1058,6 +1076,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 #define KVM_CAP_DIRTY_LOG_RING 192 +#define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_PPC_DAWR1 194 #ifdef KVM_CAP_IRQ_ROUTING @@ -1132,6 +1151,11 @@ struct kvm_x86_mce { #endif #ifdef KVM_CAP_XEN_HVM +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) + struct kvm_xen_hvm_config { __u32 flags; __u32 msr; @@ -1566,6 +1590,57 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_DIRTY_LOG_RING */ #define KVM_RESET_DIRTY_RINGS _IO(KVMIO, 0xc7) +/* Per-VM Xen attributes */ +#define KVM_XEN_HVM_GET_ATTR _IOWR(KVMIO, 0xc8, struct kvm_xen_hvm_attr) +#define KVM_XEN_HVM_SET_ATTR _IOW(KVMIO, 0xc9, struct kvm_xen_hvm_attr) + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + struct { + __u64 gfn; + } shared_info; + __u64 pad[8]; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 + +/* Per-vCPU Xen attributes */ +#define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) +#define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ @@ -1594,6 +1669,8 @@ enum sev_cmd_id { KVM_SEV_DBG_ENCRYPT, /* Guest certificates commands */ KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, KVM_SEV_NR_MAX, }; @@ -1646,6 +1723,12 @@ struct kvm_sev_dbg { __u32 len; }; +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) @@ -1767,4 +1850,7 @@ struct kvm_dirty_gfn { __u64 offset; }; +#define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) +#define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) + #endif /* __LINUX_KVM_H */ diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h index dd8306ea336c..e6524ead2b7b 100644 --- a/tools/include/uapi/linux/mount.h +++ b/tools/include/uapi/linux/mount.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_MOUNT_H #define _UAPI_LINUX_MOUNT_H +#include <linux/types.h> + /* * These are the fs-independent mount-flags: up to 32 flags are supported * @@ -117,5 +119,19 @@ enum fsconfig_command { #define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ #define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ #define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ +#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ + +/* + * mount_setattr() + */ +struct mount_attr { + __u64 attr_set; + __u64 attr_clr; + __u64 propagation; + __u64 userns_fd; +}; + +/* List of all mount_attr versions. */ +#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/include/uapi/linux/openat2.h b/tools/include/uapi/linux/openat2.h index 58b1eb711360..a5feb7604948 100644 --- a/tools/include/uapi/linux/openat2.h +++ b/tools/include/uapi/linux/openat2.h @@ -35,5 +35,9 @@ struct open_how { #define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." be scoped inside the dirfd (similar to chroot(2)). */ +#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be + completed through cached lookup. May + return -EAGAIN if that's not + possible. */ #endif /* _UAPI_LINUX_OPENAT2_H */ diff --git a/tools/kvm/kvm_stat/kvm_stat.service b/tools/kvm/kvm_stat/kvm_stat.service index 71aabaffe779..8f13b843d5b4 100644 --- a/tools/kvm/kvm_stat/kvm_stat.service +++ b/tools/kvm/kvm_stat/kvm_stat.service @@ -9,6 +9,7 @@ Type=simple ExecStart=/usr/bin/kvm_stat -dtcz -s 10 -L /var/log/kvm_stat.csv ExecReload=/bin/kill -HUP $MAINPID Restart=always +RestartSec=60s SyslogIdentifier=kvm_stat SyslogLevel=debug diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..e9eb6a6e80d2 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -215,7 +215,7 @@ define do_install if [ ! -d '$(DESTDIR_SQ)$2' ]; then \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ fi; \ - $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2' + $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2' endef install_lib: all_cmd diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 2f9d685bd522..0911aea4cdbe 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -462,7 +462,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) return err; case BTF_KIND_ARRAY: - return btf_dump_order_type(d, btf_array(t)->type, through_ptr); + return btf_dump_order_type(d, btf_array(t)->type, false); case BTF_KIND_STRUCT: case BTF_KIND_UNION: { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d43cc3f29dae..4181d178ee7b 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1181,7 +1181,8 @@ static int bpf_object__elf_init(struct bpf_object *obj) if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) { pr_warn("elf: failed to get section names strings from %s: %s\n", obj->path, elf_errmsg(-1)); - return -LIBBPF_ERRNO__FORMAT; + err = -LIBBPF_ERRNO__FORMAT; + goto errout; } /* Old LLVM set e_machine to EM_NONE */ diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 4dd73de00b6f..d2cb28e9ef52 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -40,7 +40,7 @@ static int libbpf_netlink_open(__u32 *nl_pid) memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (sock < 0) return -errno; diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index ffbb588724d8..526fc35c0b23 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -610,15 +610,16 @@ static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) if (fd < 0) continue; + memset(&map_info, 0, map_len); err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); if (err) { close(fd); continue; } - if (!strcmp(map_info.name, "xsks_map")) { + if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) { ctx->xsks_map_fd = fd; - continue; + break; } close(fd); diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c index 17465d454a0e..a0aaf385cbb5 100644 --- a/tools/lib/perf/evlist.c +++ b/tools/lib/perf/evlist.c @@ -26,13 +26,10 @@ void perf_evlist__init(struct perf_evlist *evlist) { - int i; - - for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) - INIT_HLIST_HEAD(&evlist->heads[i]); INIT_LIST_HEAD(&evlist->entries); evlist->nr_entries = 0; fdarray__init(&evlist->pollfd, 64); + perf_evlist__reset_id_hash(evlist); } static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, @@ -237,6 +234,14 @@ static void perf_evlist__id_hash(struct perf_evlist *evlist, hlist_add_head(&sid->node, &evlist->heads[hash]); } +void perf_evlist__reset_id_hash(struct perf_evlist *evlist) +{ + int i; + + for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i) + INIT_HLIST_HEAD(&evlist->heads[i]); +} + void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu, int thread, u64 id) diff --git a/tools/lib/perf/include/internal/evlist.h b/tools/lib/perf/include/internal/evlist.h index 2d0fa02b036f..212c29063ad4 100644 --- a/tools/lib/perf/include/internal/evlist.h +++ b/tools/lib/perf/include/internal/evlist.h @@ -124,4 +124,6 @@ int perf_evlist__id_add_fd(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu, int thread, int fd); +void perf_evlist__reset_id_hash(struct perf_evlist *evlist); + #endif /* __LIBPERF_INTERNAL_EVLIST_H */ diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index 45cefda24c7b..14236db3677f 100644 --- a/tools/objtool/.gitignore +++ b/tools/objtool/.gitignore @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only arch/x86/lib/inat-tables.c -objtool +/objtool fixdep diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt index 0542e46c7552..30f38fdc0d56 100644 --- a/tools/objtool/Documentation/stack-validation.txt +++ b/tools/objtool/Documentation/stack-validation.txt @@ -315,13 +315,15 @@ they mean, and suggestions for how to fix them. function tracing inserts additional calls, which is not obvious from the sources). -10. file.o: warning: func()+0x5c: alternative modifies stack - - This means that an alternative includes instructions that modify the - stack. The problem is that there is only one ORC unwind table, this means - that the ORC unwind entries must be valid for each of the alternatives. - The easiest way to enforce this is to ensure alternatives do not contain - any ORC entries, which in turn implies the above constraint. +10. file.o: warning: func()+0x5c: stack layout conflict in alternatives + + This means that in the use of the alternative() or ALTERNATIVE() + macro, the code paths have conflicting modifications to the stack. + The problem is that there is only one ORC unwind table, which means + that the ORC unwind entries must be consistent for all possible + instruction boundaries regardless of which code has been patched. + This limitation can be overcome by massaging the alternatives with + NOPs to shift the stack changes around so they no longer conflict. 11. file.o: warning: unannotated intra-function call diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 5cdb19036d7f..92ce4fce7bc7 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -27,6 +27,7 @@ all: $(OBJTOOL) INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ -I$(srctree)/tools/arch/$(SRCARCH)/include \ + -I$(srctree)/tools/objtool/include \ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) @@ -46,10 +47,6 @@ ifeq ($(SRCARCH),x86) SUBCMD_ORC := y endif -ifeq ($(SUBCMD_ORC),y) - CFLAGS += -DINSN_USE_ORC -endif - export SUBCMD_CHECK SUBCMD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index cde9c36e40ae..549813cff8ab 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -11,11 +11,11 @@ #include "../../../arch/x86/lib/inat.c" #include "../../../arch/x86/lib/insn.c" -#include "../../check.h" -#include "../../elf.h" -#include "../../arch.h" -#include "../../warn.h" #include <asm/orc_types.h> +#include <objtool/check.h> +#include <objtool/elf.h> +#include <objtool/arch.h> +#include <objtool/warn.h> static unsigned char op_to_cfi_reg[][2] = { {CFI_AX, CFI_R8}, @@ -222,15 +222,38 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, break; case 0x89: - if (rex_w && !rex_r && modrm_mod == 3 && modrm_reg == 4) { + if (rex_w && !rex_r && modrm_reg == 4) { - /* mov %rsp, reg */ - ADD_OP(op) { - op->src.type = OP_SRC_REG; - op->src.reg = CFI_SP; - op->dest.type = OP_DEST_REG; - op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + if (modrm_mod == 3) { + /* mov %rsp, reg */ + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG; + op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + } + break; + + } else { + /* skip nontrivial SIB */ + if (modrm_rm == 4 && !(sib == 0x24 && rex_b == rex_x)) + break; + + /* skip RIP relative displacement */ + if (modrm_rm == 5 && modrm_mod == 0) + break; + + /* mov %rsp, disp(%reg) */ + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = CFI_SP; + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; + op->dest.offset = insn.displacement.value; + } + break; } + break; } @@ -259,8 +282,10 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_BP; op->dest.offset = insn.displacement.value; } + break; + } - } else if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { + if (rex_w && !rex_b && modrm_rm == 4 && sib == 0x24) { /* mov reg, disp(%rsp) */ ADD_OP(op) { @@ -270,6 +295,7 @@ int arch_decode_instruction(const struct elf *elf, const struct section *sec, op->dest.reg = CFI_SP; op->dest.offset = insn.displacement.value; } + break; } break; @@ -563,8 +589,8 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) state->cfa.offset = 8; /* initial RA (return address) */ - state->regs[16].base = CFI_CFA; - state->regs[16].offset = -8; + state->regs[CFI_RA].base = CFI_CFA; + state->regs[CFI_RA].offset = -8; } const char *arch_nop_insn(int len) diff --git a/tools/objtool/arch/x86/include/cfi_regs.h b/tools/objtool/arch/x86/include/arch/cfi_regs.h index 79bc517efba8..79bc517efba8 100644 --- a/tools/objtool/arch/x86/include/cfi_regs.h +++ b/tools/objtool/arch/x86/include/arch/cfi_regs.h diff --git a/tools/objtool/arch/x86/include/arch_elf.h b/tools/objtool/arch/x86/include/arch/elf.h index 69cc4264b28a..69cc4264b28a 100644 --- a/tools/objtool/arch/x86/include/arch_elf.h +++ b/tools/objtool/arch/x86/include/arch/elf.h diff --git a/tools/objtool/arch/x86/include/arch/endianness.h b/tools/objtool/arch/x86/include/arch/endianness.h new file mode 100644 index 000000000000..7c362527da20 --- /dev/null +++ b/tools/objtool/arch/x86/include/arch/endianness.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ARCH_ENDIANNESS_H +#define _ARCH_ENDIANNESS_H + +#include <endian.h> + +#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN + +#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/arch/x86/include/arch_special.h b/tools/objtool/arch/x86/include/arch/special.h index d818b2bffa02..d818b2bffa02 100644 --- a/tools/objtool/arch/x86/include/arch_special.h +++ b/tools/objtool/arch/x86/include/arch/special.h diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index fd4af88c0ea5..e707d9bcd161 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <string.h> -#include "../../special.h" -#include "../../builtin.h" +#include <objtool/special.h> +#include <objtool/builtin.h> #define X86_FEATURE_POPCNT (4 * 32 + 23) #define X86_FEATURE_SMAP (9 * 32 + 20) @@ -48,7 +48,7 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, * replacement group. */ return insn->offset == special_alt->new_off && - (insn->type == INSN_CALL || is_static_jump(insn)); + (insn->type == INSN_CALL || is_jump(insn)); } /* diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index c6d199bfd0ae..c3a85d8f6c5c 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -15,10 +15,10 @@ #include <subcmd/parse-options.h> #include <string.h> -#include "builtin.h" -#include "objtool.h" +#include <objtool/builtin.h> +#include <objtool/objtool.h> -bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux; +bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr; static const char * const check_usage[] = { "objtool check [<options>] file.o", @@ -34,13 +34,15 @@ const struct option check_options[] = { OPT_BOOLEAN('a', "uaccess", &uaccess, "enable uaccess checking"), OPT_BOOLEAN('s', "stats", &stats, "print statistics"), OPT_BOOLEAN('d', "duplicate", &validate_dup, "duplicate validation for vmlinux.o"), + OPT_BOOLEAN('n', "noinstr", &noinstr, "noinstr validation for vmlinux.o"), OPT_BOOLEAN('l', "vmlinux", &vmlinux, "vmlinux.o validation"), + OPT_BOOLEAN('M', "mcount", &mcount, "generate __mcount_loc"), OPT_END(), }; int cmd_check(int argc, const char **argv) { - const char *objname, *s; + const char *objname; struct objtool_file *file; int ret; @@ -51,10 +53,6 @@ int cmd_check(int argc, const char **argv) objname = argv[0]; - s = strstr(objname, "vmlinux.o"); - if (s && !s[9]) - vmlinux = true; - file = objtool_open_read(objname); if (!file) return 1; diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index 7b31121fa60b..8273bbf7cebb 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -13,8 +13,8 @@ */ #include <string.h> -#include "builtin.h" -#include "objtool.h" +#include <objtool/builtin.h> +#include <objtool/objtool.h> static const char *orc_usage[] = { "objtool orc generate [<options>] file.o", @@ -51,11 +51,7 @@ int cmd_orc(int argc, const char **argv) if (list_empty(&file->insn_list)) return 0; - ret = create_orc(file); - if (ret) - return ret; - - ret = create_orc_sections(file); + ret = orc_create(file); if (ret) return ret; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f2e5e5ce1a05..5e5388a38e2a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -6,21 +6,20 @@ #include <string.h> #include <stdlib.h> -#include "builtin.h" -#include "cfi.h" -#include "arch.h" -#include "check.h" -#include "special.h" -#include "warn.h" -#include "arch_elf.h" +#include <arch/elf.h> +#include <objtool/builtin.h> +#include <objtool/cfi.h> +#include <objtool/arch.h> +#include <objtool/check.h> +#include <objtool/special.h> +#include <objtool/warn.h> +#include <objtool/endianness.h> #include <linux/objtool.h> #include <linux/hashtable.h> #include <linux/kernel.h> #include <linux/static_call_types.h> -#define FAKE_JUMP_OFFSET -1 - struct alternative { struct list_head list; struct instruction *insn; @@ -111,15 +110,20 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, static bool is_sibling_call(struct instruction *insn) { + /* + * Assume only ELF functions can make sibling calls. This ensures + * sibling call detection consistency between vmlinux.o and individual + * objects. + */ + if (!insn->func) + return false; + /* An indirect jump is either a sibling call or a jump to a table. */ if (insn->type == INSN_JUMP_DYNAMIC) return list_empty(&insn->alts); - if (!is_static_jump(insn)) - return false; - /* add_jump_destinations() sets insn->call_dest for sibling calls. */ - return !!insn->call_dest; + return (is_static_jump(insn) && insn->call_dest); } /* @@ -156,6 +160,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "machine_real_restart", "rewind_stack_do_exit", "kunit_try_catch_throw", + "xen_start_kernel", }; if (!func) @@ -244,7 +249,7 @@ static void init_insn_state(struct insn_state *state, struct section *sec) * not correctly determine insn->call_dest->sec (external symbols do * not have a section). */ - if (vmlinux && sec) + if (vmlinux && noinstr && sec) state->noinstr = sec->noinstr; } @@ -543,6 +548,78 @@ static int create_static_call_sections(struct objtool_file *file) return 0; } +static int create_mcount_loc_sections(struct objtool_file *file) +{ + struct section *sec, *reloc_sec; + struct reloc *reloc; + unsigned long *loc; + struct instruction *insn; + int idx; + + sec = find_section_by_name(file->elf, "__mcount_loc"); + if (sec) { + INIT_LIST_HEAD(&file->mcount_loc_list); + WARN("file already has __mcount_loc section, skipping"); + return 0; + } + + if (list_empty(&file->mcount_loc_list)) + return 0; + + idx = 0; + list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) + idx++; + + sec = elf_create_section(file->elf, "__mcount_loc", 0, sizeof(unsigned long), idx); + if (!sec) + return -1; + + reloc_sec = elf_create_reloc_section(file->elf, sec, SHT_RELA); + if (!reloc_sec) + return -1; + + idx = 0; + list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) { + + loc = (unsigned long *)sec->data->d_buf + idx; + memset(loc, 0, sizeof(unsigned long)); + + reloc = malloc(sizeof(*reloc)); + if (!reloc) { + perror("malloc"); + return -1; + } + memset(reloc, 0, sizeof(*reloc)); + + if (insn->sec->sym) { + reloc->sym = insn->sec->sym; + reloc->addend = insn->offset; + } else { + reloc->sym = find_symbol_containing(insn->sec, insn->offset); + + if (!reloc->sym) { + WARN("missing symbol for insn at offset 0x%lx\n", + insn->offset); + return -1; + } + + reloc->addend = insn->offset - reloc->sym->offset; + } + + reloc->type = R_X86_64_64; + reloc->offset = idx * sizeof(unsigned long); + reloc->sec = reloc_sec; + elf_add_reloc(file->elf, reloc); + + idx++; + } + + if (elf_rebuild_reloc_section(file->elf, reloc_sec)) + return -1; + + return 0; +} + /* * Warnings shouldn't be reported for ignored functions. */ @@ -589,7 +666,7 @@ static void add_ignores(struct objtool_file *file) static const char *uaccess_safe_builtin[] = { /* KASAN */ "kasan_report", - "check_memory_region", + "kasan_check_range", /* KASAN out-of-line */ "__asan_loadN_noabort", "__asan_load1_noabort", @@ -787,22 +864,16 @@ static int add_jump_destinations(struct objtool_file *file) if (!is_static_jump(insn)) continue; - if (insn->offset == FAKE_JUMP_OFFSET) - continue; - reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); + insn->offset, insn->len); if (!reloc) { dest_sec = insn->sec; dest_off = arch_jump_destination(insn); } else if (reloc->sym->type == STT_SECTION) { dest_sec = reloc->sym->sec; dest_off = arch_dest_reloc_offset(reloc->addend); - } else if (reloc->sym->sec->idx) { - dest_sec = reloc->sym->sec; - dest_off = reloc->sym->sym.st_value + - arch_dest_reloc_offset(reloc->addend); - } else if (strstr(reloc->sym->name, "_indirect_thunk_")) { + } else if (!strncmp(reloc->sym->name, "__x86_indirect_thunk_", 21) || + !strncmp(reloc->sym->name, "__x86_retpoline_", 16)) { /* * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. @@ -814,14 +885,21 @@ static int add_jump_destinations(struct objtool_file *file) insn->retpoline_safe = true; continue; - } else { - /* external sibling call */ + } else if (insn->func) { + /* internal or external sibling call (with reloc) */ insn->call_dest = reloc->sym; if (insn->call_dest->static_call_tramp) { list_add_tail(&insn->static_call_node, &file->static_call_list); } continue; + } else if (reloc->sym->sec->idx) { + dest_sec = reloc->sym->sec; + dest_off = reloc->sym->sym.st_value + + arch_dest_reloc_offset(reloc->addend); + } else { + /* non-func asm code jumping to another file */ + continue; } insn->jump_dest = find_insn(file, dest_sec, dest_off); @@ -862,15 +940,15 @@ static int add_jump_destinations(struct objtool_file *file) * case where the parent function's only reference to a * subfunction is through a jump table. */ - if (!strstr(insn->func->name, ".cold.") && - strstr(insn->jump_dest->func->name, ".cold.")) { + if (!strstr(insn->func->name, ".cold") && + strstr(insn->jump_dest->func->name, ".cold")) { insn->func->cfunc = insn->jump_dest->func; insn->jump_dest->func->pfunc = insn->func; } else if (insn->jump_dest->func->pfunc != insn->func->pfunc && insn->jump_dest->offset == insn->jump_dest->func->offset) { - /* internal sibling call */ + /* internal sibling call (without reloc) */ insn->call_dest = insn->jump_dest->func; if (insn->call_dest->static_call_tramp) { list_add_tail(&insn->static_call_node, @@ -969,6 +1047,22 @@ static int add_call_destinations(struct objtool_file *file) insn->type = INSN_NOP; } + if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) { + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + arch_nop_insn(insn->len)); + + insn->type = INSN_NOP; + + list_add_tail(&insn->mcount_loc_node, + &file->mcount_loc_list); + } + /* * Whatever stack impact regular CALLs have, should be undone * by the RETURN of the called function. @@ -983,73 +1077,83 @@ static int add_call_destinations(struct objtool_file *file) } /* - * The .alternatives section requires some extra special care, over and above - * what other special sections require: - * - * 1. Because alternatives are patched in-place, we need to insert a fake jump - * instruction at the end so that validate_branch() skips all the original - * replaced instructions when validating the new instruction path. - * - * 2. An added wrinkle is that the new instruction length might be zero. In - * that case the old instructions are replaced with noops. We simulate that - * by creating a fake jump as the only new instruction. - * - * 3. In some cases, the alternative section includes an instruction which - * conditionally jumps to the _end_ of the entry. We have to modify these - * jumps' destinations to point back to .text rather than the end of the - * entry in .altinstr_replacement. + * The .alternatives section requires some extra special care over and above + * other special sections because alternatives are patched in place. */ static int handle_group_alt(struct objtool_file *file, struct special_alt *special_alt, struct instruction *orig_insn, struct instruction **new_insn) { - static unsigned int alt_group_next_index = 1; - struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL; - unsigned int alt_group = alt_group_next_index++; + struct instruction *last_orig_insn, *last_new_insn = NULL, *insn, *nop = NULL; + struct alt_group *orig_alt_group, *new_alt_group; unsigned long dest_off; + + orig_alt_group = malloc(sizeof(*orig_alt_group)); + if (!orig_alt_group) { + WARN("malloc failed"); + return -1; + } + orig_alt_group->cfi = calloc(special_alt->orig_len, + sizeof(struct cfi_state *)); + if (!orig_alt_group->cfi) { + WARN("calloc failed"); + return -1; + } + last_orig_insn = NULL; insn = orig_insn; sec_for_each_insn_from(file, insn) { if (insn->offset >= special_alt->orig_off + special_alt->orig_len) break; - insn->alt_group = alt_group; + insn->alt_group = orig_alt_group; last_orig_insn = insn; } + orig_alt_group->orig_group = NULL; + orig_alt_group->first_insn = orig_insn; + orig_alt_group->last_insn = last_orig_insn; + + + new_alt_group = malloc(sizeof(*new_alt_group)); + if (!new_alt_group) { + WARN("malloc failed"); + return -1; + } - if (next_insn_same_sec(file, last_orig_insn)) { - fake_jump = malloc(sizeof(*fake_jump)); - if (!fake_jump) { + if (special_alt->new_len < special_alt->orig_len) { + /* + * Insert a fake nop at the end to make the replacement + * alt_group the same size as the original. This is needed to + * allow propagate_alt_cfi() to do its magic. When the last + * instruction affects the stack, the instruction after it (the + * nop) will propagate the new state to the shared CFI array. + */ + nop = malloc(sizeof(*nop)); + if (!nop) { WARN("malloc failed"); return -1; } - memset(fake_jump, 0, sizeof(*fake_jump)); - INIT_LIST_HEAD(&fake_jump->alts); - INIT_LIST_HEAD(&fake_jump->stack_ops); - init_cfi_state(&fake_jump->cfi); + memset(nop, 0, sizeof(*nop)); + INIT_LIST_HEAD(&nop->alts); + INIT_LIST_HEAD(&nop->stack_ops); + init_cfi_state(&nop->cfi); - fake_jump->sec = special_alt->new_sec; - fake_jump->offset = FAKE_JUMP_OFFSET; - fake_jump->type = INSN_JUMP_UNCONDITIONAL; - fake_jump->jump_dest = list_next_entry(last_orig_insn, list); - fake_jump->func = orig_insn->func; + nop->sec = special_alt->new_sec; + nop->offset = special_alt->new_off + special_alt->new_len; + nop->len = special_alt->orig_len - special_alt->new_len; + nop->type = INSN_NOP; + nop->func = orig_insn->func; + nop->alt_group = new_alt_group; + nop->ignore = orig_insn->ignore_alts; } if (!special_alt->new_len) { - if (!fake_jump) { - WARN("%s: empty alternative at end of section", - special_alt->orig_sec->name); - return -1; - } - - *new_insn = fake_jump; - return 0; + *new_insn = nop; + goto end; } - last_new_insn = NULL; - alt_group = alt_group_next_index++; insn = *new_insn; sec_for_each_insn_from(file, insn) { struct reloc *alt_reloc; @@ -1061,7 +1165,7 @@ static int handle_group_alt(struct objtool_file *file, insn->ignore = orig_insn->ignore_alts; insn->func = orig_insn->func; - insn->alt_group = alt_group; + insn->alt_group = new_alt_group; /* * Since alternative replacement code is copy/pasted by the @@ -1088,14 +1192,8 @@ static int handle_group_alt(struct objtool_file *file, continue; dest_off = arch_jump_destination(insn); - if (dest_off == special_alt->new_off + special_alt->new_len) { - if (!fake_jump) { - WARN("%s: alternative jump to end of section", - special_alt->orig_sec->name); - return -1; - } - insn->jump_dest = fake_jump; - } + if (dest_off == special_alt->new_off + special_alt->new_len) + insn->jump_dest = next_insn_same_sec(file, last_orig_insn); if (!insn->jump_dest) { WARN_FUNC("can't find alternative jump destination", @@ -1110,9 +1208,13 @@ static int handle_group_alt(struct objtool_file *file, return -1; } - if (fake_jump) - list_add(&fake_jump->list, &last_new_insn->list); - + if (nop) + list_add(&nop->list, &last_new_insn->list); +end: + new_alt_group->orig_group = orig_alt_group; + new_alt_group->first_insn = *new_insn; + new_alt_group->last_insn = nop ? : last_new_insn; + new_alt_group->cfi = orig_alt_group->cfi; return 0; } @@ -1404,13 +1506,20 @@ static int add_jump_table_alts(struct objtool_file *file) return 0; } +static void set_func_state(struct cfi_state *state) +{ + state->cfa = initial_func_cfi.cfa; + memcpy(&state->regs, &initial_func_cfi.regs, + CFI_NUM_REGS * sizeof(struct cfi_reg)); + state->stack_size = initial_func_cfi.cfa.offset; +} + static int read_unwind_hints(struct objtool_file *file) { struct section *sec, *relocsec; struct reloc *reloc; struct unwind_hint *hint; struct instruction *insn; - struct cfi_reg *cfa; int i; sec = find_section_by_name(file->elf, ".discard.unwind_hints"); @@ -1445,22 +1554,20 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - cfa = &insn->cfi.cfa; + insn->hint = true; - if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) { - insn->ret_offset = hint->sp_offset; + if (hint->type == UNWIND_HINT_TYPE_FUNC) { + set_func_state(&insn->cfi); continue; } - insn->hint = true; - if (arch_decode_hint_reg(insn, hint->sp_reg)) { WARN_FUNC("unsupported unwind_hint sp base reg %d", insn->sec, insn->offset, hint->sp_reg); return -1; } - cfa->offset = hint->sp_offset; + insn->cfi.cfa.offset = bswap_if_needed(hint->sp_offset); insn->cfi.type = hint->type; insn->cfi.end = hint->end; } @@ -1716,27 +1823,18 @@ static bool is_fentry_call(struct instruction *insn) static bool has_modified_stack_frame(struct instruction *insn, struct insn_state *state) { - u8 ret_offset = insn->ret_offset; struct cfi_state *cfi = &state->cfi; int i; if (cfi->cfa.base != initial_func_cfi.cfa.base || cfi->drap) return true; - if (cfi->cfa.offset != initial_func_cfi.cfa.offset + ret_offset) + if (cfi->cfa.offset != initial_func_cfi.cfa.offset) return true; - if (cfi->stack_size != initial_func_cfi.cfa.offset + ret_offset) + if (cfi->stack_size != initial_func_cfi.cfa.offset) return true; - /* - * If there is a ret offset hint then don't check registers - * because a callee-saved register might have been pushed on - * the stack. - */ - if (ret_offset) - return false; - for (i = 0; i < CFI_NUM_REGS; i++) { if (cfi->regs[i].base != initial_func_cfi.regs[i].base || cfi->regs[i].offset != initial_func_cfi.regs[i].offset) @@ -1746,12 +1844,20 @@ static bool has_modified_stack_frame(struct instruction *insn, struct insn_state return false; } +static bool check_reg_frame_pos(const struct cfi_reg *reg, + int expected_offset) +{ + return reg->base == CFI_CFA && + reg->offset == expected_offset; +} + static bool has_valid_stack_frame(struct insn_state *state) { struct cfi_state *cfi = &state->cfi; - if (cfi->cfa.base == CFI_BP && cfi->regs[CFI_BP].base == CFI_CFA && - cfi->regs[CFI_BP].offset == -16) + if (cfi->cfa.base == CFI_BP && + check_reg_frame_pos(&cfi->regs[CFI_BP], -cfi->cfa.offset) && + check_reg_frame_pos(&cfi->regs[CFI_RA], -cfi->cfa.offset + 8)) return true; if (cfi->drap && cfi->regs[CFI_BP].base == CFI_BP) @@ -1880,8 +1986,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, case OP_SRC_REG: if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP && cfa->base == CFI_SP && - regs[CFI_BP].base == CFI_CFA && - regs[CFI_BP].offset == -cfa->offset) { + check_reg_frame_pos(®s[CFI_BP], -cfa->offset)) { /* mov %rsp, %rbp */ cfa->base = op->dest.reg; @@ -1941,12 +2046,58 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, cfa->offset = -cfi->vals[op->src.reg].offset; cfi->stack_size = cfa->offset; + } else if (cfa->base == CFI_SP && + cfi->vals[op->src.reg].base == CFI_SP_INDIRECT && + cfi->vals[op->src.reg].offset == cfa->offset) { + + /* + * Stack swizzle: + * + * 1: mov %rsp, (%[tos]) + * 2: mov %[tos], %rsp + * ... + * 3: pop %rsp + * + * Where: + * + * 1 - places a pointer to the previous + * stack at the Top-of-Stack of the + * new stack. + * + * 2 - switches to the new stack. + * + * 3 - pops the Top-of-Stack to restore + * the original stack. + * + * Note: we set base to SP_INDIRECT + * here and preserve offset. Therefore + * when the unwinder reaches ToS it + * will dereference SP and then add the + * offset to find the next frame, IOW: + * (%rsp) + offset. + */ + cfa->base = CFI_SP_INDIRECT; + } else { cfa->base = CFI_UNDEFINED; cfa->offset = 0; } } + else if (op->dest.reg == CFI_SP && + cfi->vals[op->src.reg].base == CFI_SP_INDIRECT && + cfi->vals[op->src.reg].offset == cfa->offset) { + + /* + * The same stack swizzle case 2) as above. But + * because we can't change cfa->base, case 3) + * will become a regular POP. Pretend we're a + * PUSH so things don't go unbalanced. + */ + cfi->stack_size += 8; + } + + break; case OP_SRC_ADD: @@ -1966,6 +2117,17 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; } + if (!cfi->drap && op->src.reg == CFI_SP && + op->dest.reg == CFI_BP && cfa->base == CFI_SP && + check_reg_frame_pos(®s[CFI_BP], -cfa->offset + op->src.offset)) { + + /* lea disp(%rsp), %rbp */ + cfa->base = CFI_BP; + cfa->offset -= op->src.offset; + cfi->bp_scratch = false; + break; + } + if (op->src.reg == CFI_SP && cfa->base == CFI_SP) { /* drap: lea disp(%rsp), %drap */ @@ -2032,6 +2194,13 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, case OP_SRC_POP: case OP_SRC_POPF: + if (op->dest.reg == CFI_SP && cfa->base == CFI_SP_INDIRECT) { + + /* pop %rsp; # restore from a stack swizzle */ + cfa->base = CFI_SP; + break; + } + if (!cfi->drap && op->dest.reg == cfa->base) { /* pop %rbp */ @@ -2060,6 +2229,14 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, break; case OP_SRC_REG_INDIRECT: + if (!cfi->drap && op->dest.reg == cfa->base && + op->dest.reg == CFI_BP) { + + /* mov disp(%rsp), %rbp */ + cfa->base = CFI_SP; + cfa->offset = cfi->stack_size; + } + if (cfi->drap && op->src.reg == CFI_BP && op->src.offset == cfi->drap_offset) { @@ -2081,6 +2258,12 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* mov disp(%rbp), %reg */ /* mov disp(%rsp), %reg */ restore_reg(cfi, op->dest.reg); + + } else if (op->src.reg == CFI_SP && + op->src.offset == regs[op->dest.reg].offset + cfi->stack_size) { + + /* mov disp(%rsp), %reg */ + restore_reg(cfi, op->dest.reg); } break; @@ -2158,6 +2341,18 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* mov reg, disp(%rsp) */ save_reg(cfi, op->src.reg, CFI_CFA, op->dest.offset - cfi->cfa.offset); + + } else if (op->dest.reg == CFI_SP) { + + /* mov reg, disp(%rsp) */ + save_reg(cfi, op->src.reg, CFI_CFA, + op->dest.offset - cfi->stack_size); + + } else if (op->src.reg == CFI_SP && op->dest.offset == 0) { + + /* mov %rsp, (%reg); # setup a stack swizzle. */ + cfi->vals[op->dest.reg].base = CFI_SP_INDIRECT; + cfi->vals[op->dest.reg].offset = cfa->offset; } break; @@ -2205,22 +2400,50 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, return 0; } +/* + * The stack layouts of alternatives instructions can sometimes diverge when + * they have stack modifications. That's fine as long as the potential stack + * layouts don't conflict at any given potential instruction boundary. + * + * Flatten the CFIs of the different alternative code streams (both original + * and replacement) into a single shared CFI array which can be used to detect + * conflicts and nicely feed a linear array of ORC entries to the unwinder. + */ +static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn) +{ + struct cfi_state **alt_cfi; + int group_off; + + if (!insn->alt_group) + return 0; + + alt_cfi = insn->alt_group->cfi; + group_off = insn->offset - insn->alt_group->first_insn->offset; + + if (!alt_cfi[group_off]) { + alt_cfi[group_off] = &insn->cfi; + } else { + if (memcmp(alt_cfi[group_off], &insn->cfi, sizeof(struct cfi_state))) { + WARN_FUNC("stack layout conflict in alternatives", + insn->sec, insn->offset); + return -1; + } + } + + return 0; +} + static int handle_insn_ops(struct instruction *insn, struct insn_state *state) { struct stack_op *op; list_for_each_entry(op, &insn->stack_ops, list) { - struct cfi_state old_cfi = state->cfi; - int res; - res = update_cfi_state(insn, &state->cfi, op); - if (res) - return res; + if (update_cfi_state(insn, &state->cfi, op)) + return 1; - if (insn->alt_group && memcmp(&state->cfi, &old_cfi, sizeof(struct cfi_state))) { - WARN_FUNC("alternative modifies stack", insn->sec, insn->offset); - return -1; - } + if (!insn->alt_group) + continue; if (op->dest.type == OP_DEST_PUSHF) { if (!state->uaccess_stack) { @@ -2410,28 +2633,20 @@ static int validate_return(struct symbol *func, struct instruction *insn, struct return 0; } -/* - * Alternatives should not contain any ORC entries, this in turn means they - * should not contain any CFI ops, which implies all instructions should have - * the same same CFI state. - * - * It is possible to constuct alternatives that have unreachable holes that go - * unreported (because they're NOPs), such holes would result in CFI_UNDEFINED - * states which then results in ORC entries, which we just said we didn't want. - * - * Avoid them by copying the CFI entry of the first instruction into the whole - * alternative. - */ -static void fill_alternative_cfi(struct objtool_file *file, struct instruction *insn) +static struct instruction *next_insn_to_validate(struct objtool_file *file, + struct instruction *insn) { - struct instruction *first_insn = insn; - int alt_group = insn->alt_group; + struct alt_group *alt_group = insn->alt_group; - sec_for_each_insn_continue(file, insn) { - if (insn->alt_group != alt_group) - break; - insn->cfi = first_insn->cfi; - } + /* + * Simulate the fact that alternatives are patched in-place. When the + * end of a replacement alt_group is reached, redirect objtool flow to + * the end of the original alt_group. + */ + if (alt_group && insn == alt_group->last_insn && alt_group->orig_group) + return next_insn_same_sec(file, alt_group->orig_group->last_insn); + + return next_insn_same_sec(file, insn); } /* @@ -2452,7 +2667,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, sec = insn->sec; while (1) { - next_insn = next_insn_same_sec(file, insn); + next_insn = next_insn_to_validate(file, insn); if (file->c_file && func && insn->func && func != insn->func->pfunc) { WARN("%s() falls through to next function %s()", @@ -2485,6 +2700,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, insn->visited |= visited; + if (propagate_alt_cfi(file, insn)) + return 1; + if (!insn->ignore_alts && !list_empty(&insn->alts)) { bool skip_orig = false; @@ -2500,9 +2718,6 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, } } - if (insn->alt_group) - fill_alternative_cfi(file, insn); - if (skip_orig) return 0; } @@ -2540,7 +2755,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_JUMP_CONDITIONAL: case INSN_JUMP_UNCONDITIONAL: - if (func && is_sibling_call(insn)) { + if (is_sibling_call(insn)) { ret = validate_sibling_call(insn, &state); if (ret) return ret; @@ -2562,7 +2777,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_JUMP_DYNAMIC: case INSN_JUMP_DYNAMIC_CONDITIONAL: - if (func && is_sibling_call(insn)) { + if (is_sibling_call(insn)) { ret = validate_sibling_call(insn, &state); if (ret) return ret; @@ -2605,15 +2820,19 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_STD: - if (state.df) + if (state.df) { WARN_FUNC("recursive STD", sec, insn->offset); + return 1; + } state.df = true; break; case INSN_CLD: - if (!state.df && func) + if (!state.df && func) { WARN_FUNC("redundant CLD", sec, insn->offset); + return 1; + } state.df = false; break; @@ -2736,9 +2955,6 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio !strcmp(insn->sec->name, ".altinstr_aux")) return true; - if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->offset == FAKE_JUMP_OFFSET) - return true; - if (!insn->func) return false; @@ -2824,10 +3040,7 @@ static int validate_section(struct objtool_file *file, struct section *sec) continue; init_insn_state(&state, sec); - state.cfi.cfa = initial_func_cfi.cfa; - memcpy(&state.cfi.regs, &initial_func_cfi.regs, - CFI_NUM_REGS * sizeof(struct cfi_reg)); - state.cfi.stack_size = initial_func_cfi.cfa.offset; + set_func_state(&state.cfi); warnings += validate_symbol(file, sec, func, &state); } @@ -2940,6 +3153,13 @@ int check(struct objtool_file *file) goto out; warnings += ret; + if (mcount) { + ret = create_mcount_loc_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + out: /* * For now, don't fail the kernel build on fatal warnings. These diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index e85988ce04f1..93fa833a49a5 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -15,10 +15,10 @@ #include <string.h> #include <unistd.h> #include <errno.h> -#include "builtin.h" +#include <objtool/builtin.h> -#include "elf.h" -#include "warn.h" +#include <objtool/elf.h> +#include <objtool/warn.h> #define MAX_NAME_LEN 128 @@ -814,25 +814,27 @@ static int elf_rebuild_rel_reloc_section(struct section *sec, int nr) { struct reloc *reloc; int idx = 0, size; - GElf_Rel *relocs; + void *buf; /* Allocate a buffer for relocations */ - size = nr * sizeof(*relocs); - relocs = malloc(size); - if (!relocs) { + size = nr * sizeof(GElf_Rel); + buf = malloc(size); + if (!buf) { perror("malloc"); return -1; } - sec->data->d_buf = relocs; + sec->data->d_buf = buf; sec->data->d_size = size; + sec->data->d_type = ELF_T_REL; sec->sh.sh_size = size; idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { - relocs[idx].r_offset = reloc->offset; - relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + reloc->rel.r_offset = reloc->offset; + reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + gelf_update_rel(sec->data, idx, &reloc->rel); idx++; } @@ -843,26 +845,28 @@ static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) { struct reloc *reloc; int idx = 0, size; - GElf_Rela *relocs; + void *buf; /* Allocate a buffer for relocations with addends */ - size = nr * sizeof(*relocs); - relocs = malloc(size); - if (!relocs) { + size = nr * sizeof(GElf_Rela); + buf = malloc(size); + if (!buf) { perror("malloc"); return -1; } - sec->data->d_buf = relocs; + sec->data->d_buf = buf; sec->data->d_size = size; + sec->data->d_type = ELF_T_RELA; sec->sh.sh_size = size; idx = 0; list_for_each_entry(reloc, &sec->reloc_list, list) { - relocs[idx].r_offset = reloc->offset; - relocs[idx].r_addend = reloc->addend; - relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + reloc->rela.r_offset = reloc->offset; + reloc->rela.r_addend = reloc->addend; + reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + gelf_update_rela(sec->data, idx, &reloc->rela); idx++; } diff --git a/tools/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 4a84c3081b8e..6ff0685f5cc5 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -8,12 +8,8 @@ #include <stdbool.h> #include <linux/list.h> -#include "objtool.h" -#include "cfi.h" - -#ifdef INSN_USE_ORC -#include <asm/orc_types.h> -#endif +#include <objtool/objtool.h> +#include <objtool/cfi.h> enum insn_type { INSN_JUMP_CONDITIONAL, diff --git a/tools/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 85c979caa367..2502bb27de17 100644 --- a/tools/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -8,7 +8,7 @@ #include <subcmd/parse-options.h> extern const struct option check_options[]; -extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux; +extern bool no_fp, no_unreachable, retpoline, module, backtrace, uaccess, stats, validate_dup, vmlinux, mcount, noinstr; extern int cmd_check(int argc, const char **argv); extern int cmd_orc(int argc, const char **argv); diff --git a/tools/objtool/cfi.h b/tools/objtool/include/objtool/cfi.h index c7c59c6a44ee..fd5cb0bed9bf 100644 --- a/tools/objtool/cfi.h +++ b/tools/objtool/include/objtool/cfi.h @@ -6,7 +6,7 @@ #ifndef _OBJTOOL_CFI_H #define _OBJTOOL_CFI_H -#include "cfi_regs.h" +#include <arch/cfi_regs.h> #define CFI_UNDEFINED -1 #define CFI_CFA -2 diff --git a/tools/objtool/check.h b/tools/objtool/include/objtool/check.h index 5ec00a4b891b..f5be798107bc 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -7,8 +7,8 @@ #define _CHECK_H #include <stdbool.h> -#include "cfi.h" -#include "arch.h" +#include <objtool/cfi.h> +#include <objtool/arch.h> struct insn_state { struct cfi_state cfi; @@ -19,10 +19,28 @@ struct insn_state { s8 instr; }; +struct alt_group { + /* + * Pointer from a replacement group to the original group. NULL if it + * *is* the original group. + */ + struct alt_group *orig_group; + + /* First and last instructions in the group */ + struct instruction *first_insn, *last_insn; + + /* + * Byte-offset-addressed len-sized array of pointers to CFI structs. + * This is shared with the other alt_groups in the same alternative. + */ + struct cfi_state **cfi; +}; + struct instruction { struct list_head list; struct hlist_node hash; struct list_head static_call_node; + struct list_head mcount_loc_node; struct section *sec; unsigned long offset; unsigned int len; @@ -33,8 +51,7 @@ struct instruction { bool retpoline_safe; s8 instr; u8 visited; - u8 ret_offset; - int alt_group; + struct alt_group *alt_group; struct symbol *call_dest; struct instruction *jump_dest; struct instruction *first_jump_src; @@ -43,9 +60,6 @@ struct instruction { struct symbol *func; struct list_head stack_ops; struct cfi_state cfi; -#ifdef INSN_USE_ORC - struct orc_entry orc; -#endif }; static inline bool is_static_jump(struct instruction *insn) @@ -54,6 +68,17 @@ static inline bool is_static_jump(struct instruction *insn) insn->type == INSN_JUMP_UNCONDITIONAL; } +static inline bool is_dynamic_jump(struct instruction *insn) +{ + return insn->type == INSN_JUMP_DYNAMIC || + insn->type == INSN_JUMP_DYNAMIC_CONDITIONAL; +} + +static inline bool is_jump(struct instruction *insn) +{ + return is_static_jump(insn) || is_dynamic_jump(insn); +} + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset); diff --git a/tools/objtool/elf.h b/tools/objtool/include/objtool/elf.h index e6890cc70a25..e6890cc70a25 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h diff --git a/tools/objtool/include/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h new file mode 100644 index 000000000000..10241341eff3 --- /dev/null +++ b/tools/objtool/include/objtool/endianness.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_ENDIANNESS_H +#define _OBJTOOL_ENDIANNESS_H + +#include <arch/endianness.h> +#include <linux/kernel.h> +#include <endian.h> + +#ifndef __TARGET_BYTE_ORDER +#error undefined arch __TARGET_BYTE_ORDER +#endif + +#if __BYTE_ORDER != __TARGET_BYTE_ORDER +#define __NEED_BSWAP 1 +#else +#define __NEED_BSWAP 0 +#endif + +/* + * Does a byte swap if target endianness doesn't match the host, i.e. cross + * compilation for little endian on big endian and vice versa. + * To be used for multi-byte values conversion, which are read from / about + * to be written to a target native endianness ELF file. + */ +#define bswap_if_needed(val) \ +({ \ + __typeof__(val) __ret; \ + switch (sizeof(val)) { \ + case 8: __ret = __NEED_BSWAP ? bswap_64(val) : (val); break; \ + case 4: __ret = __NEED_BSWAP ? bswap_32(val) : (val); break; \ + case 2: __ret = __NEED_BSWAP ? bswap_16(val) : (val); break; \ + default: \ + BUILD_BUG(); break; \ + } \ + __ret; \ +}) + +#endif /* _OBJTOOL_ENDIANNESS_H */ diff --git a/tools/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index 4125d4578b23..e68e37476c15 100644 --- a/tools/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -10,7 +10,7 @@ #include <linux/list.h> #include <linux/hashtable.h> -#include "elf.h" +#include <objtool/elf.h> #define __weak __attribute__((weak)) @@ -19,6 +19,7 @@ struct objtool_file { struct list_head insn_list; DECLARE_HASHTABLE(insn_hash, 20); struct list_head static_call_list; + struct list_head mcount_loc_list; bool ignore_unreachables, c_file, hints, rodata; }; @@ -26,7 +27,6 @@ struct objtool_file *objtool_open_read(const char *_objname); int check(struct objtool_file *file); int orc_dump(const char *objname); -int create_orc(struct objtool_file *file); -int create_orc_sections(struct objtool_file *file); +int orc_create(struct objtool_file *file); #endif /* _OBJTOOL_H */ diff --git a/tools/objtool/special.h b/tools/objtool/include/objtool/special.h index abddf38ef334..8a09f4e9d480 100644 --- a/tools/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -7,8 +7,8 @@ #define _SPECIAL_H #include <stdbool.h> -#include "check.h" -#include "elf.h" +#include <objtool/check.h> +#include <objtool/elf.h> #define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" diff --git a/tools/objtool/warn.h b/tools/objtool/include/objtool/warn.h index 7799f60de80a..d99c4675e4a5 100644 --- a/tools/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -11,7 +11,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> -#include "elf.h" +#include <objtool/elf.h> extern const char *objname; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 9df0cd86d310..7b97ce499405 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -21,9 +21,9 @@ #include <subcmd/pager.h> #include <linux/kernel.h> -#include "builtin.h" -#include "objtool.h" -#include "warn.h" +#include <objtool/builtin.h> +#include <objtool/objtool.h> +#include <objtool/warn.h> struct cmd_struct { const char *name; @@ -62,6 +62,7 @@ struct objtool_file *objtool_open_read(const char *_objname) INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); INIT_LIST_HEAD(&file.static_call_list); + INIT_LIST_HEAD(&file.mcount_loc_list); file.c_file = !vmlinux && find_section_by_name(file.elf, ".comment"); file.ignore_unreachables = no_unreachable; file.hints = false; diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index 5e6a95368d35..f5a8508c42d6 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -6,8 +6,9 @@ #include <unistd.h> #include <linux/objtool.h> #include <asm/orc_types.h> -#include "objtool.h" -#include "warn.h" +#include <objtool/objtool.h> +#include <objtool/warn.h> +#include <objtool/endianness.h> static const char *reg_name(unsigned int reg) { @@ -54,7 +55,7 @@ static void print_reg(unsigned int reg, int offset) if (reg == ORC_REG_BP_INDIRECT) printf("(bp%+d)", offset); else if (reg == ORC_REG_SP_INDIRECT) - printf("(sp%+d)", offset); + printf("(sp)%+d", offset); else if (reg == ORC_REG_UNDEFINED) printf("(und)"); else @@ -197,11 +198,11 @@ int orc_dump(const char *_objname) printf(" sp:"); - print_reg(orc[i].sp_reg, orc[i].sp_offset); + print_reg(orc[i].sp_reg, bswap_if_needed(orc[i].sp_offset)); printf(" bp:"); - print_reg(orc[i].bp_reg, orc[i].bp_offset); + print_reg(orc[i].bp_reg, bswap_if_needed(orc[i].bp_offset)); printf(" type:%s end:%d\n", orc_type_name(orc[i].type), orc[i].end); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 9ce68b385a1b..738aa5021bc4 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -9,93 +9,91 @@ #include <linux/objtool.h> #include <asm/orc_types.h> -#include "check.h" -#include "warn.h" +#include <objtool/check.h> +#include <objtool/warn.h> +#include <objtool/endianness.h> -int create_orc(struct objtool_file *file) +static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi) { - struct instruction *insn; + struct instruction *insn = container_of(cfi, struct instruction, cfi); + struct cfi_reg *bp = &cfi->regs[CFI_BP]; - for_each_insn(file, insn) { - struct orc_entry *orc = &insn->orc; - struct cfi_reg *cfa = &insn->cfi.cfa; - struct cfi_reg *bp = &insn->cfi.regs[CFI_BP]; + memset(orc, 0, sizeof(*orc)); - if (!insn->sec->text) - continue; - - orc->end = insn->cfi.end; - - if (cfa->base == CFI_UNDEFINED) { - orc->sp_reg = ORC_REG_UNDEFINED; - continue; - } + orc->end = cfi->end; - switch (cfa->base) { - case CFI_SP: - orc->sp_reg = ORC_REG_SP; - break; - case CFI_SP_INDIRECT: - orc->sp_reg = ORC_REG_SP_INDIRECT; - break; - case CFI_BP: - orc->sp_reg = ORC_REG_BP; - break; - case CFI_BP_INDIRECT: - orc->sp_reg = ORC_REG_BP_INDIRECT; - break; - case CFI_R10: - orc->sp_reg = ORC_REG_R10; - break; - case CFI_R13: - orc->sp_reg = ORC_REG_R13; - break; - case CFI_DI: - orc->sp_reg = ORC_REG_DI; - break; - case CFI_DX: - orc->sp_reg = ORC_REG_DX; - break; - default: - WARN_FUNC("unknown CFA base reg %d", - insn->sec, insn->offset, cfa->base); - return -1; - } + if (cfi->cfa.base == CFI_UNDEFINED) { + orc->sp_reg = ORC_REG_UNDEFINED; + return 0; + } - switch(bp->base) { - case CFI_UNDEFINED: - orc->bp_reg = ORC_REG_UNDEFINED; - break; - case CFI_CFA: - orc->bp_reg = ORC_REG_PREV_SP; - break; - case CFI_BP: - orc->bp_reg = ORC_REG_BP; - break; - default: - WARN_FUNC("unknown BP base reg %d", - insn->sec, insn->offset, bp->base); - return -1; - } + switch (cfi->cfa.base) { + case CFI_SP: + orc->sp_reg = ORC_REG_SP; + break; + case CFI_SP_INDIRECT: + orc->sp_reg = ORC_REG_SP_INDIRECT; + break; + case CFI_BP: + orc->sp_reg = ORC_REG_BP; + break; + case CFI_BP_INDIRECT: + orc->sp_reg = ORC_REG_BP_INDIRECT; + break; + case CFI_R10: + orc->sp_reg = ORC_REG_R10; + break; + case CFI_R13: + orc->sp_reg = ORC_REG_R13; + break; + case CFI_DI: + orc->sp_reg = ORC_REG_DI; + break; + case CFI_DX: + orc->sp_reg = ORC_REG_DX; + break; + default: + WARN_FUNC("unknown CFA base reg %d", + insn->sec, insn->offset, cfi->cfa.base); + return -1; + } - orc->sp_offset = cfa->offset; - orc->bp_offset = bp->offset; - orc->type = insn->cfi.type; + switch (bp->base) { + case CFI_UNDEFINED: + orc->bp_reg = ORC_REG_UNDEFINED; + break; + case CFI_CFA: + orc->bp_reg = ORC_REG_PREV_SP; + break; + case CFI_BP: + orc->bp_reg = ORC_REG_BP; + break; + default: + WARN_FUNC("unknown BP base reg %d", + insn->sec, insn->offset, bp->base); + return -1; } + orc->sp_offset = cfi->cfa.offset; + orc->bp_offset = bp->offset; + orc->type = cfi->type; + return 0; } -static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relocsec, - unsigned int idx, struct section *insn_sec, - unsigned long insn_off, struct orc_entry *o) +static int write_orc_entry(struct elf *elf, struct section *orc_sec, + struct section *ip_rsec, unsigned int idx, + struct section *insn_sec, unsigned long insn_off, + struct orc_entry *o) { struct orc_entry *orc; struct reloc *reloc; /* populate ORC data */ - orc = (struct orc_entry *)u_sec->data->d_buf + idx; + orc = (struct orc_entry *)orc_sec->data->d_buf + idx; memcpy(orc, o, sizeof(*orc)); + orc->sp_offset = bswap_if_needed(orc->sp_offset); + orc->bp_offset = bswap_if_needed(orc->bp_offset); /* populate reloc for ip */ reloc = malloc(sizeof(*reloc)); @@ -114,102 +112,149 @@ static int create_orc_entry(struct elf *elf, struct section *u_sec, struct secti reloc->type = R_X86_64_PC32; reloc->offset = idx * sizeof(int); - reloc->sec = ip_relocsec; + reloc->sec = ip_rsec; elf_add_reloc(elf, reloc); return 0; } -int create_orc_sections(struct objtool_file *file) -{ - struct instruction *insn, *prev_insn; - struct section *sec, *u_sec, *ip_relocsec; - unsigned int idx; +struct orc_list_entry { + struct list_head list; + struct orc_entry orc; + struct section *insn_sec; + unsigned long insn_off; +}; - struct orc_entry empty = { - .sp_reg = ORC_REG_UNDEFINED, - .bp_reg = ORC_REG_UNDEFINED, - .type = UNWIND_HINT_TYPE_CALL, - }; +static int orc_list_add(struct list_head *orc_list, struct orc_entry *orc, + struct section *sec, unsigned long offset) +{ + struct orc_list_entry *entry = malloc(sizeof(*entry)); - sec = find_section_by_name(file->elf, ".orc_unwind"); - if (sec) { - WARN("file already has .orc_unwind section, skipping"); + if (!entry) { + WARN("malloc failed"); return -1; } - /* count the number of needed orcs */ - idx = 0; - for_each_sec(file, sec) { - if (!sec->text) - continue; - - prev_insn = NULL; - sec_for_each_insn(file, sec, insn) { - if (!prev_insn || - memcmp(&insn->orc, &prev_insn->orc, - sizeof(struct orc_entry))) { - idx++; - } - prev_insn = insn; - } - - /* section terminator */ - if (prev_insn) - idx++; - } - if (!idx) - return -1; + entry->orc = *orc; + entry->insn_sec = sec; + entry->insn_off = offset; + list_add_tail(&entry->list, orc_list); + return 0; +} - /* create .orc_unwind_ip and .rela.orc_unwind_ip sections */ - sec = elf_create_section(file->elf, ".orc_unwind_ip", 0, sizeof(int), idx); - if (!sec) - return -1; +static unsigned long alt_group_len(struct alt_group *alt_group) +{ + return alt_group->last_insn->offset + + alt_group->last_insn->len - + alt_group->first_insn->offset; +} - ip_relocsec = elf_create_reloc_section(file->elf, sec, SHT_RELA); - if (!ip_relocsec) - return -1; +int orc_create(struct objtool_file *file) +{ + struct section *sec, *ip_rsec, *orc_sec; + unsigned int nr = 0, idx = 0; + struct orc_list_entry *entry; + struct list_head orc_list; - /* create .orc_unwind section */ - u_sec = elf_create_section(file->elf, ".orc_unwind", 0, - sizeof(struct orc_entry), idx); + struct orc_entry null = { + .sp_reg = ORC_REG_UNDEFINED, + .bp_reg = ORC_REG_UNDEFINED, + .type = UNWIND_HINT_TYPE_CALL, + }; - /* populate sections */ - idx = 0; + /* Build a deduplicated list of ORC entries: */ + INIT_LIST_HEAD(&orc_list); for_each_sec(file, sec) { + struct orc_entry orc, prev_orc = {0}; + struct instruction *insn; + bool empty = true; + if (!sec->text) continue; - prev_insn = NULL; sec_for_each_insn(file, sec, insn) { - if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc, - sizeof(struct orc_entry))) { + struct alt_group *alt_group = insn->alt_group; + int i; - if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx, - insn->sec, insn->offset, - &insn->orc)) + if (!alt_group) { + if (init_orc_entry(&orc, &insn->cfi)) return -1; + if (!memcmp(&prev_orc, &orc, sizeof(orc))) + continue; + if (orc_list_add(&orc_list, &orc, sec, + insn->offset)) + return -1; + nr++; + prev_orc = orc; + empty = false; + continue; + } - idx++; + /* + * Alternatives can have different stack layout + * possibilities (but they shouldn't conflict). + * Instead of traversing the instructions, use the + * alt_group's flattened byte-offset-addressed CFI + * array. + */ + for (i = 0; i < alt_group_len(alt_group); i++) { + struct cfi_state *cfi = alt_group->cfi[i]; + if (!cfi) + continue; + if (init_orc_entry(&orc, cfi)) + return -1; + if (!memcmp(&prev_orc, &orc, sizeof(orc))) + continue; + if (orc_list_add(&orc_list, &orc, insn->sec, + insn->offset + i)) + return -1; + nr++; + prev_orc = orc; + empty = false; } - prev_insn = insn; - } - /* section terminator */ - if (prev_insn) { - if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx, - prev_insn->sec, - prev_insn->offset + prev_insn->len, - &empty)) - return -1; + /* Skip to the end of the alt_group */ + insn = alt_group->last_insn; + } - idx++; + /* Add a section terminator */ + if (!empty) { + orc_list_add(&orc_list, &null, sec, sec->len); + nr++; } } + if (!nr) + return 0; + + /* Create .orc_unwind, .orc_unwind_ip and .rela.orc_unwind_ip sections: */ + sec = find_section_by_name(file->elf, ".orc_unwind"); + if (sec) { + WARN("file already has .orc_unwind section, skipping"); + return -1; + } + orc_sec = elf_create_section(file->elf, ".orc_unwind", 0, + sizeof(struct orc_entry), nr); + if (!orc_sec) + return -1; + + sec = elf_create_section(file->elf, ".orc_unwind_ip", 0, sizeof(int), nr); + if (!sec) + return -1; + ip_rsec = elf_create_reloc_section(file->elf, sec, SHT_RELA); + if (!ip_rsec) + return -1; + + /* Write ORC entries to sections: */ + list_for_each_entry(entry, &orc_list, list) { + if (write_orc_entry(file->elf, orc_sec, ip_rsec, idx++, + entry->insn_sec, entry->insn_off, + &entry->orc)) + return -1; + } - if (elf_rebuild_reloc_section(file->elf, ip_relocsec)) + if (elf_rebuild_reloc_section(file->elf, ip_rsec)) return -1; return 0; diff --git a/tools/objtool/special.c b/tools/objtool/special.c index 1a2420febd08..2c7fbda7b055 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -11,10 +11,11 @@ #include <stdlib.h> #include <string.h> -#include "builtin.h" -#include "special.h" -#include "warn.h" -#include "arch_special.h" +#include <arch/special.h> +#include <objtool/builtin.h> +#include <objtool/special.h> +#include <objtool/warn.h> +#include <objtool/endianness.h> struct special_entry { const char *sec; @@ -77,8 +78,9 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, if (entry->feature) { unsigned short feature; - feature = *(unsigned short *)(sec->data->d_buf + offset + - entry->feature); + feature = bswap_if_needed(*(unsigned short *)(sec->data->d_buf + + offset + + entry->feature)); arch_handle_alternative(feature, alt); } diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c index 7843e9a7a72f..8314e824db4a 100644 --- a/tools/objtool/weak.c +++ b/tools/objtool/weak.c @@ -7,7 +7,7 @@ #include <stdbool.h> #include <errno.h> -#include "objtool.h" +#include <objtool/objtool.h> #define UNSUPPORTED(name) \ ({ \ @@ -25,12 +25,7 @@ int __weak orc_dump(const char *_objname) UNSUPPORTED("orc"); } -int __weak create_orc(struct objtool_file *file) -{ - UNSUPPORTED("orc"); -} - -int __weak create_orc_sections(struct objtool_file *file) +int __weak orc_create(struct objtool_file *file) { UNSUPPORTED("orc"); } diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt index c0a66400a960..9af8b8dfb7b6 100644 --- a/tools/perf/Documentation/perf-evlist.txt +++ b/tools/perf/Documentation/perf-evlist.txt @@ -29,7 +29,7 @@ OPTIONS Show just the sample frequency used for each event. -v:: ---verbose=:: +--verbose:: Show all fields. -g:: diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt index 1e91121bac0f..6e82b7cc0bf0 100644 --- a/tools/perf/Documentation/perf-ftrace.txt +++ b/tools/perf/Documentation/perf-ftrace.txt @@ -28,8 +28,8 @@ OPTIONS specified: function_graph or function. -v:: ---verbose=:: - Verbosity level. +--verbose:: + Increase the verbosity level. -F:: --funcs:: diff --git a/tools/perf/Documentation/perf-kallsyms.txt b/tools/perf/Documentation/perf-kallsyms.txt index f3c620951f6e..c97527df8ecd 100644 --- a/tools/perf/Documentation/perf-kallsyms.txt +++ b/tools/perf/Documentation/perf-kallsyms.txt @@ -20,5 +20,5 @@ modules). OPTIONS ------- -v:: ---verbose=:: +--verbose:: Increase verbosity level, showing details about symbol table loading, etc. diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index abc9b5d83312..f0da8cf63e9a 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -97,8 +97,8 @@ filter out the startup phase of the program, which is often very different. Filter out events for these pids and for 'trace' itself (comma separated list). -v:: ---verbose=:: - Verbosity level. +--verbose:: + Increase the verbosity level. --no-inherit:: Child tasks do not inherit counters. diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 5345ac70cd83..f6e609673de2 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -607,7 +607,7 @@ arch_errno_hdr_dir := $(srctree)/tools arch_errno_tbl := $(srctree)/tools/perf/trace/beauty/arch_errno_names.sh $(arch_errno_name_array): $(arch_errno_tbl) - $(Q)$(SHELL) '$(arch_errno_tbl)' $(firstword $(CC)) $(arch_errno_hdr_dir) > $@ + $(Q)$(SHELL) '$(arch_errno_tbl)' '$(patsubst -%,,$(CC))' $(arch_errno_hdr_dir) > $@ sync_file_range_arrays := $(beauty_outdir)/sync_file_range_arrays.c sync_file_range_tbls := $(srctree)/tools/perf/trace/beauty/sync_file_range.sh @@ -1001,14 +1001,6 @@ $(INSTALL_DOC_TARGETS): ### Cleaning rules -# -# This is here, not in Makefile.config, because Makefile.config does -# not get included for the clean target: -# -config-clean: - $(call QUIET_CLEAN, config) - $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null - python-clean: $(python-clean) @@ -1048,7 +1040,7 @@ endif # BUILD_BPF_SKEL bpf-skel-clean: $(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS) -clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean config-clean fixdep-clean python-clean bpf-skel-clean +clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index bd446aba64f7..c25c878fd06c 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -156,6 +156,10 @@ out: return err; } +#define ETM_SET_OPT_CTXTID (1 << 0) +#define ETM_SET_OPT_TS (1 << 1) +#define ETM_SET_OPT_MASK (ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS) + static int cs_etm_set_option(struct auxtrace_record *itr, struct evsel *evsel, u32 option) { @@ -169,17 +173,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr, !cpu_map__has(online_cpus, i)) continue; - if (option & ETM_OPT_CTXTID) { + if (option & ETM_SET_OPT_CTXTID) { err = cs_etm_set_context_id(itr, evsel, i); if (err) goto out; } - if (option & ETM_OPT_TS) { + if (option & ETM_SET_OPT_TS) { err = cs_etm_set_timestamp(itr, evsel, i); if (err) goto out; } - if (option & ~(ETM_OPT_CTXTID | ETM_OPT_TS)) + if (option & ~(ETM_SET_OPT_MASK)) /* Nothing else is currently supported */ goto out; } @@ -406,7 +410,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, evsel__set_sample_bit(cs_etm_evsel, CPU); err = cs_etm_set_option(itr, cs_etm_evsel, - ETM_OPT_CTXTID | ETM_OPT_TS); + ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS); if (err) goto out; } diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index f744eb5cba88..0b2480cf3e47 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -9,9 +9,7 @@ # 0 nospu restart_syscall sys_restart_syscall 1 nospu exit sys_exit -2 32 fork ppc_fork sys_fork -2 64 fork sys_fork -2 spu fork sys_ni_syscall +2 nospu fork sys_fork 3 common read sys_read 4 common write sys_write 5 common open sys_open compat_sys_open @@ -160,9 +158,7 @@ 119 32 sigreturn sys_sigreturn compat_sys_sigreturn 119 64 sigreturn sys_ni_syscall 119 spu sigreturn sys_ni_syscall -120 32 clone ppc_clone sys_clone -120 64 clone sys_clone -120 spu clone sys_ni_syscall +120 nospu clone sys_clone 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common modify_ldt sys_ni_syscall @@ -244,9 +240,7 @@ 186 spu sendfile sys_sendfile64 187 common getpmsg sys_ni_syscall 188 common putpmsg sys_ni_syscall -189 32 vfork ppc_vfork sys_vfork -189 64 vfork sys_vfork -189 spu vfork sys_ni_syscall +189 nospu vfork sys_vfork 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit 191 common readahead sys_readahead compat_sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 @@ -322,9 +316,7 @@ 248 32 clock_nanosleep sys_clock_nanosleep_time32 248 64 clock_nanosleep sys_clock_nanosleep 248 spu clock_nanosleep sys_clock_nanosleep -249 32 swapcontext ppc_swapcontext compat_sys_swapcontext -249 64 swapcontext sys_swapcontext -249 spu swapcontext sys_ni_syscall +249 nospu swapcontext sys_swapcontext compat_sys_swapcontext 250 common tgkill sys_tgkill 251 32 utimes sys_utimes_time32 251 64 utimes sys_utimes @@ -522,12 +514,11 @@ 432 common fsmount sys_fsmount 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open -435 32 clone3 ppc_clone3 sys_clone3 -435 64 clone3 sys_clone3 -435 spu clone3 sys_ni_syscall +435 nospu clone3 sys_clone3 436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index d443423495e5..3abef2144dac 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -444,3 +444,4 @@ 439 common faccessat2 sys_faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr sys_mount_setattr diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 8cc6642fce7a..5a9f9a7bf07d 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -10,10 +10,11 @@ PERF_HAVE_JITDUMP := 1 # Syscall table generation # -out := $(OUTPUT)arch/x86/include/generated/asm -header := $(out)/syscalls_64.c -sys := $(srctree)/tools/perf/arch/x86/entry/syscalls -systbl := $(sys)/syscalltbl.sh +generated := $(OUTPUT)arch/x86/include/generated +out := $(generated)/asm +header := $(out)/syscalls_64.c +sys := $(srctree)/tools/perf/arch/x86/entry/syscalls +systbl := $(sys)/syscalltbl.sh # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') @@ -22,6 +23,6 @@ $(header): $(sys)/syscall_64.tbl $(systbl) $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@ clean:: - $(call QUIET_CLEAN, x86) $(RM) $(header) + $(call QUIET_CLEAN, x86) $(RM) -r $(header) $(generated) archheaders: $(header) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 78672124d28b..7bf01cbe582f 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -363,6 +363,7 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h index 6a54b94f1c25..0e20f3dc69f3 100644 --- a/tools/perf/arch/x86/include/arch-tests.h +++ b/tools/perf/arch/x86/include/arch-tests.h @@ -10,6 +10,7 @@ int test__rdpmc(struct test *test __maybe_unused, int subtest); int test__insn_x86(struct test *test __maybe_unused, int subtest); int test__intel_pt_pkt_decoder(struct test *test, int subtest); int test__bp_modify(struct test *test, int subtest); +int test__x86_sample_parsing(struct test *test, int subtest); #ifdef HAVE_DWARF_UNWIND_SUPPORT struct thread; diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build index 36d4f248b51d..28d793390198 100644 --- a/tools/perf/arch/x86/tests/Build +++ b/tools/perf/arch/x86/tests/Build @@ -3,5 +3,6 @@ perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o perf-y += arch-tests.o perf-y += rdpmc.o +perf-y += sample-parsing.o perf-$(CONFIG_AUXTRACE) += insn-x86.o intel-pt-pkt-decoder-test.o perf-$(CONFIG_X86_64) += bp-modify.o diff --git a/tools/perf/arch/x86/tests/arch-tests.c b/tools/perf/arch/x86/tests/arch-tests.c index bc25d727b4e9..71aa67367ad6 100644 --- a/tools/perf/arch/x86/tests/arch-tests.c +++ b/tools/perf/arch/x86/tests/arch-tests.c @@ -31,6 +31,10 @@ struct test arch_tests[] = { }, #endif { + .desc = "x86 Sample parsing", + .func = test__x86_sample_parsing, + }, + { .func = NULL, }, diff --git a/tools/perf/arch/x86/tests/insn-x86.c b/tools/perf/arch/x86/tests/insn-x86.c index f782ef8c5982..4f75ae990140 100644 --- a/tools/perf/arch/x86/tests/insn-x86.c +++ b/tools/perf/arch/x86/tests/insn-x86.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> -#include "../../../../arch/x86/include/asm/insn.h" #include <string.h> #include "debug.h" #include "tests/tests.h" #include "arch-tests.h" +#include "../../../../arch/x86/include/asm/insn.h" #include "intel-pt-decoder/intel-pt-insn-decoder.h" diff --git a/tools/perf/arch/x86/tests/sample-parsing.c b/tools/perf/arch/x86/tests/sample-parsing.c new file mode 100644 index 000000000000..c92db87e4479 --- /dev/null +++ b/tools/perf/arch/x86/tests/sample-parsing.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdbool.h> +#include <inttypes.h> +#include <stdlib.h> +#include <string.h> +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include "event.h" +#include "evsel.h" +#include "debug.h" +#include "util/synthetic-events.h" + +#include "tests/tests.h" +#include "arch-tests.h" + +#define COMP(m) do { \ + if (s1->m != s2->m) { \ + pr_debug("Samples differ at '"#m"'\n"); \ + return false; \ + } \ +} while (0) + +static bool samples_same(const struct perf_sample *s1, + const struct perf_sample *s2, + u64 type) +{ + if (type & PERF_SAMPLE_WEIGHT_STRUCT) + COMP(ins_lat); + + return true; +} + +static int do_test(u64 sample_type) +{ + struct evsel evsel = { + .needs_swap = false, + .core = { + . attr = { + .sample_type = sample_type, + .read_format = 0, + }, + }, + }; + union perf_event *event; + struct perf_sample sample = { + .weight = 101, + .ins_lat = 102, + }; + struct perf_sample sample_out; + size_t i, sz, bufsz; + int err, ret = -1; + + sz = perf_event__sample_event_size(&sample, sample_type, 0); + bufsz = sz + 4096; /* Add a bit for overrun checking */ + event = malloc(bufsz); + if (!event) { + pr_debug("malloc failed\n"); + return -1; + } + + memset(event, 0xff, bufsz); + event->header.type = PERF_RECORD_SAMPLE; + event->header.misc = 0; + event->header.size = sz; + + err = perf_event__synthesize_sample(event, sample_type, 0, &sample); + if (err) { + pr_debug("%s failed for sample_type %#"PRIx64", error %d\n", + "perf_event__synthesize_sample", sample_type, err); + goto out_free; + } + + /* The data does not contain 0xff so we use that to check the size */ + for (i = bufsz; i > 0; i--) { + if (*(i - 1 + (u8 *)event) != 0xff) + break; + } + if (i != sz) { + pr_debug("Event size mismatch: actual %zu vs expected %zu\n", + i, sz); + goto out_free; + } + + evsel.sample_size = __evsel__sample_size(sample_type); + + err = evsel__parse_sample(&evsel, event, &sample_out); + if (err) { + pr_debug("%s failed for sample_type %#"PRIx64", error %d\n", + "evsel__parse_sample", sample_type, err); + goto out_free; + } + + if (!samples_same(&sample, &sample_out, sample_type)) { + pr_debug("parsing failed for sample_type %#"PRIx64"\n", + sample_type); + goto out_free; + } + + ret = 0; +out_free: + free(event); + + return ret; +} + +/** + * test__x86_sample_parsing - test X86 specific sample parsing + * + * This function implements a test that synthesizes a sample event, parses it + * and then checks that the parsed sample matches the original sample. If the + * test passes %0 is returned, otherwise %-1 is returned. + * + * For now, the PERF_SAMPLE_WEIGHT_STRUCT is the only X86 specific sample type. + * The test only checks the PERF_SAMPLE_WEIGHT_STRUCT type. + */ +int test__x86_sample_parsing(struct test *test __maybe_unused, int subtest __maybe_unused) +{ + return do_test(PERF_SAMPLE_WEIGHT_STRUCT); +} diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c index 3e6791531ca5..34d600c51044 100644 --- a/tools/perf/arch/x86/util/archinsn.c +++ b/tools/perf/arch/x86/util/archinsn.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -#include "../../../../arch/x86/include/asm/insn.h" #include "archinsn.h" #include "event.h" #include "machine.h" #include "thread.h" #include "symbol.h" +#include "../../../../arch/x86/include/asm/insn.h" void arch_fetch_insn(struct perf_sample *sample, struct thread *thread, diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 11726ec6285f..20b87e29c96f 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -344,18 +344,22 @@ static void mempol_restore(void) static void bind_to_memnode(int node) { - unsigned long nodemask; + struct bitmask *node_mask; int ret; if (node == NUMA_NO_NODE) return; - BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); - nodemask = 1L << node; + node_mask = numa_allocate_nodemask(); + BUG_ON(!node_mask); - ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); - dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + numa_bitmask_clearall(node_mask); + numa_bitmask_setbit(node_mask, node); + ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1); + dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret); + + numa_bitmask_free(node_mask); BUG_ON(ret); } @@ -876,8 +880,6 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) prctl(0, bytes_worked); } -#define MAX_NR_NODES 64 - /* * Count the number of nodes a process's threads * are spread out on. @@ -888,10 +890,15 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) */ static int count_process_nodes(int process_nr) { - char node_present[MAX_NR_NODES] = { 0, }; + char *node_present; int nodes; int n, t; + node_present = (char *)malloc(g->p.nr_nodes * sizeof(char)); + BUG_ON(!node_present); + for (nodes = 0; nodes < g->p.nr_nodes; nodes++) + node_present[nodes] = 0; + for (t = 0; t < g->p.nr_threads; t++) { struct thread_data *td; int task_nr; @@ -901,17 +908,20 @@ static int count_process_nodes(int process_nr) td = g->threads + task_nr; node = numa_node_of_cpu(td->curr_cpu); - if (node < 0) /* curr_cpu was likely still -1 */ + if (node < 0) /* curr_cpu was likely still -1 */ { + free(node_present); return 0; + } node_present[node] = 1; } nodes = 0; - for (n = 0; n < MAX_NR_NODES; n++) + for (n = 0; n < g->p.nr_nodes; n++) nodes += node_present[n]; + free(node_present); return nodes; } @@ -980,7 +990,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence) { unsigned int loops_done_min, loops_done_max; int process_groups; - int nodes[MAX_NR_NODES]; + int *nodes; int distance; int nr_min; int nr_max; @@ -994,6 +1004,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence) if (!g->p.show_convergence && !g->p.measure_convergence) return; + nodes = (int *)malloc(g->p.nr_nodes * sizeof(int)); + BUG_ON(!nodes); for (node = 0; node < g->p.nr_nodes; node++) nodes[node] = 0; @@ -1035,8 +1047,10 @@ static void calc_convergence(double runtime_ns_max, double *convergence) BUG_ON(sum > g->p.nr_tasks); - if (0 && (sum < g->p.nr_tasks)) + if (0 && (sum < g->p.nr_tasks)) { + free(nodes); return; + } /* * Count the number of distinct process groups present @@ -1088,6 +1102,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence) } tprintf("\n"); } + + free(nodes); } static void show_summary(double runtime_ns_max, int l, double *convergence) @@ -1413,7 +1429,7 @@ static int init(void) g->p.nr_nodes = numa_max_node() + 1; /* char array in count_process_nodes(): */ - BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); + BUG_ON(g->p.nr_nodes < 0); if (g->p.show_quiet && !g->p.show_details) g->p.show_details = -1; diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index cecce93ccc63..488f6e6ba1a5 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -309,11 +309,11 @@ int bench_sched_messaging(int argc, const char **argv) num_groups, num_groups * 2 * num_fds, thread_mode ? "threads" : "processes"); printf(" %14s: %lu.%03lu [sec]\n", "Total time", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; case BENCH_FORMAT_SIMPLE: - printf("%lu.%03lu\n", diff.tv_sec, + printf("%lu.%03lu\n", (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; default: diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index 3c88d1f201f1..a960e7a93aec 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -156,7 +156,7 @@ int bench_sched_pipe(int argc, const char **argv) result_usec += diff.tv_usec; printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); printf(" %14lf usecs/op\n", @@ -168,7 +168,7 @@ int bench_sched_pipe(int argc, const char **argv) case BENCH_FORMAT_SIMPLE: printf("%lu.%03lu\n", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c index 5fe621cff8e9..9b751016f4b6 100644 --- a/tools/perf/bench/syscall.c +++ b/tools/perf/bench/syscall.c @@ -54,7 +54,7 @@ int bench_syscall_basic(int argc, const char **argv) result_usec += diff.tv_usec; printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec/1000)); printf(" %14lf usecs/op\n", @@ -66,7 +66,7 @@ int bench_syscall_basic(int argc, const char **argv) case BENCH_FORMAT_SIMPLE: printf("%lu.%03lu\n", - diff.tv_sec, + (unsigned long) diff.tv_sec, (unsigned long) (diff.tv_usec / 1000)); break; diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 617feaf020f6..7c4a9d424a64 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -161,7 +161,7 @@ static int session_config(struct daemon *daemon, const char *var, const char *va struct daemon_session *session; char name[100]; - if (get_session_name(var, name, sizeof(name))) + if (get_session_name(var, name, sizeof(name) - 1)) return -EINVAL; var = strchr(var, '.'); @@ -373,12 +373,12 @@ static int daemon_session__run(struct daemon_session *session, dup2(fd, 2); close(fd); - if (mkfifo(SESSION_CONTROL, O_RDWR) && errno != EEXIST) { + if (mkfifo(SESSION_CONTROL, 0600) && errno != EEXIST) { perror("failed: create control fifo"); return -1; } - if (mkfifo(SESSION_ACK, O_RDWR) && errno != EEXIST) { + if (mkfifo(SESSION_ACK, 0600) && errno != EEXIST) { perror("failed: create ack fifo"); return -1; } @@ -402,35 +402,42 @@ static pid_t handle_signalfd(struct daemon *daemon) int status; pid_t pid; + /* + * Take signal fd data as pure signal notification and check all + * the sessions state. The reason is that multiple signals can get + * coalesced in kernel and we can receive only single signal even + * if multiple SIGCHLD were generated. + */ err = read(daemon->signal_fd, &si, sizeof(struct signalfd_siginfo)); - if (err != sizeof(struct signalfd_siginfo)) + if (err != sizeof(struct signalfd_siginfo)) { + pr_err("failed to read signal fd\n"); return -1; + } list_for_each_entry(session, &daemon->sessions, list) { + if (session->pid == -1) + continue; - if (session->pid != (int) si.ssi_pid) + pid = waitpid(session->pid, &status, WNOHANG); + if (pid <= 0) continue; - pid = waitpid(session->pid, &status, 0); - if (pid == session->pid) { - if (WIFEXITED(status)) { - pr_info("session '%s' exited, status=%d\n", - session->name, WEXITSTATUS(status)); - } else if (WIFSIGNALED(status)) { - pr_info("session '%s' killed (signal %d)\n", - session->name, WTERMSIG(status)); - } else if (WIFSTOPPED(status)) { - pr_info("session '%s' stopped (signal %d)\n", - session->name, WSTOPSIG(status)); - } else { - pr_info("session '%s' Unexpected status (0x%x)\n", - session->name, status); - } + if (WIFEXITED(status)) { + pr_info("session '%s' exited, status=%d\n", + session->name, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + pr_info("session '%s' killed (signal %d)\n", + session->name, WTERMSIG(status)); + } else if (WIFSTOPPED(status)) { + pr_info("session '%s' stopped (signal %d)\n", + session->name, WSTOPSIG(status)); + } else { + pr_info("session '%s' Unexpected status (0x%x)\n", + session->name, status); } session->state = KILL; session->pid = -1; - return pid; } return 0; @@ -443,7 +450,6 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d .fd = daemon->signal_fd, .events = POLLIN, }; - pid_t wpid = 0, pid = session->pid; time_t start; start = time(NULL); @@ -452,7 +458,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d int err = poll(&pollfd, 1, 1000); if (err > 0) { - wpid = handle_signalfd(daemon); + handle_signalfd(daemon); } else if (err < 0) { perror("failed: poll\n"); return -1; @@ -460,7 +466,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d if (start + secs < time(NULL)) return -1; - } while (wpid != pid); + } while (session->pid != -1); return 0; } @@ -902,7 +908,9 @@ static void daemon_session__kill(struct daemon_session *session, daemon_session__signal(session, SIGKILL); break; default: - break; + pr_err("failed to wait for session %s\n", + session->name); + return; } how++; @@ -955,7 +963,8 @@ static void daemon__kill(struct daemon *daemon) daemon__signal(daemon, SIGKILL); break; default: - break; + pr_err("failed to wait for sessions\n"); + return; } how++; @@ -1344,7 +1353,7 @@ out: close(sock_fd); if (conf_fd != -1) close(conf_fd); - if (conf_fd != -1) + if (signal_fd != -1) close(signal_fd); pr_info("daemon exited\n"); diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 8f6c784ce629..878e04b1fab7 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1236,7 +1236,8 @@ static int __cmd_diff(void) out_delete: data__for_each_file(i, d) { - perf_session__delete(d->session); + if (!IS_ERR(d->session)) + perf_session__delete(d->session); data__free(d); } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 85b6a46e85b6..7ec18ff57fc4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -3964,9 +3964,6 @@ static int trace__run(struct trace *trace, int argc, const char **argv) evlist__config(evlist, &trace->opts, &callchain_param); - signal(SIGCHLD, sig_handler); - signal(SIGINT, sig_handler); - if (forks) { err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL); if (err < 0) { @@ -4827,6 +4824,8 @@ int cmd_trace(int argc, const char **argv) signal(SIGSEGV, sighandler_dump_stack); signal(SIGFPE, sighandler_dump_stack); + signal(SIGCHLD, sig_handler); + signal(SIGINT, sig_handler); trace.evlist = evlist__new(); trace.sctbl = syscalltbl__new(); diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh index 0cfb3e2cefef..133f0eddbcc4 100644 --- a/tools/perf/perf-archive.sh +++ b/tools/perf/perf-archive.sh @@ -20,9 +20,8 @@ else fi BUILDIDS=$(mktemp /tmp/perf-archive-buildids.XXXXXX) -NOBUILDID=0000000000000000000000000000000000000000 -perf buildid-list -i $PERF_DATA --with-hits | grep -v "^$NOBUILDID " > $BUILDIDS +perf buildid-list -i $PERF_DATA --with-hits | grep -v "^ " > $BUILDIDS if [ ! -s $BUILDIDS ] ; then echo "perf archive: no build-ids found" rm $BUILDIDS || true diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index ec972e0892ab..dd39ce9b0277 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -182,14 +182,20 @@ int test__attr(struct test *test __maybe_unused, int subtest __maybe_unused) struct stat st; char path_perf[PATH_MAX]; char path_dir[PATH_MAX]; + char *exec_path; /* First try development tree tests. */ if (!lstat("./tests", &st)) return run_dir("./tests", "./perf"); + exec_path = get_argv_exec_path(); + if (exec_path == NULL) + return -1; + /* Then installed path. */ - snprintf(path_dir, PATH_MAX, "%s/tests", get_argv_exec_path()); + snprintf(path_dir, PATH_MAX, "%s/tests", exec_path); snprintf(path_perf, PATH_MAX, "%s/perf", BINDIR); + free(exec_path); if (!lstat(path_dir, &st) && !lstat(path_perf, &st)) diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index f57e075b0ed2..c72adbd67386 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -86,7 +86,7 @@ static struct { .msg_load_fail = "check your vmlinux setting?", .target_func = &epoll_pwait_loop, .expect_result = (NR_ITERS + 1) / 2, - .pin = true, + .pin = true, }, #ifdef HAVE_BPF_PROLOGUE { @@ -99,13 +99,6 @@ static struct { .expect_result = (NR_ITERS + 1) / 4, }, #endif - { - .prog_id = LLVM_TESTCASE_BPF_RELOCATION, - .desc = "BPF relocation checker", - .name = "[bpf_relocation_test]", - .msg_compile_fail = "fix 'perf test LLVM' first", - .msg_load_fail = "libbpf error when dealing with relocation", - }, }; static int do_test(struct bpf_object *obj, int (*func)(void), diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 280f0348a09c..2fdc7b2f996e 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -706,13 +706,9 @@ static int do_test_code_reading(bool try_kcore) out_put: thread__put(thread); out_err: - - if (evlist) { - evlist__delete(evlist); - } else { - perf_cpu_map__put(cpus); - perf_thread_map__put(threads); - } + evlist__delete(evlist); + perf_cpu_map__put(cpus); + perf_thread_map__put(threads); machine__delete_threads(machine); machine__delete(machine); diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 29c793ac7d10..0472b110fe65 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -106,6 +106,8 @@ static int cpu_map_print(const char *str) return -1; cpu_map__snprint(map, buf, sizeof(buf)); + perf_cpu_map__put(map); + return !strcmp(buf, str); } diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index e6f1b2a38e03..a0438b0f0805 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -154,10 +154,9 @@ out_err: if (evlist) { evlist__disable(evlist); evlist__delete(evlist); - } else { - perf_cpu_map__put(cpus); - perf_thread_map__put(threads); } + perf_cpu_map__put(cpus); + perf_thread_map__put(threads); return err; } diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 57093aeacc6f..73ae8f7aa066 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -158,8 +158,6 @@ out_init: out_delete_evlist: evlist__delete(evlist); - cpus = NULL; - threads = NULL; out_free_cpus: perf_cpu_map__put(cpus); out_free_threads: diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index 7cff02664d0e..680c3cffb128 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -167,6 +167,8 @@ next_event: out_err: evlist__delete(evlist); + perf_cpu_map__put(cpus); + perf_thread_map__put(threads); return err; } diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 0dbe3aa99853..8fd8a4ef97da 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -129,9 +129,6 @@ static bool samples_same(const struct perf_sample *s1, if (type & PERF_SAMPLE_WEIGHT) COMP(weight); - if (type & PERF_SAMPLE_WEIGHT_STRUCT) - COMP(ins_lat); - if (type & PERF_SAMPLE_DATA_SRC) COMP(data_src); @@ -245,7 +242,6 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .cgroup = 114, .data_page_size = 115, .code_page_size = 116, - .ins_lat = 117, .aux_sample = { .size = sizeof(aux_data), .data = (void *)aux_data, diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh index e5b824dd08d9..58984380b211 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # daemon operations # SPDX-License-Identifier: GPL-2.0 @@ -140,10 +140,10 @@ test_list() base=BASE [session-size] -run = -e cpu-clock +run = -e cpu-clock -m 1 sleep 10 [session-time] -run = -e task-clock +run = -e task-clock -m 1 sleep 10 EOF sed -i -e "s|BASE|${base}|" ${config} @@ -159,14 +159,14 @@ EOF # check 1st session # pid:size:-e cpu-clock:base/size:base/size/output:base/size/control:base/size/ack:0 local line=`perf daemon --config ${config} -x: | head -2 | tail -1` - check_line_other "${line}" size "-e cpu-clock" ${base}/session-size \ + check_line_other "${line}" size "-e cpu-clock -m 1 sleep 10" ${base}/session-size \ ${base}/session-size/output ${base}/session-size/control \ ${base}/session-size/ack "0" # check 2nd session # pid:time:-e task-clock:base/time:base/time/output:base/time/control:base/time/ack:0 local line=`perf daemon --config ${config} -x: | head -3 | tail -1` - check_line_other "${line}" time "-e task-clock" ${base}/session-time \ + check_line_other "${line}" time "-e task-clock -m 1 sleep 10" ${base}/session-time \ ${base}/session-time/output ${base}/session-time/control \ ${base}/session-time/ack "0" @@ -190,10 +190,10 @@ test_reconfig() base=BASE [session-size] -run = -e cpu-clock +run = -e cpu-clock -m 1 sleep 10 [session-time] -run = -e task-clock +run = -e task-clock -m 1 sleep 10 EOF sed -i -e "s|BASE|${base}|" ${config} @@ -204,7 +204,7 @@ EOF # check 2nd session # pid:time:-e task-clock:base/time:base/time/output:base/time/control:base/time/ack:0 local line=`perf daemon --config ${config} -x: | head -3 | tail -1` - check_line_other "${line}" time "-e task-clock" ${base}/session-time \ + check_line_other "${line}" time "-e task-clock -m 1 sleep 10" ${base}/session-time \ ${base}/session-time/output ${base}/session-time/control ${base}/session-time/ack "0" local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'` @@ -215,10 +215,10 @@ EOF base=BASE [session-size] -run = -e cpu-clock +run = -e cpu-clock -m 1 sleep 10 [session-time] -run = -e cpu-clock +run = -e cpu-clock -m 1 sleep 10 EOF # TEST 1 - change config @@ -238,7 +238,7 @@ EOF # check reconfigured 2nd session # pid:time:-e task-clock:base/time:base/time/output:base/time/control:base/time/ack:0 local line=`perf daemon --config ${config} -x: | head -3 | tail -1` - check_line_other "${line}" time "-e cpu-clock" ${base}/session-time \ + check_line_other "${line}" time "-e cpu-clock -m 1 sleep 10" ${base}/session-time \ ${base}/session-time/output ${base}/session-time/control ${base}/session-time/ack "0" # TEST 2 - empty config @@ -309,10 +309,10 @@ test_stop() base=BASE [session-size] -run = -e cpu-clock +run = -e cpu-clock -m 1 sleep 10 [session-time] -run = -e task-clock +run = -e task-clock -m 1 sleep 10 EOF sed -i -e "s|BASE|${base}|" ${config} @@ -361,7 +361,7 @@ test_signal() base=BASE [session-test] -run = -e cpu-clock --switch-output +run = -e cpu-clock --switch-output -m 1 sleep 10 EOF sed -i -e "s|BASE|${base}|" ${config} @@ -400,10 +400,10 @@ test_ping() base=BASE [session-size] -run = -e cpu-clock +run = -e cpu-clock -m 1 sleep 10 [session-time] -run = -e task-clock +run = -e task-clock -m 1 sleep 10 EOF sed -i -e "s|BASE|${base}|" ${config} @@ -439,7 +439,7 @@ test_lock() base=BASE [session-size] -run = -e cpu-clock +run = -e cpu-clock -m 1 sleep 10 EOF sed -i -e "s|BASE|${base}|" ${config} diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index a49c9e23053b..74988846be1d 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -42,8 +42,8 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) .disabled = 1, .freq = 1, }; - struct perf_cpu_map *cpus; - struct perf_thread_map *threads; + struct perf_cpu_map *cpus = NULL; + struct perf_thread_map *threads = NULL; struct mmap *md; attr.sample_freq = 500; @@ -66,14 +66,11 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) if (!cpus || !threads) { err = -ENOMEM; pr_debug("Not enough memory to create thread/cpu maps\n"); - goto out_free_maps; + goto out_delete_evlist; } perf_evlist__set_maps(&evlist->core, cpus, threads); - cpus = NULL; - threads = NULL; - if (evlist__open(evlist)) { const char *knob = "/proc/sys/kernel/perf_event_max_sample_rate"; @@ -129,10 +126,9 @@ out_init: err = -1; } -out_free_maps: +out_delete_evlist: perf_cpu_map__put(cpus); perf_thread_map__put(threads); -out_delete_evlist: evlist__delete(evlist); return err; } diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index 15a2ab765d89..3ebaa758df77 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -574,10 +574,9 @@ out: if (evlist) { evlist__disable(evlist); evlist__delete(evlist); - } else { - perf_cpu_map__put(cpus); - perf_thread_map__put(threads); } + perf_cpu_map__put(cpus); + perf_thread_map__put(threads); return err; diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index bbf94e4aa145..4c2969db59b0 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -75,14 +75,11 @@ int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused if (!cpus || !threads) { err = -ENOMEM; pr_debug("Not enough memory to create thread/cpu maps\n"); - goto out_free_maps; + goto out_delete_evlist; } perf_evlist__set_maps(&evlist->core, cpus, threads); - cpus = NULL; - threads = NULL; - err = evlist__prepare_workload(evlist, &target, argv, false, workload_exec_failed_signal); if (err < 0) { pr_debug("Couldn't run the workload!\n"); @@ -137,7 +134,7 @@ out_init: if (retry_count++ > 1000) { pr_debug("Failed after retrying 1000 times\n"); err = -1; - goto out_free_maps; + goto out_delete_evlist; } goto retry; @@ -148,10 +145,9 @@ out_init: err = -1; } -out_free_maps: +out_delete_evlist: perf_cpu_map__put(cpus); perf_thread_map__put(threads); -out_delete_evlist: evlist__delete(evlist); return err; } diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 28f51c4bd373..d1e208b4a571 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -102,6 +102,7 @@ int test__thread_map_synthesize(struct test *test __maybe_unused, int subtest __ TEST_ASSERT_VAL("failed to synthesize map", !perf_event__synthesize_thread_map2(NULL, threads, process_event, NULL)); + perf_thread_map__put(threads); return 0; } @@ -109,12 +110,12 @@ int test__thread_map_remove(struct test *test __maybe_unused, int subtest __mayb { struct perf_thread_map *threads; char *str; - int i; TEST_ASSERT_VAL("failed to allocate map string", asprintf(&str, "%d,%d", getpid(), getppid()) >= 0); threads = thread_map__new_str(str, NULL, 0, false); + free(str); TEST_ASSERT_VAL("failed to allocate thread_map", threads); @@ -141,9 +142,6 @@ int test__thread_map_remove(struct test *test __maybe_unused, int subtest __mayb TEST_ASSERT_VAL("failed to not remove thread", thread_map__remove(threads, 0)); - for (i = 0; i < threads->nr; i++) - zfree(&threads->map[i].comm); - - free(threads); + perf_thread_map__put(threads); return 0; } diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 953f4afacd3b..5b6ccb90b397 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -298,10 +298,6 @@ static int auxtrace_queues__queue_buffer(struct auxtrace_queues *queues, queue->set = true; queue->tid = buffer->tid; queue->cpu = buffer->cpu; - } else if (buffer->cpu != queue->cpu || buffer->tid != queue->tid) { - pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n", - queue->cpu, queue->tid, buffer->cpu, buffer->tid); - return -EINVAL; } buffer->buffer_nr = queues->next_buffer_nr++; diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 57d58c81a5f8..cdecda1ddd36 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -196,25 +196,32 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, } if (info_linear->info_len < offsetof(struct bpf_prog_info, prog_tags)) { + free(info_linear); pr_debug("%s: the kernel is too old, aborting\n", __func__); return -2; } info = &info_linear->info; + if (!info->jited_ksyms) { + free(info_linear); + return -1; + } /* number of ksyms, func_lengths, and tags should match */ sub_prog_cnt = info->nr_jited_ksyms; if (sub_prog_cnt != info->nr_prog_tags || - sub_prog_cnt != info->nr_jited_func_lens) + sub_prog_cnt != info->nr_jited_func_lens) { + free(info_linear); return -1; + } /* check BTF func info support */ if (info->btf_id && info->nr_func_info && info->func_info_rec_size) { /* btf func info number should be same as sub_prog_cnt */ if (sub_prog_cnt != info->nr_func_info) { pr_debug("%s: mismatch in BPF sub program count and BTF function info count, aborting\n", __func__); - err = -1; - goto out; + free(info_linear); + return -1; } if (btf__get_from_id(info->btf_id, &btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 5121b4db66fe..882cd1f721d9 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1306,6 +1306,7 @@ void evlist__close(struct evlist *evlist) perf_evsel__free_fd(&evsel->core); perf_evsel__free_id(&evsel->core); } + perf_evlist__reset_id_hash(&evlist->core); } static int evlist__create_syswide_maps(struct evlist *evlist) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1bf76864c4f2..7ecbc8e2fbfa 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -46,6 +46,7 @@ #include "string2.h" #include "memswap.h" #include "util.h" +#include "hashmap.h" #include "../perf-sys.h" #include "util/parse-branch-options.h" #include <internal/xyarray.h> @@ -1390,7 +1391,9 @@ void evsel__exit(struct evsel *evsel) zfree(&evsel->group_name); zfree(&evsel->name); zfree(&evsel->pmu_name); - zfree(&evsel->per_pkg_mask); + evsel__zero_per_pkg(evsel); + hashmap__free(evsel->per_pkg_mask); + evsel->per_pkg_mask = NULL; zfree(&evsel->metric_events); perf_evsel__object.fini(evsel); } @@ -2781,3 +2784,16 @@ int evsel__store_ids(struct evsel *evsel, struct evlist *evlist) return store_evsel_ids(evsel, evlist); } + +void evsel__zero_per_pkg(struct evsel *evsel) +{ + struct hashmap_entry *cur; + size_t bkt; + + if (evsel->per_pkg_mask) { + hashmap__for_each_entry(evsel->per_pkg_mask, cur, bkt) + free((char *)cur->key); + + hashmap__clear(evsel->per_pkg_mask); + } +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 4e8e49fb7e9d..6026487353dd 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -19,6 +19,7 @@ struct perf_stat_evsel; union perf_event; struct bpf_counter_ops; struct target; +struct hashmap; typedef int (evsel__sb_cb_t)(union perf_event *event, void *data); @@ -112,7 +113,7 @@ struct evsel { bool merged_stat; bool reset_group; bool errored; - unsigned long *per_pkg_mask; + struct hashmap *per_pkg_mask; struct evsel *leader; struct list_head config_terms; int err; @@ -433,4 +434,5 @@ struct perf_env *evsel__env(struct evsel *evsel); int evsel__store_ids(struct evsel *evsel, struct evlist *evlist); +void evsel__zero_per_pkg(struct evsel *evsel); #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4fe9e2a54346..20effdff76ce 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1618,8 +1618,8 @@ static void print_clock_data(struct feat_fd *ff, FILE *fp) fprintf(fp, "# clockid: %s (%u)\n", clockid_name(clockid), clockid); fprintf(fp, "# reference time: %s = %ld.%06d (TOD) = %ld.%09ld (%s)\n", - tstr, tod_ns.tv_sec, (int) tod_ns.tv_usec, - clockid_ns.tv_sec, clockid_ns.tv_nsec, + tstr, (long) tod_ns.tv_sec, (int) tod_ns.tv_usec, + (long) clockid_ns.tv_sec, clockid_ns.tv_nsec, clockid_name(clockid)); } diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 692e56dc832e..fbc40a2c17d4 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -77,8 +77,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) if (strstarts(filename, "/system/lib/")) { char *ndk, *app; const char *arch; - size_t ndk_length; - size_t app_length; + int ndk_length, app_length; ndk = getenv("NDK_ROOT"); app = getenv("APP_PLATFORM"); @@ -106,8 +105,8 @@ static inline bool replace_android_lib(const char *filename, char *newfilename) if (new_length > PATH_MAX) return false; snprintf(newfilename, new_length, - "%s/platforms/%s/arch-%s/usr/lib/%s", - ndk, app, arch, libname); + "%.*s/platforms/%.*s/arch-%s/usr/lib/%s", + ndk_length, ndk, app_length, app, arch, libname); return true; } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 42c84adeb2fb..c0c0fab22cb8 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -356,6 +356,9 @@ __add_event(struct list_head *list, int *idx, struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) : cpu_list ? perf_cpu_map__new(cpu_list) : NULL; + if (pmu && attr->type == PERF_TYPE_RAW) + perf_pmu__warn_invalid_config(pmu, attr->config, name); + if (init_attr) event_attr_init(attr); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index d5b6aff82f21..d57ac86ce7ca 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -89,6 +89,7 @@ static void inc_group_count(struct list_head *list, %type <str> PE_EVENT_NAME %type <str> PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %type <str> PE_DRV_CFG_TERM +%type <str> event_pmu_name %destructor { free ($$); } <str> %type <term> event_term %destructor { parse_events_term__delete ($$); } <term> @@ -272,8 +273,11 @@ event_def: event_pmu | event_legacy_raw sep_dc | event_bpf_file +event_pmu_name: +PE_NAME | PE_PMU_EVENT_PRE + event_pmu: -PE_NAME opt_pmu_config +event_pmu_name opt_pmu_config { struct parse_events_state *parse_state = _parse_state; struct parse_events_error *error = parse_state->error; diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 44ef28302fc7..46fd0f998484 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1812,3 +1812,36 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) return nr_caps; } + +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name) +{ + struct perf_pmu_format *format; + __u64 masks = 0, bits; + char buf[100]; + unsigned int i; + + list_for_each_entry(format, &pmu->format, list) { + if (format->value != PERF_PMU_FORMAT_VALUE_CONFIG) + continue; + + for_each_set_bit(i, format->bits, PERF_PMU_FORMAT_BITS) + masks |= 1ULL << i; + } + + /* + * Kernel doesn't export any valid format bits. + */ + if (masks == 0) + return; + + bits = config & ~masks; + if (bits == 0) + return; + + bitmap_scnprintf((unsigned long *)&bits, sizeof(bits) * 8, buf, sizeof(buf)); + + pr_warning("WARNING: event '%s' not valid (bits %s of config " + "'%llx' not supported by kernel)!\n", + name ?: "N/A", buf, config); +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 8164388478c6..160b0f561771 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -123,4 +123,7 @@ int perf_pmu__convert_scale(const char *scale, char **end, double *sval); int perf_pmu__caps_parse(struct perf_pmu *pmu); +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name); + #endif /* __PMU_H */ diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 71b753523fac..845dd46e3c61 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -36,3 +36,4 @@ util/symbol_fprintf.c util/units.c util/affinity.c util/rwsem.c +util/hashmap.c diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 0d5ad42812b9..552b590485bf 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -3140,7 +3140,7 @@ int output_field_add(struct perf_hpp_list *list, char *tok) if (strncasecmp(tok, sd->name, strlen(tok))) continue; - if (sort__mode != SORT_MODE__MEMORY) + if (sort__mode != SORT_MODE__BRANCH) return -EINVAL; return __sort_dimension__add_output(list, sd); @@ -3152,7 +3152,7 @@ int output_field_add(struct perf_hpp_list *list, char *tok) if (strncasecmp(tok, sd->name, strlen(tok))) continue; - if (sort__mode != SORT_MODE__BRANCH) + if (sort__mode != SORT_MODE__MEMORY) return -EINVAL; return __sort_dimension__add_output(list, sd); diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index cce7a76d6473..7f09cdaf5b60 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -983,7 +983,7 @@ static void print_interval(struct perf_stat_config *config, if (config->interval_clear) puts(CONSOLE_CLEAR); - sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, config->csv_sep); + sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep); if ((num_print_interval == 0 && !config->csv_output) || config->interval_clear) { switch (config->aggr_mode) { diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 5d8af29447f4..c400f8dde017 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -13,6 +13,7 @@ #include "evlist.h" #include "evsel.h" #include "thread_map.h" +#include "hashmap.h" #include <linux/zalloc.h> void update_stats(struct stats *stats, u64 val) @@ -277,18 +278,29 @@ void evlist__save_aggr_prev_raw_counts(struct evlist *evlist) } } -static void zero_per_pkg(struct evsel *counter) +static size_t pkg_id_hash(const void *__key, void *ctx __maybe_unused) { - if (counter->per_pkg_mask) - memset(counter->per_pkg_mask, 0, cpu__max_cpu()); + uint64_t *key = (uint64_t *) __key; + + return *key & 0xffffffff; +} + +static bool pkg_id_equal(const void *__key1, const void *__key2, + void *ctx __maybe_unused) +{ + uint64_t *key1 = (uint64_t *) __key1; + uint64_t *key2 = (uint64_t *) __key2; + + return *key1 == *key2; } static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals, int cpu, bool *skip) { - unsigned long *mask = counter->per_pkg_mask; + struct hashmap *mask = counter->per_pkg_mask; struct perf_cpu_map *cpus = evsel__cpus(counter); - int s; + int s, d, ret = 0; + uint64_t *key; *skip = false; @@ -299,7 +311,7 @@ static int check_per_pkg(struct evsel *counter, return 0; if (!mask) { - mask = zalloc(cpu__max_cpu()); + mask = hashmap__new(pkg_id_hash, pkg_id_equal, NULL); if (!mask) return -ENOMEM; @@ -321,8 +333,25 @@ static int check_per_pkg(struct evsel *counter, if (s < 0) return -1; - *skip = test_and_set_bit(s, mask) == 1; - return 0; + /* + * On multi-die system, die_id > 0. On no-die system, die_id = 0. + * We use hashmap(socket, die) to check the used socket+die pair. + */ + d = cpu_map__get_die(cpus, cpu, NULL).die; + if (d < 0) + return -1; + + key = malloc(sizeof(*key)); + if (!key) + return -ENOMEM; + + *key = (uint64_t)d << 32 | s; + if (hashmap__find(mask, (void *)key, NULL)) + *skip = true; + else + ret = hashmap__add(mask, (void *)key, (void *)1); + + return ret; } static int @@ -422,7 +451,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, } if (counter->per_pkg) - zero_per_pkg(counter); + evsel__zero_per_pkg(counter); ret = process_counter_maps(config, counter); if (ret) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b698046ec2db..dff178103ce5 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -424,7 +424,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, while (!io.eof) { static const char anonstr[] = "//anon"; - size_t size; + size_t size, aligned_size; /* ensure null termination since stack will be reused. */ event->mmap2.filename[0] = '\0'; @@ -484,11 +484,12 @@ out: } size = strlen(event->mmap2.filename) + 1; - size = PERF_ALIGN(size, sizeof(u64)); + aligned_size = PERF_ALIGN(size, sizeof(u64)); event->mmap2.len -= event->mmap.start; event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); + (sizeof(event->mmap2.filename) - aligned_size)); + memset(event->mmap2.filename + size, 0, machine->id_hdr_size + + (aligned_size - size)); event->mmap2.header.size += machine->id_hdr_size; event->mmap2.pid = tgid; event->mmap2.tid = pid; @@ -758,7 +759,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, for (i = 0; i < n; i++) { char *end; pid_t _pid; - bool kernel_thread; + bool kernel_thread = false; _pid = strtol(dirent[i]->d_name, &end, 10); if (*end) diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index f507dff713c9..8a01af783310 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -361,6 +361,7 @@ static int read_saved_cmdline(struct tep_handle *pevent) pr_debug("error reading saved cmdlines\n"); goto out; } + buf[ret] = '\0'; parse_saved_cmdline(pevent, buf, size); ret = 0; diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index 3cc91ad048ea..43beb169631d 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -133,6 +133,8 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s if (dso != NULL) { __dsos__add(&machine->dsos, dso); dso__set_long_name(dso, long_name, false); + /* Put dso here because __dsos_add already got it */ + dso__put(dso); } return dso; diff --git a/tools/testing/kunit/configs/broken_on_uml.config b/tools/testing/kunit/configs/broken_on_uml.config index a7f0603d33f6..690870043ac0 100644 --- a/tools/testing/kunit/configs/broken_on_uml.config +++ b/tools/testing/kunit/configs/broken_on_uml.config @@ -40,3 +40,5 @@ # CONFIG_RESET_BRCMSTB_RESCAL is not set # CONFIG_RESET_INTEL_GW is not set # CONFIG_ADI_AXI_ADC is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_POISONING is not set diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py index 0b550cbd667d..1e2683dcc0e7 100644 --- a/tools/testing/kunit/kunit_config.py +++ b/tools/testing/kunit/kunit_config.py @@ -13,7 +13,7 @@ from typing import List, Set CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$' CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+|".*")$' -KconfigEntryBase = collections.namedtuple('KconfigEntry', ['name', 'value']) +KconfigEntryBase = collections.namedtuple('KconfigEntryBase', ['name', 'value']) class KconfigEntry(KconfigEntryBase): diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index 3b796dd5e577..ca24f6839d50 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c @@ -296,21 +296,34 @@ static void *idr_throbber(void *arg) return NULL; } +/* + * There are always either 1 or 2 objects in the IDR. If we find nothing, + * or we find something at an ID we didn't expect, that's a bug. + */ void idr_find_test_1(int anchor_id, int throbber_id) { pthread_t throbber; time_t start = time(NULL); - pthread_create(&throbber, NULL, idr_throbber, &throbber_id); - BUG_ON(idr_alloc(&find_idr, xa_mk_value(anchor_id), anchor_id, anchor_id + 1, GFP_KERNEL) != anchor_id); + pthread_create(&throbber, NULL, idr_throbber, &throbber_id); + + rcu_read_lock(); do { int id = 0; void *entry = idr_get_next(&find_idr, &id); - BUG_ON(entry != xa_mk_value(id)); + rcu_read_unlock(); + if ((id != anchor_id && id != throbber_id) || + entry != xa_mk_value(id)) { + printf("%s(%d, %d): %p at %d\n", __func__, anchor_id, + throbber_id, entry, id); + abort(); + } + rcu_read_lock(); } while (time(NULL) < start + 11); + rcu_read_unlock(); pthread_join(throbber, NULL); @@ -577,6 +590,7 @@ void ida_tests(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); idr_checks(); ida_tests(); @@ -584,5 +598,6 @@ int __weak main(void) rcu_barrier(); if (nr_allocated) printf("nr_allocated = %d\n", nr_allocated); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/radix-tree/linux/compiler_types.h b/tools/testing/radix-tree/linux/compiler_types.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/tools/testing/radix-tree/linux/compiler_types.h +++ /dev/null diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 9eae0fb5a67d..e00520cc6349 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -224,7 +224,9 @@ void multiorder_checks(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); multiorder_checks(); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index e61e43efe463..f20e12cbbfd4 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -25,11 +25,13 @@ void xarray_tests(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); xarray_tests(); radix_tree_cpu_dead(1); rcu_barrier(); if (nr_allocated) printf("nr_allocated = %d\n", nr_allocated); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 41f0a0adbb80..6c575cf34a71 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -33,6 +33,7 @@ TARGETS += memfd TARGETS += memory-hotplug TARGETS += mincore TARGETS += mount +TARGETS += mount_setattr TARGETS += mqueue TARGETS += nci TARGETS += net diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b2282be6f938..612d3899614a 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -332,5 +332,5 @@ int main(void) ksft_print_cnts(); - return 0; + return ret; } diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index 9210691aa998..e3e08d9c7020 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -284,16 +284,28 @@ endfunction // Set up test pattern in the FFR // x0: pid // x2: generation +// +// We need to generate a canonical FFR value, which consists of a number of +// low "1" bits, followed by a number of zeros. This gives us 17 unique values +// per 16 bits of FFR, so we create a 4 bit signature out of the PID and +// generation, and use that as the initial number of ones in the pattern. +// We fill the upper lanes of FFR with zeros. // Beware: corrupts P0. function setup_ffr mov x4, x30 - bl pattern + and w0, w0, #0x3 + bfi w0, w2, #2, #2 + mov w1, #1 + lsl w1, w1, w0 + sub w1, w1, #1 + ldr x0, =ffrref - ldr x1, =scratch - rdvl x2, #1 - lsr x2, x2, #3 - bl memcpy + strh w1, [x0], 2 + rdvl x1, #1 + lsr x1, x1, #3 + sub x1, x1, #2 + bl memclr mov x0, #0 ldr x1, =ffrref diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c index 36af1c138faf..b62a39315336 100644 --- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -128,6 +128,8 @@ static void test_check_mtu_xdp(__u32 mtu, __u32 ifindex) test_check_mtu_run_xdp(skel, skel->progs.xdp_use_helper, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_exceed_mtu, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_minus_delta, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); @@ -187,6 +189,8 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu_da, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c new file mode 100644 index 000000000000..6c4d42a2386f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include <sched.h> +#include <test_progs.h> +#include <time.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include "fexit_sleep.skel.h" + +static int do_sleep(void *skel) +{ + struct fexit_sleep *fexit_skel = skel; + struct timespec ts1 = { .tv_nsec = 1 }; + struct timespec ts2 = { .tv_sec = 10 }; + + fexit_skel->bss->pid = getpid(); + (void)syscall(__NR_nanosleep, &ts1, NULL); + (void)syscall(__NR_nanosleep, &ts2, NULL); + return 0; +} + +#define STACK_SIZE (1024 * 1024) +static char child_stack[STACK_SIZE]; + +void test_fexit_sleep(void) +{ + struct fexit_sleep *fexit_skel = NULL; + int wstatus, duration = 0; + pid_t cpid; + int err, fexit_cnt; + + fexit_skel = fexit_sleep__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; + + err = fexit_sleep__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) + goto cleanup; + + cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); + if (CHECK(cpid == -1, "clone", strerror(errno))) + goto cleanup; + + /* wait until first sys_nanosleep ends and second sys_nanosleep starts */ + while (READ_ONCE(fexit_skel->bss->fentry_cnt) != 2); + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + + /* close progs and detach them. That will trigger two nop5->jmp5 rewrites + * in the trampolines to skip nanosleep_fexit prog. + * The nanosleep_fentry prog will get detached first. + * The nanosleep_fexit prog will get detached second. + * Detaching will trigger freeing of both progs JITed images. + * There will be two dying bpf_tramp_image-s, but only the initial + * bpf_tramp_image (with both _fentry and _fexit progs will be stuck + * waiting for percpu_ref_kill to confirm). The other one + * will be freed quickly. + */ + close(bpf_program__fd(fexit_skel->progs.nanosleep_fentry)); + close(bpf_program__fd(fexit_skel->progs.nanosleep_fexit)); + fexit_sleep__detach(fexit_skel); + + /* kill the thread to unwind sys_nanosleep stack through the trampoline */ + kill(cpid, 9); + + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + goto cleanup; + if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed")) + goto cleanup; + + /* The bypassed nanosleep_fexit prog shouldn't have executed. + * Unlike progs the maps were not freed and directly accessible. + */ + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + +cleanup: + fexit_sleep__destroy(fexit_skel); +} diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index 31975c96e2c9..3ac0c9afc35a 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -174,6 +174,12 @@ struct struct_in_struct { }; }; +struct struct_in_array {}; + +struct struct_in_array_typed {}; + +typedef struct struct_in_array_typed struct_in_array_t[2]; + struct struct_with_embedded_stuff { int a; struct { @@ -203,6 +209,8 @@ struct struct_with_embedded_stuff { } r[5]; struct struct_in_struct s[10]; int t[11]; + struct struct_in_array (*u)[2]; + struct_in_array_t *v; }; struct root_struct { diff --git a/tools/testing/selftests/bpf/progs/fexit_sleep.c b/tools/testing/selftests/bpf/progs/fexit_sleep.c new file mode 100644 index 000000000000..03a672d76353 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fexit_sleep.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char LICENSE[] SEC("license") = "GPL"; + +int pid = 0; +int fentry_cnt = 0; +int fexit_cnt = 0; + +SEC("fentry/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fentry, const struct pt_regs *regs) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fentry_cnt++; + return 0; +} + +SEC("fexit/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fexit, const struct pt_regs *regs, int ret) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fexit_cnt++; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/netif_receive_skb.c b/tools/testing/selftests/bpf/progs/netif_receive_skb.c index 6b670039ea67..1d8918dfbd3f 100644 --- a/tools/testing/selftests/bpf/progs/netif_receive_skb.c +++ b/tools/testing/selftests/bpf/progs/netif_receive_skb.c @@ -16,6 +16,13 @@ bool skip = false; #define STRSIZE 2048 #define EXPECTED_STRSIZE 256 +#if defined(bpf_target_s390) +/* NULL points to a readable struct lowcore on s390, so take the last page */ +#define BADPTR ((void *)0xFFFFFFFFFFFFF000ULL) +#else +#define BADPTR 0 +#endif + #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif @@ -113,11 +120,11 @@ int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb) } /* Check invalid ptr value */ - p.ptr = 0; + p.ptr = BADPTR; __ret = bpf_snprintf_btf(str, STRSIZE, &p, sizeof(p), 0); if (__ret >= 0) { - bpf_printk("printing NULL should generate error, got (%d)", - __ret); + bpf_printk("printing %llx should generate error, got (%d)", + (unsigned long long)BADPTR, __ret); ret = -ERANGE; } diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c b/tools/testing/selftests/bpf/progs/test_check_mtu.c index b7787b43f9db..c4a9bae96e75 100644 --- a/tools/testing/selftests/bpf/progs/test_check_mtu.c +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -105,6 +105,54 @@ int xdp_minus_delta(struct xdp_md *ctx) return retval; } +SEC("xdp") +int xdp_input_len(struct xdp_md *ctx) +{ + int retval = XDP_PASS; /* Expected retval on successful test */ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 data_len = data_end - data; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input len is L3, like MTU and iph->tot_len. + * Remember XDP data_len is L2. + */ + __u32 mtu_len = data_len - ETH_HLEN; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = XDP_ABORTED; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("xdp") +int xdp_input_len_exceed(struct xdp_md *ctx) +{ + int retval = XDP_ABORTED; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = XDP_PASS ; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + SEC("classifier") int tc_use_helper(struct __sk_buff *ctx) { @@ -196,3 +244,47 @@ int tc_minus_delta(struct __sk_buff *ctx) global_bpf_mtu_xdp = mtu_len; return retval; } + +SEC("classifier") +int tc_input_len(struct __sk_buff *ctx) +{ + int retval = BPF_OK; /* Expected retval on successful test */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = BPF_DROP; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("classifier") +int tc_input_len_exceed(struct __sk_buff *ctx) +{ + int retval = BPF_DROP; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = BPF_OK; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func11.c b/tools/testing/selftests/bpf/progs/test_global_func11.c index 28488047c849..ef5277d982d9 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func11.c +++ b/tools/testing/selftests/bpf/progs/test_global_func11.c @@ -15,5 +15,5 @@ __noinline int foo(const struct S *s) SEC("cgroup_skb/ingress") int test_cls(struct __sk_buff *skb) { - return foo(skb); + return foo((const void *)skb); } diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index a621b58ab079..ba6eadfec565 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -446,10 +446,8 @@ int _geneve_get_tunnel(struct __sk_buff *skb) } ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } + if (ret < 0) + gopt.opt_class = 0; bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4, gopt.opt_class); @@ -510,10 +508,8 @@ int _ip6geneve_get_tunnel(struct __sk_buff *skb) } ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } + if (ret < 0) + gopt.opt_class = 0; bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4, gopt.opt_class); diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index bed53b561e04..1b138cd2b187 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -250,12 +250,13 @@ BPF_MOV64_IMM(BPF_REG_5, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .fixup_map_array_ro = { 3 }, .result = ACCEPT, - .retval = -29, + .retval = 65507, }, { "invalid write map access into a read-only array 1", diff --git a/tools/testing/selftests/bpf/verifier/atomic_and.c b/tools/testing/selftests/bpf/verifier/atomic_and.c index 1bdc8e6684f7..fe4bb70eb9c5 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_and.c +++ b/tools/testing/selftests/bpf/verifier/atomic_and.c @@ -75,3 +75,26 @@ }, .result = ACCEPT, }, +{ + "BPF_ATOMIC_AND with fetch - r0 as source reg", + .insns = { + /* val = 0x110; */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0x110), + /* old = atomic_fetch_and(&val, 0x011); */ + BPF_MOV64_IMM(BPF_REG_0, 0x011), + BPF_ATOMIC_OP(BPF_DW, BPF_AND | BPF_FETCH, BPF_REG_10, BPF_REG_0, -8), + /* if (old != 0x110) exit(3); */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0x110, 2), + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* if (val != 0x010) exit(2); */ + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x010, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c index 2efd8bcf57a1..6e52dfc64415 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c +++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c @@ -94,3 +94,28 @@ .result = REJECT, .errstr = "invalid read from stack", }, +{ + "BPF_W cmpxchg should zero top 32 bits", + .insns = { + /* r0 = U64_MAX; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + /* u64 val = r0; */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + /* r0 = (u32)atomic_cmpxchg((u32 *)&val, r0, 1); */ + BPF_MOV32_IMM(BPF_REG_1, 1), + BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_10, BPF_REG_1, -8), + /* r1 = 0x00000000FFFFFFFFull; */ + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), + /* if (r0 != r1) exit(1); */ + BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_1, 2), + BPF_MOV32_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/atomic_or.c b/tools/testing/selftests/bpf/verifier/atomic_or.c index 70f982e1f9f0..9d0716ac5080 100644 --- a/tools/testing/selftests/bpf/verifier/atomic_or.c +++ b/tools/testing/selftests/bpf/verifier/atomic_or.c @@ -75,3 +75,28 @@ }, .result = ACCEPT, }, +{ + "BPF_W atomic_fetch_or should zero top 32 bits", + .insns = { + /* r1 = U64_MAX; */ + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), + /* u64 val = r1; */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + /* r1 = (u32)atomic_fetch_or((u32 *)&val, 2); */ + BPF_MOV32_IMM(BPF_REG_1, 2), + BPF_ATOMIC_OP(BPF_W, BPF_OR | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* r2 = 0x00000000FFFFFFFF; */ + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 1), + /* if (r2 != r1) exit(1); */ + BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_1, 2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + /* exit(0); */ + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/bpf/verifier/bounds_deduction.c b/tools/testing/selftests/bpf/verifier/bounds_deduction.c index 1fd07a4f27ac..c162498a64fc 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_deduction.c +++ b/tools/testing/selftests/bpf/verifier/bounds_deduction.c @@ -6,8 +6,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 2", @@ -20,6 +21,8 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, }, @@ -31,8 +34,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 4", @@ -45,6 +49,8 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -55,8 +61,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 6", @@ -67,8 +74,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 7", @@ -80,8 +88,9 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -94,8 +103,9 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -106,8 +116,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 10", @@ -119,6 +130,6 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, .errstr = "math between ctx pointer and register with unbounded min value is not allowed", + .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index b117bdd3806d..6f610cfddae5 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -75,6 +75,8 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .result = ACCEPT, }, { @@ -91,5 +93,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index b018ad71e0a8..3e32400c4b44 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -497,7 +497,7 @@ .result = ACCEPT, }, { - "unpriv: adding of fp", + "unpriv: adding of fp, reg", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_1, 0), @@ -505,6 +505,19 @@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", + .result_unpriv = REJECT, + .result = ACCEPT, +}, +{ + "unpriv: adding of fp, imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), + BPF_EXIT_INSN(), + }, .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", .result_unpriv = REJECT, .result = ACCEPT, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index ed4e76b24649..feb91266db39 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -169,7 +169,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps or paths", + .errstr_unpriv = "R2 tried to add from different maps, paths, or prohibited types", .retval = 0, }, { @@ -517,6 +517,27 @@ .retval = 0xabcdef12, }, { + "map access: value_ptr += N, value_ptr -= N known scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV32_IMM(BPF_REG_1, 0x12345678), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0x12345678, +}, +{ "map access: unknown scalar += value_ptr, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index 537d65968c48..fb23ce9617ea 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -12,9 +12,12 @@ #include <sys/mman.h> #include <linux/types.h> +#define NSEC_PER_MSEC 1000000L + #define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) #define DMA_MAP_MAX_THREADS 1024 #define DMA_MAP_MAX_SECONDS 300 +#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) #define DMA_MAP_BIDIRECTIONAL 0 #define DMA_MAP_TO_DEVICE 1 @@ -36,7 +39,8 @@ struct map_benchmark { __s32 node; /* which numa node this benchmark will run on */ __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ - __u8 expansion[84]; /* For future use */ + __u32 dma_trans_ns; /* time for DMA transmission in ns */ + __u8 expansion[80]; /* For future use */ }; int main(int argc, char **argv) @@ -46,12 +50,12 @@ int main(int argc, char **argv) /* default single thread, run 20 seconds on NUMA_NO_NODE */ int threads = 1, seconds = 20, node = -1; /* default dma mask 32bit, bidirectional DMA */ - int bits = 32, dir = DMA_MAP_BIDIRECTIONAL; + int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; int cmd = DMA_MAP_BENCHMARK; char *p; - while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) { + while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) { switch (opt) { case 't': threads = atoi(optarg); @@ -68,6 +72,9 @@ int main(int argc, char **argv) case 'd': dir = atoi(optarg); break; + case 'x': + xdelay = atoi(optarg); + break; default: return -1; } @@ -85,6 +92,12 @@ int main(int argc, char **argv) exit(1); } + if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) { + fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n", + DMA_MAP_MAX_TRANS_DELAY); + exit(1); + } + /* suppose the mininum DMA zone is 1MB in the world */ if (bits < 20 || bits > 64) { fprintf(stderr, "invalid dma mask bit, must be in 20-64\n"); @@ -109,6 +122,8 @@ int main(int argc, char **argv) map.node = node; map.dma_bits = bits; map.dma_dir = dir; + map.dma_trans_ns = xdelay; + if (ioctl(fd, cmd, &map)) { perror("ioctl"); exit(1); diff --git a/tools/testing/selftests/gpio/.gitignore b/tools/testing/selftests/gpio/.gitignore index 4c69408f3e84..a4969f7ee020 100644 --- a/tools/testing/selftests/gpio/.gitignore +++ b/tools/testing/selftests/gpio/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -gpio-mockup-chardev +gpio-mockup-cdev diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 3a84394829ea..7bd7e776c266 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,10 +8,13 @@ /x86_64/debug_regs /x86_64/evmcs_test /x86_64/get_cpuid_test +/x86_64/get_msr_index_features /x86_64/kvm_pv_test +/x86_64/hyperv_clock /x86_64/hyperv_cpuid /x86_64/mmio_warning_test /x86_64/platform_info_test +/x86_64/set_boot_cpu_id /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test @@ -33,6 +36,7 @@ /demand_paging_test /dirty_log_test /dirty_log_perf_test +/hardware_disable_test /kvm_create_max_vcpus /memslot_modification_stress_test /set_memory_region_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 8c8eda429576..67eebb53235f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -39,12 +39,15 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test @@ -67,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test +TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test TEST_GEN_PROGS_x86_64 += set_memory_region_test diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c new file mode 100644 index 000000000000..5aadf84c91c0 --- /dev/null +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This test is intended to reproduce a crash that happens when + * kvm_arch_hardware_disable is called and it attempts to unregister the user + * return notifiers. + */ + +#define _GNU_SOURCE + +#include <fcntl.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdint.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/wait.h> + +#include <test_util.h> + +#include "kvm_util.h" + +#define VCPU_NUM 4 +#define SLEEPING_THREAD_NUM (1 << 4) +#define FORK_NUM (1ULL << 9) +#define DELAY_US_MAX 2000 +#define GUEST_CODE_PIO_PORT 4 + +sem_t *sem; + +/* Arguments for the pthreads */ +struct payload { + struct kvm_vm *vm; + uint32_t index; +}; + +static void guest_code(void) +{ + for (;;) + ; /* Some busy work */ + printf("Should not be reached.\n"); +} + +static void *run_vcpu(void *arg) +{ + struct payload *payload = (struct payload *)arg; + struct kvm_run *state = vcpu_state(payload->vm, payload->index); + + vcpu_run(payload->vm, payload->index); + + TEST_ASSERT(false, "%s: exited with reason %d: %s\n", + __func__, state->exit_reason, + exit_reason_str(state->exit_reason)); + pthread_exit(NULL); +} + +static void *sleeping_thread(void *arg) +{ + int fd; + + while (true) { + fd = open("/dev/null", O_RDWR); + close(fd); + } + TEST_ASSERT(false, "%s: exited\n", __func__); + pthread_exit(NULL); +} + +static inline void check_create_thread(pthread_t *thread, pthread_attr_t *attr, + void *(*f)(void *), void *arg) +{ + int r; + + r = pthread_create(thread, attr, f, arg); + TEST_ASSERT(r == 0, "%s: failed to create thread", __func__); +} + +static inline void check_set_affinity(pthread_t thread, cpu_set_t *cpu_set) +{ + int r; + + r = pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpu_set); + TEST_ASSERT(r == 0, "%s: failed set affinity", __func__); +} + +static inline void check_join(pthread_t thread, void **retval) +{ + int r; + + r = pthread_join(thread, retval); + TEST_ASSERT(r == 0, "%s: failed to join thread", __func__); +} + +static void run_test(uint32_t run) +{ + struct kvm_vm *vm; + cpu_set_t cpu_set; + pthread_t threads[VCPU_NUM]; + pthread_t throw_away; + struct payload payloads[VCPU_NUM]; + void *b; + uint32_t i, j; + + CPU_ZERO(&cpu_set); + for (i = 0; i < VCPU_NUM; i++) + CPU_SET(i, &cpu_set); + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_create_irqchip(vm); + + pr_debug("%s: [%d] start vcpus\n", __func__, run); + for (i = 0; i < VCPU_NUM; ++i) { + vm_vcpu_add_default(vm, i, guest_code); + payloads[i].vm = vm; + payloads[i].index = i; + + check_create_thread(&threads[i], NULL, run_vcpu, + (void *)&payloads[i]); + check_set_affinity(threads[i], &cpu_set); + + for (j = 0; j < SLEEPING_THREAD_NUM; ++j) { + check_create_thread(&throw_away, NULL, sleeping_thread, + (void *)NULL); + check_set_affinity(throw_away, &cpu_set); + } + } + pr_debug("%s: [%d] all threads launched\n", __func__, run); + sem_post(sem); + for (i = 0; i < VCPU_NUM; ++i) + check_join(threads[i], &b); + /* Should not be reached */ + TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); +} + +int main(int argc, char **argv) +{ + uint32_t i; + int s, r; + pid_t pid; + + sem = sem_open("vm_sem", O_CREAT | O_EXCL, 0644, 0); + sem_unlink("vm_sem"); + + for (i = 0; i < FORK_NUM; ++i) { + pid = fork(); + TEST_ASSERT(pid >= 0, "%s: unable to fork", __func__); + if (pid == 0) + run_test(i); /* This function always exits */ + + pr_debug("%s: [%d] waiting semaphore\n", __func__, i); + sem_wait(sem); + r = (rand() % DELAY_US_MAX) + 1; + pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); + usleep(r); + r = waitpid(pid, &s, WNOHANG); + TEST_ASSERT(r != pid, + "%s: [%d] child exited unexpectedly status: [%d]", + __func__, i, s); + pr_debug("%s: [%d] killing child\n", __func__, i); + kill(pid, SIGKILL); + } + + sem_destroy(sem); + exit(0); +} diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 2d7eb6989e83..0f4258eaa629 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -16,6 +16,7 @@ #include "sparsebit.h" +#define KVM_DEV_PATH "/dev/kvm" #define KVM_MAX_VCPUS 512 /* @@ -133,6 +134,7 @@ void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void *arg); void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); +int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); int _kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index d787cb802b4a..b8849a1aca79 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -21,6 +21,8 @@ #define KVM_UTIL_PGS_PER_HUGEPG 512 #define KVM_UTIL_MIN_PFN 2 +static int vcpu_mmap_sz(void); + /* Aligns x up to the next multiple of size. Size must be a power of 2. */ static void *align(void *x, size_t size) { @@ -509,7 +511,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct vcpu *vcpu) vcpu->dirty_gfns = NULL; } - ret = munmap(vcpu->state, sizeof(*vcpu->state)); + ret = munmap(vcpu->state, vcpu_mmap_sz()); TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " "errno: %i", ret, errno); close(vcpu->fd); @@ -978,7 +980,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size " "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", vcpu_mmap_sz(), sizeof(*vcpu->state)); - vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state), + vcpu->state = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, " "vcpu id: %u errno: %i", vcpuid, errno); @@ -1695,11 +1697,16 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) { int ret; - ret = ioctl(vm->fd, cmd, arg); + ret = _vm_ioctl(vm, cmd, arg); TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", cmd, ret, errno, strerror(errno)); } +int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +{ + return ioctl(vm->fd, cmd, arg); +} + /* * KVM system ioctl * diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 34465dc562d8..91ce1b5d480b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -10,8 +10,6 @@ #include "sparsebit.h" -#define KVM_DEV_PATH "/dev/kvm" - struct userspace_mem_region { struct kvm_userspace_memory_region region; struct sparsebit *unused_phy_pages; diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index de0c76177d02..a8906e60a108 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -720,7 +720,8 @@ struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid) { struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct kvm_cpuid2 *cpuid; - int rc, max_ent; + int max_ent; + int rc = -1; TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c new file mode 100644 index 000000000000..cb953df4d7d0 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_GET_MSR_INDEX_LIST and + * KVM_GET_MSR_FEATURE_INDEX_LIST work as intended + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +static int kvm_num_index_msrs(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + TEST_ASSERT(r == -1 && errno == E2BIG, + "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", + r); + + r = list->nmsrs; + free(list); + return r; +} + +static void test_get_msr_index(void) +{ + int old_res, res, kvm_fd, r; + struct kvm_msr_list *list; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + old_res = kvm_num_index_msrs(kvm_fd, 0); + TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); + + if (old_res != 1) { + res = kvm_num_index_msrs(kvm_fd, 1); + TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); + TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); + } + + list = malloc(sizeof(*list) + old_res * sizeof(list->indices[0])); + list->nmsrs = old_res; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + + TEST_ASSERT(r == 0, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i", + r); + TEST_ASSERT(list->nmsrs == old_res, "Expecting nmsrs to be identical"); + free(list); + + close(kvm_fd); +} + +static int kvm_num_feature_msrs(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + TEST_ASSERT(r == -1 && errno == E2BIG, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST probe, r: %i", + r); + + r = list->nmsrs; + free(list); + return r; +} + +struct kvm_msr_list *kvm_get_msr_feature_list(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + + TEST_ASSERT(r == 0, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i", + r); + + return list; +} + +static void test_get_msr_feature(void) +{ + int res, old_res, i, kvm_fd; + struct kvm_msr_list *feature_list; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + old_res = kvm_num_feature_msrs(kvm_fd, 0); + TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); + + if (old_res != 1) { + res = kvm_num_feature_msrs(kvm_fd, 1); + TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); + TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); + } + + feature_list = kvm_get_msr_feature_list(kvm_fd, old_res); + TEST_ASSERT(old_res == feature_list->nmsrs, + "Unmatching number of msr indexes"); + + for (i = 0; i < feature_list->nmsrs; i++) + kvm_get_feature_msr(feature_list->indices[i]); + + free(feature_list); + close(kvm_fd); +} + +int main(int argc, char *argv[]) +{ + if (kvm_check_cap(KVM_CAP_GET_MSR_FEATURES)) + test_get_msr_feature(); + + test_get_msr_index(); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c new file mode 100644 index 000000000000..7f1d2765572c --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. + * + * Tests for Hyper-V clocksources + */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; +} __packed; + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 + +/* Simplified mul_u64_u64_shr() */ +static inline u64 mul_u64_u64_shr64(u64 a, u64 b) +{ + union { + u64 ll; + struct { + u32 low, high; + } l; + } rm, rn, rh, a0, b0; + u64 c; + + a0.ll = a; + b0.ll = b; + + rm.ll = (u64)a0.l.low * b0.l.high; + rn.ll = (u64)a0.l.high * b0.l.low; + rh.ll = (u64)a0.l.high * b0.l.high; + + rh.l.low = c = rm.l.high + rn.l.high + rh.l.low; + rh.l.high = (c >> 32) + rh.l.high; + + return rh.ll; +} + +static inline void nop_loop(void) +{ + int i; + + for (i = 0; i < 1000000; i++) + asm volatile("nop"); +} + +static inline void check_tsc_msr_rdtsc(void) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY); + GUEST_ASSERT(tsc_freq > 0); + + /* First, check MSR-based clocksource */ + r1 = rdtsc(); + t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + nop_loop(); + r2 = rdtsc(); + t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + + GUEST_ASSERT(r2 > r1 && t2 > t1); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100); +} + +static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page) +{ + return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset; +} + +static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page) +{ + u64 r1, r2, t1, t2; + + /* Compare TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */ + t1 = get_tscpage_ts(tsc_page); + r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + + /* 10 ms tolerance */ + GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000); + nop_loop(); + + t2 = get_tscpage_ts(tsc_page); + r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + GUEST_ASSERT(r2 >= t1 && r2 - t2 < 100000); +} + +static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa) +{ + u64 tsc_scale, tsc_offset; + + /* Set Guest OS id to enable Hyper-V emulation */ + GUEST_SYNC(1); + wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48); + GUEST_SYNC(2); + + check_tsc_msr_rdtsc(); + + GUEST_SYNC(3); + + /* Set up TSC page is disabled state, check that it's clean */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + + GUEST_SYNC(4); + + /* Set up TSC page is enabled state */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1); + GUEST_ASSERT(tsc_page->tsc_sequence != 0); + + GUEST_SYNC(5); + + check_tsc_msr_tsc_page(tsc_page); + + GUEST_SYNC(6); + + tsc_offset = tsc_page->tsc_offset; + /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */ + + GUEST_SYNC(7); + /* Sanity check TSC page timestamp, it should be close to 0 */ + GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000); + + GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset); + + nop_loop(); + + /* + * Enable Re-enlightenment and check that TSC page stays constant across + * KVM_SET_CLOCK. + */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1); + tsc_offset = tsc_page->tsc_offset; + tsc_scale = tsc_page->tsc_scale; + GUEST_SYNC(8); + GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset); + GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale); + + GUEST_SYNC(9); + + check_tsc_msr_tsc_page(tsc_page); + + /* + * Disable re-enlightenment and TSC page, check that KVM doesn't update + * it anymore. + */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0); + wrmsr(HV_X64_MSR_REFERENCE_TSC, 0); + memset(tsc_page, 0, sizeof(*tsc_page)); + + GUEST_SYNC(10); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + + GUEST_DONE(); +} + +#define VCPU_ID 0 + +static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY); + TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); + + /* First, check MSR-based clocksource */ + r1 = rdtsc(); + t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + nop_loop(); + r2 = rdtsc(); + t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + + TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100, + "Elapsed time does not match (MSR=%ld, TSC=%ld)", + (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq); +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + vm_vaddr_t tsc_page_gva; + int stage; + + vm = vm_create_default(VCPU_ID, 0, guest_main); + run = vcpu_state(vm, VCPU_ID); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + tsc_page_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + memset(addr_gpa2hva(vm, tsc_page_gva), 0x0, getpagesize()); + TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, + "TSC page has to be page aligned\n"); + vcpu_args_set(vm, VCPU_ID, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); + + host_check_tsc_msr_rdtsc(vm); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + /* Keep in sync with guest_main() */ + TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d\n", + stage); + goto out; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, + "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + /* Reset kvmclock triggering TSC page update */ + if (stage == 7 || stage == 8 || stage == 10) { + struct kvm_clock_data clock = {0}; + + vm_ioctl(vm, KVM_SET_CLOCK, &clock); + } + } + +out: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c new file mode 100644 index 000000000000..12c558fc8074 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_SET_BOOT_CPU_ID works as intended + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#define _GNU_SOURCE /* for program_invocation_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define N_VCPU 2 +#define VCPU_ID0 0 +#define VCPU_ID1 1 + +static uint32_t get_bsp_flag(void) +{ + return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; +} + +static void guest_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT(get_bsp_flag() != 0); + + GUEST_DONE(); +} + +static void guest_not_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT(get_bsp_flag() == 0); + + GUEST_DONE(); +} + +static void test_set_boot_busy(struct kvm_vm *vm) +{ + int res; + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID0); + TEST_ASSERT(res == -1 && errno == EBUSY, + "KVM_SET_BOOT_CPU_ID set while running vm"); +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct ucall uc; + int stage; + + for (stage = 0; stage < 2; stage++) { + + vcpu_run(vm, vcpuid); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + test_set_boot_busy(vm); + break; + case UCALL_DONE: + TEST_ASSERT(stage == 1, + "Expected GUEST_DONE in stage 2, got stage %d", + stage); + break; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", + (const char *)uc.args[0], __FILE__, + uc.args[1], uc.args[2], uc.args[3]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } + } +} + +static struct kvm_vm *create_vm(void) +{ + struct kvm_vm *vm; + uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * 2; + uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; + + pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); + vm = vm_create(VM_MODE_DEFAULT, pages, O_RDWR); + + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_create_irqchip(vm); + + return vm; +} + +static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) +{ + if (bsp_code) + vm_vcpu_add_default(vm, vcpuid, guest_bsp_vcpu); + else + vm_vcpu_add_default(vm, vcpuid, guest_not_bsp_vcpu); + + vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); +} + +static void run_vm_bsp(uint32_t bsp_vcpu) +{ + struct kvm_vm *vm; + bool is_bsp_vcpu1 = bsp_vcpu == VCPU_ID1; + + vm = create_vm(); + + if (is_bsp_vcpu1) + vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + + add_x86_vcpu(vm, VCPU_ID0, !is_bsp_vcpu1); + add_x86_vcpu(vm, VCPU_ID1, is_bsp_vcpu1); + + run_vcpu(vm, VCPU_ID0); + run_vcpu(vm, VCPU_ID1); + + kvm_vm_free(vm); +} + +static void check_set_bsp_busy(void) +{ + struct kvm_vm *vm; + int res; + + vm = create_vm(); + + add_x86_vcpu(vm, VCPU_ID0, true); + add_x86_vcpu(vm, VCPU_ID1, false); + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set after adding vcpu"); + + run_vcpu(vm, VCPU_ID0); + run_vcpu(vm, VCPU_ID1); + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set to a terminated vcpu"); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + if (!kvm_check_cap(KVM_CAP_SET_BOOT_CPU_ID)) { + print_skip("set_boot_cpu_id not available"); + return 0; + } + + run_vm_bsp(VCPU_ID0); + run_vm_bsp(VCPU_ID1); + run_vm_bsp(VCPU_ID0); + + check_set_bsp_busy(); +} diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 9246ea310587..804ff5ff022d 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -13,19 +13,27 @@ #include <stdint.h> #include <time.h> +#include <sched.h> +#include <sys/syscall.h> #define VCPU_ID 5 +#define SHINFO_REGION_GVA 0xc0000000ULL #define SHINFO_REGION_GPA 0xc0000000ULL #define SHINFO_REGION_SLOT 10 #define PAGE_SIZE 4096 #define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) +#define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + 0x20) + +#define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + 0x20) static struct kvm_vm *vm; #define XEN_HYPERCALL_MSR 0x40000000 +#define MIN_STEAL_TIME 50000 + struct pvclock_vcpu_time_info { u32 version; u32 pad0; @@ -43,11 +51,67 @@ struct pvclock_wall_clock { u32 nsec; } __attribute__((__packed__)); +struct vcpu_runstate_info { + uint32_t state; + uint64_t state_entry_time; + uint64_t time[4]; +}; + +#define RUNSTATE_running 0 +#define RUNSTATE_runnable 1 +#define RUNSTATE_blocked 2 +#define RUNSTATE_offline 3 + static void guest_code(void) { + struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; + + /* Test having the host set runstates manually */ + GUEST_SYNC(RUNSTATE_runnable); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0); + GUEST_ASSERT(rs->state == 0); + + GUEST_SYNC(RUNSTATE_blocked); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0); + GUEST_ASSERT(rs->state == 0); + + GUEST_SYNC(RUNSTATE_offline); + GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0); + GUEST_ASSERT(rs->state == 0); + + /* Test runstate time adjust */ + GUEST_SYNC(4); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a); + GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b); + + /* Test runstate time set */ + GUEST_SYNC(5); + GUEST_ASSERT(rs->state_entry_time >= 0x8000); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b); + GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a); + + /* sched_yield() should result in some 'runnable' time */ + GUEST_SYNC(6); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME); + GUEST_DONE(); } +static long get_run_delay(void) +{ + char path[64]; + long val[2]; + FILE *fp; + + sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); + fp = fopen(path, "r"); + fscanf(fp, "%ld %ld ", &val[0], &val[1]); + fclose(fp); + + return val[1]; +} + static int cmp_timespec(struct timespec *a, struct timespec *b) { if (a->tv_sec > b->tv_sec) @@ -66,12 +130,14 @@ int main(int argc, char *argv[]) { struct timespec min_ts, max_ts, vm_ts; - if (!(kvm_check_cap(KVM_CAP_XEN_HVM) & - KVM_XEN_HVM_CONFIG_SHARED_INFO) ) { + int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + if (!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO) ) { print_skip("KVM_XEN_HVM_CONFIG_SHARED_INFO not available"); exit(KSFT_SKIP); } + bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); + clock_gettime(CLOCK_REALTIME, &min_ts); vm = vm_create_default(VCPU_ID, 0, (void *) guest_code); @@ -80,6 +146,7 @@ int main(int argc, char *argv[]) /* Map a region for the shared_info page */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0); + virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2, 0); struct kvm_xen_hvm_config hvmc = { .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, @@ -111,6 +178,17 @@ int main(int argc, char *argv[]) }; vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock); + if (do_runstate_tests) { + struct kvm_xen_vcpu_attr st = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, + .u.gpa = RUNSTATE_ADDR, + }; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st); + } + + struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);; + rs->state = 0x5a; + for (;;) { volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; @@ -126,8 +204,56 @@ int main(int argc, char *argv[]) case UCALL_ABORT: TEST_FAIL("%s", (const char *)uc.args[0]); /* NOT REACHED */ - case UCALL_SYNC: + case UCALL_SYNC: { + struct kvm_xen_vcpu_attr rst; + long rundelay; + + /* If no runstate support, bail out early */ + if (!do_runstate_tests) + goto done; + + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); + + switch (uc.args[1]) { + case RUNSTATE_running...RUNSTATE_offline: + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; + rst.u.runstate.state = uc.args[1]; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + case 4: + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST; + memset(&rst.u, 0, sizeof(rst.u)); + rst.u.runstate.state = (uint64_t)-1; + rst.u.runstate.time_blocked = + 0x5a - rs->time[RUNSTATE_blocked]; + rst.u.runstate.time_offline = + 0x6b6b - rs->time[RUNSTATE_offline]; + rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked - + rst.u.runstate.time_offline; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + + case 5: + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA; + memset(&rst.u, 0, sizeof(rst.u)); + rst.u.runstate.state = RUNSTATE_running; + rst.u.runstate.state_entry_time = 0x6b6b + 0x5a; + rst.u.runstate.time_blocked = 0x6b6b; + rst.u.runstate.time_offline = 0x5a; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + case 6: + /* Yield until scheduler delay exceeds target */ + rundelay = get_run_delay() + MIN_STEAL_TIME; + do { + sched_yield(); + } while (get_run_delay() < rundelay); + break; + } break; + } case UCALL_DONE: goto done; default: @@ -162,6 +288,33 @@ int main(int argc, char *argv[]) TEST_ASSERT(ti2->version && !(ti2->version & 1), "Bad time_info version %x", ti->version); + if (do_runstate_tests) { + /* + * Fetch runstate and check sanity. Strictly speaking in the + * general case we might not expect the numbers to be identical + * but in this case we know we aren't running the vCPU any more. + */ + struct kvm_xen_vcpu_attr rst = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA, + }; + vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &rst); + + TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch"); + TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time, + "State entry time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running, + "Running time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, + "Runnable time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, + "Blocked time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, + "Offline time mismatch"); + + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); + } kvm_vm_free(vm); return 0; } diff --git a/tools/testing/selftests/mount_setattr/.gitignore b/tools/testing/selftests/mount_setattr/.gitignore new file mode 100644 index 000000000000..5f74d8488472 --- /dev/null +++ b/tools/testing/selftests/mount_setattr/.gitignore @@ -0,0 +1 @@ +mount_setattr_test diff --git a/tools/testing/selftests/mount_setattr/Makefile b/tools/testing/selftests/mount_setattr/Makefile new file mode 100644 index 000000000000..2250f7dcb81e --- /dev/null +++ b/tools/testing/selftests/mount_setattr/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for mount selftests. +CFLAGS = -g -I../../../../usr/include/ -Wall -O2 -pthread + +TEST_GEN_FILES += mount_setattr_test + +include ../lib.mk diff --git a/tools/testing/selftests/mount_setattr/config b/tools/testing/selftests/mount_setattr/config new file mode 100644 index 000000000000..416bd53ce982 --- /dev/null +++ b/tools/testing/selftests/mount_setattr/config @@ -0,0 +1 @@ +CONFIG_USER_NS=y diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c new file mode 100644 index 000000000000..4e94e566e040 --- /dev/null +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -0,0 +1,1424 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <sched.h> +#include <stdio.h> +#include <errno.h> +#include <pthread.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <sys/wait.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/sysinfo.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <grp.h> +#include <stdbool.h> +#include <stdarg.h> + +#include "../kselftest_harness.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +#ifndef CLONE_NEWUSER +#define CLONE_NEWUSER 0x10000000 +#endif + +#ifndef MS_REC +#define MS_REC 16384 +#endif + +#ifndef MS_RELATIME +#define MS_RELATIME (1 << 21) +#endif + +#ifndef MS_STRICTATIME +#define MS_STRICTATIME (1 << 24) +#endif + +#ifndef MOUNT_ATTR_RDONLY +#define MOUNT_ATTR_RDONLY 0x00000001 +#endif + +#ifndef MOUNT_ATTR_NOSUID +#define MOUNT_ATTR_NOSUID 0x00000002 +#endif + +#ifndef MOUNT_ATTR_NOEXEC +#define MOUNT_ATTR_NOEXEC 0x00000008 +#endif + +#ifndef MOUNT_ATTR_NODIRATIME +#define MOUNT_ATTR_NODIRATIME 0x00000080 +#endif + +#ifndef MOUNT_ATTR__ATIME +#define MOUNT_ATTR__ATIME 0x00000070 +#endif + +#ifndef MOUNT_ATTR_RELATIME +#define MOUNT_ATTR_RELATIME 0x00000000 +#endif + +#ifndef MOUNT_ATTR_NOATIME +#define MOUNT_ATTR_NOATIME 0x00000010 +#endif + +#ifndef MOUNT_ATTR_STRICTATIME +#define MOUNT_ATTR_STRICTATIME 0x00000020 +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 +#endif + +#ifndef MS_SHARED +#define MS_SHARED (1 << 20) +#endif + +#define DEFAULT_THREADS 4 +#define ptr_to_int(p) ((int)((intptr_t)(p))) +#define int_to_ptr(u) ((void *)((intptr_t)(u))) + +#ifndef __NR_mount_setattr + #if defined __alpha__ + #define __NR_mount_setattr 552 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_mount_setattr (442 + 4000) + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_mount_setattr (442 + 6000) + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_mount_setattr (442 + 5000) + #endif + #elif defined __ia64__ + #define __NR_mount_setattr (442 + 1024) + #else + #define __NR_mount_setattr 442 + #endif + +struct mount_attr { + __u64 attr_set; + __u64 attr_clr; + __u64 propagation; + __u64 userns_fd; +}; +#endif + +#ifndef __NR_open_tree + #if defined __alpha__ + #define __NR_open_tree 538 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_open_tree 4428 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_open_tree 6428 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_open_tree 5428 + #endif + #elif defined __ia64__ + #define __NR_open_tree (428 + 1024) + #else + #define __NR_open_tree 428 + #endif +#endif + +#ifndef MOUNT_ATTR_IDMAP +#define MOUNT_ATTR_IDMAP 0x00100000 +#endif + +static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flags, + struct mount_attr *attr, size_t size) +{ + return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); +} + +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +#endif + +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + +static ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = write(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +static int write_file(const char *path, const void *buf, size_t count) +{ + int fd; + ssize_t ret; + + fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW); + if (fd < 0) + return -1; + + ret = write_nointr(fd, buf, count); + close(fd); + if (ret < 0 || (size_t)ret != count) + return -1; + + return 0; +} + +static int create_and_enter_userns(void) +{ + uid_t uid; + gid_t gid; + char map[100]; + + uid = getuid(); + gid = getgid(); + + if (unshare(CLONE_NEWUSER)) + return -1; + + if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) && + errno != ENOENT) + return -1; + + snprintf(map, sizeof(map), "0 %d 1", uid); + if (write_file("/proc/self/uid_map", map, strlen(map))) + return -1; + + + snprintf(map, sizeof(map), "0 %d 1", gid); + if (write_file("/proc/self/gid_map", map, strlen(map))) + return -1; + + if (setgid(0)) + return -1; + + if (setuid(0)) + return -1; + + return 0; +} + +static int prepare_unpriv_mountns(void) +{ + if (create_and_enter_userns()) + return -1; + + if (unshare(CLONE_NEWNS)) + return -1; + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + return -1; + + return 0; +} + +static int read_mnt_flags(const char *path) +{ + int ret; + struct statvfs stat; + unsigned int mnt_flags; + + ret = statvfs(path, &stat); + if (ret != 0) + return -EINVAL; + + if (stat.f_flag & + ~(ST_RDONLY | ST_NOSUID | ST_NODEV | ST_NOEXEC | ST_NOATIME | + ST_NODIRATIME | ST_RELATIME | ST_SYNCHRONOUS | ST_MANDLOCK)) + return -EINVAL; + + mnt_flags = 0; + if (stat.f_flag & ST_RDONLY) + mnt_flags |= MS_RDONLY; + if (stat.f_flag & ST_NOSUID) + mnt_flags |= MS_NOSUID; + if (stat.f_flag & ST_NODEV) + mnt_flags |= MS_NODEV; + if (stat.f_flag & ST_NOEXEC) + mnt_flags |= MS_NOEXEC; + if (stat.f_flag & ST_NOATIME) + mnt_flags |= MS_NOATIME; + if (stat.f_flag & ST_NODIRATIME) + mnt_flags |= MS_NODIRATIME; + if (stat.f_flag & ST_RELATIME) + mnt_flags |= MS_RELATIME; + if (stat.f_flag & ST_SYNCHRONOUS) + mnt_flags |= MS_SYNCHRONOUS; + if (stat.f_flag & ST_MANDLOCK) + mnt_flags |= ST_MANDLOCK; + + return mnt_flags; +} + +static char *get_field(char *src, int nfields) +{ + int i; + char *p = src; + + for (i = 0; i < nfields; i++) { + while (*p && *p != ' ' && *p != '\t') + p++; + + if (!*p) + break; + + p++; + } + + return p; +} + +static void null_endofword(char *word) +{ + while (*word && *word != ' ' && *word != '\t') + word++; + *word = '\0'; +} + +static bool is_shared_mount(const char *path) +{ + size_t len = 0; + char *line = NULL; + FILE *f = NULL; + + f = fopen("/proc/self/mountinfo", "re"); + if (!f) + return false; + + while (getline(&line, &len, f) != -1) { + char *opts, *target; + + target = get_field(line, 4); + if (!target) + continue; + + opts = get_field(target, 2); + if (!opts) + continue; + + null_endofword(target); + + if (strcmp(target, path) != 0) + continue; + + null_endofword(opts); + if (strstr(opts, "shared:")) + return true; + } + + free(line); + fclose(f); + + return false; +} + +static void *mount_setattr_thread(void *data) +{ + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID, + .attr_clr = 0, + .propagation = MS_SHARED, + }; + + if (sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr))) + pthread_exit(int_to_ptr(-1)); + + pthread_exit(int_to_ptr(0)); +} + +/* Attempt to de-conflict with the selftests tree. */ +#ifndef SKIP +#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) +#endif + +static bool mount_setattr_supported(void) +{ + int ret; + + ret = sys_mount_setattr(-EBADF, "", AT_EMPTY_PATH, NULL, 0); + if (ret < 0 && errno == ENOSYS) + return false; + + return true; +} + +FIXTURE(mount_setattr) { +}; + +FIXTURE_SETUP(mount_setattr) +{ + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + ASSERT_EQ(prepare_unpriv_mountns(), 0); + + (void)umount2("/mnt", MNT_DETACH); + (void)umount2("/tmp", MNT_DETACH); + + ASSERT_EQ(mount("testing", "/tmp", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/tmp/B", 0777), 0); + + ASSERT_EQ(mount("testing", "/tmp/B", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/tmp/B/BB", 0777), 0); + + ASSERT_EQ(mount("testing", "/tmp/B/BB", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/mnt/A", 0777), 0); + + ASSERT_EQ(mount("testing", "/mnt/A", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/mnt/A/AA", 0777), 0); + + ASSERT_EQ(mount("/tmp", "/mnt/A/AA", NULL, MS_BIND | MS_REC, NULL), 0); + + ASSERT_EQ(mkdir("/mnt/B", 0777), 0); + + ASSERT_EQ(mount("testing", "/mnt/B", "ramfs", + MS_NOATIME | MS_NODEV | MS_NOSUID, 0), 0); + + ASSERT_EQ(mkdir("/mnt/B/BB", 0777), 0); + + ASSERT_EQ(mount("testing", "/tmp/B/BB", "devpts", + MS_RELATIME | MS_NOEXEC | MS_RDONLY, 0), 0); +} + +FIXTURE_TEARDOWN(mount_setattr) +{ + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + (void)umount2("/mnt/A", MNT_DETACH); + (void)umount2("/tmp", MNT_DETACH); +} + +TEST_F(mount_setattr, invalid_attributes) +{ + struct mount_attr invalid_attr = { + .attr_set = (1U << 31), + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &invalid_attr, + sizeof(invalid_attr)), 0); + + invalid_attr.attr_set = 0; + invalid_attr.attr_clr = (1U << 31); + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &invalid_attr, + sizeof(invalid_attr)), 0); + + invalid_attr.attr_clr = 0; + invalid_attr.propagation = (1U << 31); + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &invalid_attr, + sizeof(invalid_attr)), 0); + + invalid_attr.attr_set = (1U << 31); + invalid_attr.attr_clr = (1U << 31); + invalid_attr.propagation = (1U << 31); + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &invalid_attr, + sizeof(invalid_attr)), 0); + + ASSERT_NE(sys_mount_setattr(-1, "mnt/A", AT_RECURSIVE, &invalid_attr, + sizeof(invalid_attr)), 0); +} + +TEST_F(mount_setattr, extensibility) +{ + unsigned int old_flags = 0, new_flags = 0, expected_flags = 0; + char *s = "dummy"; + struct mount_attr invalid_attr = {}; + struct mount_attr_large { + struct mount_attr attr1; + struct mount_attr attr2; + struct mount_attr attr3; + } large_attr = {}; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + old_flags = read_mnt_flags("/mnt/A"); + ASSERT_GT(old_flags, 0); + + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, NULL, + sizeof(invalid_attr)), 0); + ASSERT_EQ(errno, EFAULT); + + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, (void *)s, + sizeof(invalid_attr)), 0); + ASSERT_EQ(errno, EINVAL); + + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &invalid_attr, 0), 0); + ASSERT_EQ(errno, EINVAL); + + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &invalid_attr, + sizeof(invalid_attr) / 2), 0); + ASSERT_EQ(errno, EINVAL); + + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &invalid_attr, + sizeof(invalid_attr) / 2), 0); + ASSERT_EQ(errno, EINVAL); + + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, + (void *)&large_attr, sizeof(large_attr)), 0); + + large_attr.attr3.attr_set = MOUNT_ATTR_RDONLY; + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, + (void *)&large_attr, sizeof(large_attr)), 0); + + large_attr.attr3.attr_set = 0; + large_attr.attr1.attr_set = MOUNT_ATTR_RDONLY; + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, + (void *)&large_attr, sizeof(large_attr)), 0); + + expected_flags = old_flags; + expected_flags |= MS_RDONLY; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); +} + +TEST_F(mount_setattr, basic) +{ + unsigned int old_flags = 0, new_flags = 0, expected_flags = 0; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME, + .attr_clr = MOUNT_ATTR__ATIME, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + old_flags = read_mnt_flags("/mnt/A"); + ASSERT_GT(old_flags, 0); + + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", 0, &attr, sizeof(attr)), 0); + + expected_flags = old_flags; + expected_flags |= MS_RDONLY; + expected_flags |= MS_NOEXEC; + expected_flags &= ~MS_NOATIME; + expected_flags |= MS_RELATIME; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, old_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, old_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, old_flags); +} + +TEST_F(mount_setattr, basic_recursive) +{ + int fd; + unsigned int old_flags = 0, new_flags = 0, expected_flags = 0; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME, + .attr_clr = MOUNT_ATTR__ATIME, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + old_flags = read_mnt_flags("/mnt/A"); + ASSERT_GT(old_flags, 0); + + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags = old_flags; + expected_flags |= MS_RDONLY; + expected_flags |= MS_NOEXEC; + expected_flags &= ~MS_NOATIME; + expected_flags |= MS_RELATIME; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + memset(&attr, 0, sizeof(attr)); + attr.attr_clr = MOUNT_ATTR_RDONLY; + attr.propagation = MS_SHARED; + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags &= ~MS_RDONLY; + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A"), true); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA"), true); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B"), true); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B/BB"), true); + + fd = open("/mnt/A/AA/B/b", O_RDWR | O_CLOEXEC | O_CREAT | O_EXCL, 0777); + ASSERT_GE(fd, 0); + + /* + * We're holding a fd open for writing so this needs to fail somewhere + * in the middle and the mount options need to be unchanged. + */ + attr.attr_set = MOUNT_ATTR_RDONLY; + ASSERT_LT(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A"), true); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA"), true); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B"), true); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B/BB"), true); + + EXPECT_EQ(close(fd), 0); +} + +TEST_F(mount_setattr, mount_has_writers) +{ + int fd, dfd; + unsigned int old_flags = 0, new_flags = 0; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME, + .attr_clr = MOUNT_ATTR__ATIME, + .propagation = MS_SHARED, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + old_flags = read_mnt_flags("/mnt/A"); + ASSERT_GT(old_flags, 0); + + fd = open("/mnt/A/AA/B/b", O_RDWR | O_CLOEXEC | O_CREAT | O_EXCL, 0777); + ASSERT_GE(fd, 0); + + /* + * We're holding a fd open to a mount somwhere in the middle so this + * needs to fail somewhere in the middle. After this the mount options + * need to be unchanged. + */ + ASSERT_LT(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, old_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A"), false); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, old_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA"), false); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, old_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B"), false); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, old_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B/BB"), false); + + dfd = open("/mnt/A/AA/B", O_DIRECTORY | O_CLOEXEC); + ASSERT_GE(dfd, 0); + EXPECT_EQ(fsync(dfd), 0); + EXPECT_EQ(close(dfd), 0); + + EXPECT_EQ(fsync(fd), 0); + EXPECT_EQ(close(fd), 0); + + /* All writers are gone so this should succeed. */ + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); +} + +TEST_F(mount_setattr, mixed_mount_options) +{ + unsigned int old_flags1 = 0, old_flags2 = 0, new_flags = 0, expected_flags = 0; + struct mount_attr attr = { + .attr_clr = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME, + .attr_set = MOUNT_ATTR_RELATIME, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + old_flags1 = read_mnt_flags("/mnt/B"); + ASSERT_GT(old_flags1, 0); + + old_flags2 = read_mnt_flags("/mnt/B/BB"); + ASSERT_GT(old_flags2, 0); + + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/B", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags = old_flags2; + expected_flags &= ~(MS_RDONLY | MS_NOEXEC | MS_NOATIME | MS_NOSUID); + expected_flags |= MS_RELATIME; + + new_flags = read_mnt_flags("/mnt/B"); + ASSERT_EQ(new_flags, expected_flags); + + expected_flags = old_flags2; + expected_flags &= ~(MS_RDONLY | MS_NOEXEC | MS_NOATIME | MS_NOSUID); + expected_flags |= MS_RELATIME; + + new_flags = read_mnt_flags("/mnt/B/BB"); + ASSERT_EQ(new_flags, expected_flags); +} + +TEST_F(mount_setattr, time_changes) +{ + unsigned int old_flags = 0, new_flags = 0, expected_flags = 0; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_NODIRATIME | MOUNT_ATTR_NOATIME, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + attr.attr_set = MOUNT_ATTR_STRICTATIME; + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + attr.attr_set = MOUNT_ATTR_STRICTATIME | MOUNT_ATTR_NOATIME; + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + attr.attr_set = MOUNT_ATTR_STRICTATIME | MOUNT_ATTR_NOATIME; + attr.attr_clr = MOUNT_ATTR__ATIME; + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + attr.attr_set = 0; + attr.attr_clr = MOUNT_ATTR_STRICTATIME; + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + attr.attr_clr = MOUNT_ATTR_NOATIME; + ASSERT_NE(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + old_flags = read_mnt_flags("/mnt/A"); + ASSERT_GT(old_flags, 0); + + attr.attr_set = MOUNT_ATTR_NODIRATIME | MOUNT_ATTR_NOATIME; + attr.attr_clr = MOUNT_ATTR__ATIME; + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags = old_flags; + expected_flags |= MS_NOATIME; + expected_flags |= MS_NODIRATIME; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + memset(&attr, 0, sizeof(attr)); + attr.attr_set &= ~MOUNT_ATTR_NOATIME; + attr.attr_set |= MOUNT_ATTR_RELATIME; + attr.attr_clr |= MOUNT_ATTR__ATIME; + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags &= ~MS_NOATIME; + expected_flags |= MS_RELATIME; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + memset(&attr, 0, sizeof(attr)); + attr.attr_set &= ~MOUNT_ATTR_RELATIME; + attr.attr_set |= MOUNT_ATTR_STRICTATIME; + attr.attr_clr |= MOUNT_ATTR__ATIME; + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags &= ~MS_RELATIME; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + memset(&attr, 0, sizeof(attr)); + attr.attr_set &= ~MOUNT_ATTR_STRICTATIME; + attr.attr_set |= MOUNT_ATTR_NOATIME; + attr.attr_clr |= MOUNT_ATTR__ATIME; + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags |= MS_NOATIME; + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + memset(&attr, 0, sizeof(attr)); + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + memset(&attr, 0, sizeof(attr)); + attr.attr_clr = MOUNT_ATTR_NODIRATIME; + ASSERT_EQ(sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)), 0); + + expected_flags &= ~MS_NODIRATIME; + + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); +} + +TEST_F(mount_setattr, multi_threaded) +{ + int i, j, nthreads, ret = 0; + unsigned int old_flags = 0, new_flags = 0, expected_flags = 0; + pthread_attr_t pattr; + pthread_t threads[DEFAULT_THREADS]; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + old_flags = read_mnt_flags("/mnt/A"); + ASSERT_GT(old_flags, 0); + + /* Try to change mount options from multiple threads. */ + nthreads = get_nprocs_conf(); + if (nthreads > DEFAULT_THREADS) + nthreads = DEFAULT_THREADS; + + pthread_attr_init(&pattr); + for (i = 0; i < nthreads; i++) + ASSERT_EQ(pthread_create(&threads[i], &pattr, mount_setattr_thread, NULL), 0); + + for (j = 0; j < i; j++) { + void *retptr = NULL; + + EXPECT_EQ(pthread_join(threads[j], &retptr), 0); + + ret += ptr_to_int(retptr); + EXPECT_EQ(ret, 0); + } + pthread_attr_destroy(&pattr); + + ASSERT_EQ(ret, 0); + + expected_flags = old_flags; + expected_flags |= MS_RDONLY; + expected_flags |= MS_NOSUID; + new_flags = read_mnt_flags("/mnt/A"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A"), true); + + new_flags = read_mnt_flags("/mnt/A/AA"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA"), true); + + new_flags = read_mnt_flags("/mnt/A/AA/B"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B"), true); + + new_flags = read_mnt_flags("/mnt/A/AA/B/BB"); + ASSERT_EQ(new_flags, expected_flags); + + ASSERT_EQ(is_shared_mount("/mnt/A/AA/B/BB"), true); +} + +TEST_F(mount_setattr, wrong_user_namespace) +{ + int ret; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_RDONLY, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + EXPECT_EQ(create_and_enter_userns(), 0); + ret = sys_mount_setattr(-1, "/mnt/A", AT_RECURSIVE, &attr, sizeof(attr)); + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EPERM); +} + +TEST_F(mount_setattr, wrong_mount_namespace) +{ + int fd, ret; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_RDONLY, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + fd = open("/mnt/A", O_DIRECTORY | O_CLOEXEC); + ASSERT_GE(fd, 0); + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + ret = sys_mount_setattr(fd, "", AT_EMPTY_PATH | AT_RECURSIVE, &attr, sizeof(attr)); + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); +} + +FIXTURE(mount_setattr_idmapped) { +}; + +FIXTURE_SETUP(mount_setattr_idmapped) +{ + int img_fd = -EBADF; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + ASSERT_EQ(mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0), 0); + + (void)umount2("/mnt", MNT_DETACH); + (void)umount2("/tmp", MNT_DETACH); + + ASSERT_EQ(mount("testing", "/tmp", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/tmp/B", 0777), 0); + ASSERT_EQ(mknodat(-EBADF, "/tmp/B/b", S_IFREG | 0644, 0), 0); + ASSERT_EQ(chown("/tmp/B/b", 0, 0), 0); + + ASSERT_EQ(mount("testing", "/tmp/B", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/tmp/B/BB", 0777), 0); + ASSERT_EQ(mknodat(-EBADF, "/tmp/B/BB/b", S_IFREG | 0644, 0), 0); + ASSERT_EQ(chown("/tmp/B/BB/b", 0, 0), 0); + + ASSERT_EQ(mount("testing", "/tmp/B/BB", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/mnt/A", 0777), 0); + + ASSERT_EQ(mount("testing", "/mnt/A", "tmpfs", MS_NOATIME | MS_NODEV, + "size=100000,mode=700"), 0); + + ASSERT_EQ(mkdir("/mnt/A/AA", 0777), 0); + + ASSERT_EQ(mount("/tmp", "/mnt/A/AA", NULL, MS_BIND | MS_REC, NULL), 0); + + ASSERT_EQ(mkdir("/mnt/B", 0777), 0); + + ASSERT_EQ(mount("testing", "/mnt/B", "ramfs", + MS_NOATIME | MS_NODEV | MS_NOSUID, 0), 0); + + ASSERT_EQ(mkdir("/mnt/B/BB", 0777), 0); + + ASSERT_EQ(mount("testing", "/tmp/B/BB", "devpts", + MS_RELATIME | MS_NOEXEC | MS_RDONLY, 0), 0); + + ASSERT_EQ(mkdir("/mnt/C", 0777), 0); + ASSERT_EQ(mkdir("/mnt/D", 0777), 0); + img_fd = openat(-EBADF, "/mnt/C/ext4.img", O_CREAT | O_WRONLY, 0600); + ASSERT_GE(img_fd, 0); + ASSERT_EQ(ftruncate(img_fd, 1024 * 2048), 0); + ASSERT_EQ(system("mkfs.ext4 -q /mnt/C/ext4.img"), 0); + ASSERT_EQ(system("mount -o loop -t ext4 /mnt/C/ext4.img /mnt/D/"), 0); + ASSERT_EQ(close(img_fd), 0); +} + +FIXTURE_TEARDOWN(mount_setattr_idmapped) +{ + (void)umount2("/mnt/A", MNT_DETACH); + (void)umount2("/tmp", MNT_DETACH); +} + +/** + * Validate that negative fd values are rejected. + */ +TEST_F(mount_setattr_idmapped, invalid_fd_negative) +{ + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + .userns_fd = -EBADF, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + ASSERT_NE(sys_mount_setattr(-1, "/", 0, &attr, sizeof(attr)), 0) { + TH_LOG("failure: created idmapped mount with negative fd"); + } +} + +/** + * Validate that excessively large fd values are rejected. + */ +TEST_F(mount_setattr_idmapped, invalid_fd_large) +{ + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + .userns_fd = INT64_MAX, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + ASSERT_NE(sys_mount_setattr(-1, "/", 0, &attr, sizeof(attr)), 0) { + TH_LOG("failure: created idmapped mount with too large fd value"); + } +} + +/** + * Validate that closed fd values are rejected. + */ +TEST_F(mount_setattr_idmapped, invalid_fd_closed) +{ + int fd; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + fd = open("/dev/null", O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0); + ASSERT_GE(close(fd), 0); + + attr.userns_fd = fd; + ASSERT_NE(sys_mount_setattr(-1, "/", 0, &attr, sizeof(attr)), 0) { + TH_LOG("failure: created idmapped mount with closed fd"); + } +} + +/** + * Validate that the initial user namespace is rejected. + */ +TEST_F(mount_setattr_idmapped, invalid_fd_initial_userns) +{ + int open_tree_fd = -EBADF; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + open_tree_fd = sys_open_tree(-EBADF, "/mnt/D", + AT_NO_AUTOMOUNT | + AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(open_tree_fd, 0); + + attr.userns_fd = open("/proc/1/ns/user", O_RDONLY | O_CLOEXEC); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(errno, EPERM); + ASSERT_EQ(close(attr.userns_fd), 0); + ASSERT_EQ(close(open_tree_fd), 0); +} + +static int map_ids(pid_t pid, unsigned long nsid, unsigned long hostid, + unsigned long range) +{ + char map[100], procfile[256]; + + snprintf(procfile, sizeof(procfile), "/proc/%d/uid_map", pid); + snprintf(map, sizeof(map), "%lu %lu %lu", nsid, hostid, range); + if (write_file(procfile, map, strlen(map))) + return -1; + + + snprintf(procfile, sizeof(procfile), "/proc/%d/gid_map", pid); + snprintf(map, sizeof(map), "%lu %lu %lu", nsid, hostid, range); + if (write_file(procfile, map, strlen(map))) + return -1; + + return 0; +} + +#define __STACK_SIZE (8 * 1024 * 1024) +static pid_t do_clone(int (*fn)(void *), void *arg, int flags) +{ + void *stack; + + stack = malloc(__STACK_SIZE); + if (!stack) + return -ENOMEM; + +#ifdef __ia64__ + return __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#else + return clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#endif +} + +static int get_userns_fd_cb(void *data) +{ + return kill(getpid(), SIGSTOP); +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long range) +{ + int ret; + pid_t pid; + char path[256]; + + pid = do_clone(get_userns_fd_cb, NULL, CLONE_NEWUSER); + if (pid < 0) + return -errno; + + ret = map_ids(pid, nsid, hostid, range); + if (ret < 0) + return ret; + + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + ret = open(path, O_RDONLY | O_CLOEXEC); + kill(pid, SIGKILL); + wait_for_pid(pid); + return ret; +} + +/** + * Validate that an attached mount in our mount namespace can be idmapped. + * (The kernel enforces that the mount's mount namespace and the caller's mount + * namespace match.) + */ +TEST_F(mount_setattr_idmapped, attached_mount_inside_current_mount_namespace) +{ + int open_tree_fd = -EBADF; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + open_tree_fd = sys_open_tree(-EBADF, "/mnt/D", + AT_EMPTY_PATH | + AT_NO_AUTOMOUNT | + AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC); + ASSERT_GE(open_tree_fd, 0); + + attr.userns_fd = get_userns_fd(0, 10000, 10000); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_EQ(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); + ASSERT_EQ(close(open_tree_fd), 0); +} + +/** + * Validate that idmapping a mount is rejected if the mount's mount namespace + * and our mount namespace don't match. + * (The kernel enforces that the mount's mount namespace and the caller's mount + * namespace match.) + */ +TEST_F(mount_setattr_idmapped, attached_mount_outside_current_mount_namespace) +{ + int open_tree_fd = -EBADF; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + open_tree_fd = sys_open_tree(-EBADF, "/mnt/D", + AT_EMPTY_PATH | + AT_NO_AUTOMOUNT | + AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC); + ASSERT_GE(open_tree_fd, 0); + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + attr.userns_fd = get_userns_fd(0, 10000, 10000); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, + sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); + ASSERT_EQ(close(open_tree_fd), 0); +} + +/** + * Validate that an attached mount in our mount namespace can be idmapped. + */ +TEST_F(mount_setattr_idmapped, detached_mount_inside_current_mount_namespace) +{ + int open_tree_fd = -EBADF; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + open_tree_fd = sys_open_tree(-EBADF, "/mnt/D", + AT_EMPTY_PATH | + AT_NO_AUTOMOUNT | + AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(open_tree_fd, 0); + + /* Changing mount properties on a detached mount. */ + attr.userns_fd = get_userns_fd(0, 10000, 10000); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_EQ(sys_mount_setattr(open_tree_fd, "", + AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); + ASSERT_EQ(close(open_tree_fd), 0); +} + +/** + * Validate that a detached mount not in our mount namespace can be idmapped. + */ +TEST_F(mount_setattr_idmapped, detached_mount_outside_current_mount_namespace) +{ + int open_tree_fd = -EBADF; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + open_tree_fd = sys_open_tree(-EBADF, "/mnt/D", + AT_EMPTY_PATH | + AT_NO_AUTOMOUNT | + AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(open_tree_fd, 0); + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + /* Changing mount properties on a detached mount. */ + attr.userns_fd = get_userns_fd(0, 10000, 10000); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_EQ(sys_mount_setattr(open_tree_fd, "", + AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); + ASSERT_EQ(close(open_tree_fd), 0); +} + +/** + * Validate that currently changing the idmapping of an idmapped mount fails. + */ +TEST_F(mount_setattr_idmapped, change_idmapping) +{ + int open_tree_fd = -EBADF; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + open_tree_fd = sys_open_tree(-EBADF, "/mnt/D", + AT_EMPTY_PATH | + AT_NO_AUTOMOUNT | + AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(open_tree_fd, 0); + + attr.userns_fd = get_userns_fd(0, 10000, 10000); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_EQ(sys_mount_setattr(open_tree_fd, "", + AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); + + /* Change idmapping on a detached mount that is already idmapped. */ + attr.userns_fd = get_userns_fd(0, 20000, 10000); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); + ASSERT_EQ(close(open_tree_fd), 0); +} + +static bool expected_uid_gid(int dfd, const char *path, int flags, + uid_t expected_uid, gid_t expected_gid) +{ + int ret; + struct stat st; + + ret = fstatat(dfd, path, &st, flags); + if (ret < 0) + return false; + + return st.st_uid == expected_uid && st.st_gid == expected_gid; +} + +TEST_F(mount_setattr_idmapped, idmap_mount_tree_invalid) +{ + int open_tree_fd = -EBADF; + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + + if (!mount_setattr_supported()) + SKIP(return, "mount_setattr syscall not supported"); + + ASSERT_EQ(expected_uid_gid(-EBADF, "/tmp/B/b", 0, 0, 0), 0); + ASSERT_EQ(expected_uid_gid(-EBADF, "/tmp/B/BB/b", 0, 0, 0), 0); + + open_tree_fd = sys_open_tree(-EBADF, "/mnt/A", + AT_RECURSIVE | + AT_EMPTY_PATH | + AT_NO_AUTOMOUNT | + AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(open_tree_fd, 0); + + attr.userns_fd = get_userns_fd(0, 10000, 10000); + ASSERT_GE(attr.userns_fd, 0); + ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); + ASSERT_EQ(close(open_tree_fd), 0); + + ASSERT_EQ(expected_uid_gid(-EBADF, "/tmp/B/b", 0, 0, 0), 0); + ASSERT_EQ(expected_uid_gid(-EBADF, "/tmp/B/BB/b", 0, 0, 0), 0); + ASSERT_EQ(expected_uid_gid(open_tree_fd, "B/b", 0, 0, 0), 0); + ASSERT_EQ(expected_uid_gid(open_tree_fd, "B/BB/b", 0, 0, 0), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 4c7d33618437..d98fb85e201c 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -1524,6 +1524,14 @@ basic() run_cmd "$IP nexthop replace id 2 blackhole dev veth1" log_test $? 2 "Blackhole nexthop with other attributes" + # blackhole nexthop should not be affected by the state of the loopback + # device + run_cmd "$IP link set dev lo down" + check_nexthop "id 2" "id 2 blackhole" + log_test $? 0 "Blackhole nexthop with loopback device down" + + run_cmd "$IP link set dev lo up" + # # groups # diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh index 197e769c2ed1..f8cda822c1ce 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh @@ -86,11 +86,20 @@ test_ip6gretap() test_gretap_stp() { + # Sometimes after mirror installation, the neighbor's state is not valid. + # The reason is that there is no SW datapath activity related to the + # neighbor for the remote GRE address. Therefore whether the corresponding + # neighbor will be valid is a matter of luck, and the test is thus racy. + # Set the neighbor's state to permanent, so it would be always valid. + ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_stp gt4 $swp3.555 "mirror to gretap" } test_ip6gretap_stp() { + ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \ + nud permanent dev br2 full_test_span_gre_stp gt6 $swp3.555 "mirror to ip6gretap" } diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index ce6bea9675c0..0ccb1dda099a 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -658,7 +658,7 @@ test_ecn_decap() # In accordance with INET_ECN_decapsulate() __test_ecn_decap 00 00 0x00 __test_ecn_decap 01 01 0x01 - __test_ecn_decap 02 01 0x02 + __test_ecn_decap 02 01 0x01 __test_ecn_decap 01 03 0x03 __test_ecn_decap 02 03 0x03 test_ecn_decap_error diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index 17ced7d6ce25..f23438d512c5 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -1785,7 +1785,7 @@ static void grand_child_serv(unsigned int nr, int cmd_fd, void *buf, break; default: printk("got unknown msg type %d", msg->type); - }; + } } static int grand_child_f(unsigned int nr, int cmd_fd, void *buf) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 964db9ed544f..ad32240fbfda 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -11,6 +11,7 @@ ksft_skip=4 timeout=30 mptcp_connect="" capture=0 +do_all_tests=1 TEST_COUNT=0 @@ -121,12 +122,6 @@ reset_with_add_addr_timeout() -j DROP } -for arg in "$@"; do - if [ "$arg" = "-c" ]; then - capture=1 - fi -done - ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" @@ -1221,7 +1216,8 @@ usage() echo " -4 v4mapped_tests" echo " -b backup_tests" echo " -p add_addr_ports_tests" - echo " -c syncookies_tests" + echo " -k syncookies_tests" + echo " -c capture pcap files" echo " -h help" } @@ -1235,12 +1231,24 @@ make_file "$cin" "client" 1 make_file "$sin" "server" 1 trap cleanup EXIT -if [ -z $1 ]; then +for arg in "$@"; do + # check for "capture" arg before launching tests + if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then + capture=1 + fi + + # exception for the capture option, the rest means: a part of the tests + if [ "${arg}" != "-c" ]; then + do_all_tests=0 + fi +done + +if [ $do_all_tests -eq 1 ]; then all_tests exit $ret fi -while getopts 'fsltra64bpch' opt; do +while getopts 'fsltra64bpkch' opt; do case $opt in f) subflows_tests @@ -1272,9 +1280,11 @@ while getopts 'fsltra64bpch' opt; do p) add_addr_ports_tests ;; - c) + k) syncookies_tests ;; + c) + ;; h | *) usage ;; diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c index 7b01b7c2ec10..066efd30e294 100644 --- a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -30,25 +30,25 @@ struct reuse_opts { }; struct reuse_opts unreusable_opts[12] = { - {0, 0, 0, 0}, - {0, 0, 0, 1}, - {0, 0, 1, 0}, - {0, 0, 1, 1}, - {0, 1, 0, 0}, - {0, 1, 0, 1}, - {0, 1, 1, 0}, - {0, 1, 1, 1}, - {1, 0, 0, 0}, - {1, 0, 0, 1}, - {1, 0, 1, 0}, - {1, 0, 1, 1}, + {{0, 0}, {0, 0}}, + {{0, 0}, {0, 1}}, + {{0, 0}, {1, 0}}, + {{0, 0}, {1, 1}}, + {{0, 1}, {0, 0}}, + {{0, 1}, {0, 1}}, + {{0, 1}, {1, 0}}, + {{0, 1}, {1, 1}}, + {{1, 0}, {0, 0}}, + {{1, 0}, {0, 1}}, + {{1, 0}, {1, 0}}, + {{1, 0}, {1, 1}}, }; struct reuse_opts reusable_opts[4] = { - {1, 1, 0, 0}, - {1, 1, 0, 1}, - {1, 1, 1, 0}, - {1, 1, 1, 1}, + {{1, 1}, {0, 0}}, + {{1, 1}, {0, 1}}, + {{1, 1}, {1, 0}}, + {{1, 1}, {1, 1}}, }; int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport) diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index 3006a8e5b41a..3171069a6b46 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -4,7 +4,7 @@ TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh \ + nft_queue.sh nft_meta.sh nf_nat_edemux.sh \ ipip-conntrack-mtu.sh LDLIBS = -lmnl diff --git a/tools/testing/selftests/netfilter/nf_nat_edemux.sh b/tools/testing/selftests/netfilter/nf_nat_edemux.sh new file mode 100755 index 000000000000..cfee3b65be0f --- /dev/null +++ b/tools/testing/selftests/netfilter/nf_nat_edemux.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test NAT source port clash resolution +# + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +sfx=$(mktemp -u "XXXXXXXX") +ns1="ns1-$sfx" +ns2="ns2-$sfx" + +cleanup() +{ + ip netns del $ns1 + ip netns del $ns2 +} + +iperf3 -v > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iperf3" + exit $ksft_skip +fi + +iptables --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without iptables" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +ip netns add "$ns1" +if [ $? -ne 0 ];then + echo "SKIP: Could not create net namespace $ns1" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns2 + +# Connect the namespaces using a veth pair +ip link add name veth2 type veth peer name veth1 +ip link set netns $ns1 dev veth1 +ip link set netns $ns2 dev veth2 + +ip netns exec $ns1 ip link set up dev lo +ip netns exec $ns1 ip link set up dev veth1 +ip netns exec $ns1 ip addr add 192.168.1.1/24 dev veth1 + +ip netns exec $ns2 ip link set up dev lo +ip netns exec $ns2 ip link set up dev veth2 +ip netns exec $ns2 ip addr add 192.168.1.2/24 dev veth2 + +# Create a server in one namespace +ip netns exec $ns1 iperf3 -s > /dev/null 2>&1 & +iperfs=$! + +# Restrict source port to just one so we don't have to exhaust +# all others. +ip netns exec $ns2 sysctl -q net.ipv4.ip_local_port_range="10000 10000" + +# add a virtual IP using DNAT +ip netns exec $ns2 iptables -t nat -A OUTPUT -d 10.96.0.1/32 -p tcp --dport 443 -j DNAT --to-destination 192.168.1.1:5201 + +# ... and route it to the other namespace +ip netns exec $ns2 ip route add 10.96.0.1 via 192.168.1.1 + +sleep 1 + +# add a persistent connection from the other namespace +ip netns exec $ns2 nc -q 10 -w 10 192.168.1.1 5201 > /dev/null & + +sleep 1 + +# ip daddr:dport will be rewritten to 192.168.1.1 5201 +# NAT must reallocate source port 10000 because +# 192.168.1.2:10000 -> 192.168.1.1:5201 is already in use +echo test | ip netns exec $ns2 nc -w 3 -q 3 10.96.0.1 443 >/dev/null +ret=$? + +kill $iperfs + +# Check nc can connect to 10.96.0.1:443 (aka 192.168.1.1:5201). +if [ $ret -eq 0 ]; then + echo "PASS: nc can connect via NAT'd address" +else + echo "FAIL: nc cannot connect via NAT'd address" + exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d42115e4284d..8b0cd421ebd3 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -101,7 +101,7 @@ endef ifeq ($(CAN_BUILD_I386),1) $(BINARIES_32): CFLAGS += -m32 $(BINARIES_32): LDLIBS += -lrt -ldl -lm -$(BINARIES_32): %_32: %.c +$(BINARIES_32): $(OUTPUT)/%_32: %.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) endif @@ -109,7 +109,7 @@ endif ifeq ($(CAN_BUILD_X86_64),1) $(BINARIES_64): CFLAGS += -m64 $(BINARIES_64): LDLIBS += -lrt -ldl -$(BINARIES_64): %_64: %.c +$(BINARIES_64): $(OUTPUT)/%_64: %.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t)))) endif diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 74c69b75f6f5..7ed7cd95e58f 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -39,7 +39,7 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; } ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; } ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; } sleep() { read -t "$1" -N 1 || true; } -waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } +waitiperf() { pretty "${1//*-}" "wait for iperf:${3:-5201} pid $2"; while [[ $(ss -N "$1" -tlpH "sport = ${3:-5201}") != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; } waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; } @@ -141,6 +141,19 @@ tests() { n2 iperf3 -s -1 -B fd00::2 & waitiperf $netns2 $! n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2 + + # TCP over IPv4, in parallel + for max in 4 5 50; do + local pids=( ) + for ((i=0; i < max; ++i)) do + n2 iperf3 -p $(( 5200 + i )) -s -1 -B 192.168.241.2 & + pids+=( $! ); waitiperf $netns2 $! $(( 5200 + i )) + done + for ((i=0; i < max; ++i)) do + n1 iperf3 -Z -t 3 -p $(( 5200 + i )) -c 192.168.241.2 & + done + wait "${pids[@]}" + done } [[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}" diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index 57b20802e71b..b69de9263ee6 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -1650,7 +1650,7 @@ static void start_printthread(void) if (ufd < 0 || read(ufd, seed, sizeof(*seed)) != sizeof(*seed)) { printf( -"Warning! Using trivial random nummer seed, since %s not available\n", +"Warning! Using trivial random number seed, since %s not available\n", DEV_URANDOM); fflush(stdout); *seed = i; @@ -1711,8 +1711,8 @@ static void show_usage(void) "\t\t\tbeginning, end, and backtrace.\n\n" "-g, --graph\t\tEnable the display-graph option in trace_option. This\n" -"\t\t\toption causes ftrace to show the functionph of how\n" -"\t\t\tfunctions are calling other functions.\n\n" +"\t\t\toption causes ftrace to show the graph of how functions\n" +"\t\t\tare calling other functions.\n\n" "-c, --policy POL\tRun the program with scheduling policy POL. POL can be\n" "\t\t\tother, batch, idle, rr or fifo. The default is rr. When\n" |