diff options
Diffstat (limited to 'tools')
47 files changed, 2208 insertions, 450 deletions
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 03acc823838a..a77b915d36a8 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -203,8 +203,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ @@ -296,6 +296,13 @@ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ #define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ #define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 
*/ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -316,6 +323,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ @@ -447,5 +455,6 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 36369e76cc63..33d2cd04d254 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -50,6 +50,25 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif +#ifdef CONFIG_RETPOLINE +# define DISABLE_RETPOLINE 0 +#else +# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ + (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) +#endif + +#ifdef CONFIG_RETHUNK +# define DISABLE_RETHUNK 0 +#else +# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) +#endif + +#ifdef CONFIG_CPU_UNRET_ENTRY +# define DISABLE_UNRET 0 +#else +# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM # define DISABLE_ENQCMD 0 #else @@ -82,7 +101,7 @@ #define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 -#define 
DISABLED_MASK11 0 +#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) #define DISABLED_MASK12 0 #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index d27e0581b777..cc615be27a54 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -51,6 +51,8 @@ #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ +#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ +#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -93,6 +95,7 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ +#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */ #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO BIT(4) /* * Not susceptible to Speculative Store Bypass @@ -140,6 +143,13 @@ * bit available to control VERW * behavior. */ +#define ARCH_CAP_RRSBA BIT(19) /* + * Indicates RET may use predictors + * other than the RSB. With eIBRS + * enabled predictions in kernel mode + * are restricted to targets in + * kernel. 
+ */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -567,6 +577,9 @@ /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 15b940ec1eac..10bc88cc3bf6 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -32,11 +32,16 @@ struct unwind_hint { * * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function. * Useful for code which doesn't have an ELF function annotation. + * + * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc. */ #define UNWIND_HINT_TYPE_CALL 0 #define UNWIND_HINT_TYPE_REGS 1 #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 +#define UNWIND_HINT_TYPE_ENTRY 4 +#define UNWIND_HINT_TYPE_SAVE 5 +#define UNWIND_HINT_TYPE_RESTORE 6 #ifdef CONFIG_OBJTOOL @@ -124,7 +129,7 @@ struct unwind_hint { * the debuginfo as necessary. It will also warn if it sees any * inconsistencies. 
*/ -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .Lunwind_hint_ip_\@: .pushsection .discard.unwind_hints /* struct unwind_hint */ @@ -177,7 +182,7 @@ struct unwind_hint { #define ASM_REACHABLE #else #define ANNOTATE_INTRA_FUNCTION_CALL -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h index 0197042b7dfb..1ecdb911add8 100644 --- a/tools/include/uapi/asm-generic/fcntl.h +++ b/tools/include/uapi/asm-generic/fcntl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_GENERIC_FCNTL_H #define _ASM_GENERIC_FCNTL_H @@ -90,7 +91,7 @@ /* a horrid kludge trying to make sure that this will fail on old kernels */ #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) +#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT) #ifndef O_NDELAY #define O_NDELAY O_NONBLOCK @@ -115,11 +116,13 @@ #define F_GETSIG 11 /* for sockets. */ #endif +#if __BITS_PER_LONG == 32 || defined(__KERNEL__) #ifndef F_GETLK64 #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 #endif +#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 @@ -178,6 +181,10 @@ struct f_owner_ex { blocking */ #define LOCK_UN 8 /* remove lock */ +/* + * LOCK_MAND support has been removed from the kernel. We leave the symbols + * here to not break legacy builds, but these should not be used in new code. + */ #define LOCK_MAND 32 /* This is a mandatory flock ... 
*/ #define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ @@ -185,6 +192,7 @@ struct f_owner_ex { #define F_LINUX_SPECIFIC_BASE 1024 +#ifndef HAVE_ARCH_STRUCT_FLOCK struct flock { short l_type; short l_whence; @@ -209,5 +217,6 @@ struct flock64 { __ARCH_FLOCK64_PAD #endif }; +#endif /* HAVE_ARCH_STRUCT_FLOCK */ #endif /* _ASM_GENERIC_FCNTL_H */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4009dbdf62d..ef78e0e1a754 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5222,22 +5222,25 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset) + * long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length - * of *src*'s data, -EINVAL if *src* is an invalid dynptr. + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. * - * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len) + * long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. + * *flags* is currently unused. * Return * 0 on success, -E2BIG if *offset* + *len* exceeds the length * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* - * is a read-only dynptr. + * is a read-only dynptr or if *flags* is not 0. 
* * void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len) * Description diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5088bd9f1922..860f867c50c0 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -2083,7 +2083,8 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) -#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES +#define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_BOOLEAN #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 8b990a52aada..c260006106be 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -787,3 +787,8 @@ bool arch_is_retpoline(struct symbol *sym) { return !strncmp(sym->name, "__x86_indirect_", 15); } + +bool arch_is_rethunk(struct symbol *sym) +{ + return !strcmp(sym->name, "__x86_return_thunk"); +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index f4c3a5091737..24fbe803a0d3 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -68,6 +68,8 @@ const struct option check_options[] = { OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), OPT_BOOLEAN('o', "orc", &opts.orc, "generate ORC metadata"), OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), + OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), + OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), OPT_BOOLEAN('t', "static-call", 
&opts.static_call, "annotate static calls"), @@ -123,6 +125,7 @@ static bool opts_valid(void) opts.noinstr || opts.orc || opts.retpoline || + opts.rethunk || opts.sls || opts.stackval || opts.static_call || @@ -135,6 +138,11 @@ static bool opts_valid(void) return true; } + if (opts.unret && !opts.rethunk) { + ERROR("--unret requires --rethunk"); + return false; + } + if (opts.dump_orc) return true; @@ -163,6 +171,11 @@ static bool link_opts_valid(struct objtool_file *file) return false; } + if (opts.unret) { + ERROR("--unret requires --link"); + return false; + } + return true; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 57153e00349c..b341f8a8c7c5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -376,7 +376,8 @@ static int decode_instructions(struct objtool_file *file) sec->text = true; if (!strcmp(sec->name, ".noinstr.text") || - !strcmp(sec->name, ".entry.text")) + !strcmp(sec->name, ".entry.text") || + !strncmp(sec->name, ".text.__x86.", 12)) sec->noinstr = true; for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { @@ -749,6 +750,52 @@ static int create_retpoline_sites_sections(struct objtool_file *file) return 0; } +static int create_return_sites_sections(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + int idx; + + sec = find_section_by_name(file->elf, ".return_sites"); + if (sec) { + WARN("file already has .return_sites, skipping"); + return 0; + } + + idx = 0; + list_for_each_entry(insn, &file->return_thunk_list, call_node) + idx++; + + if (!idx) + return 0; + + sec = elf_create_section(file->elf, ".return_sites", 0, + sizeof(int), idx); + if (!sec) { + WARN("elf_create_section: .return_sites"); + return -1; + } + + idx = 0; + list_for_each_entry(insn, &file->return_thunk_list, call_node) { + + int *site = (int *)sec->data->d_buf + idx; + *site = 0; + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(int), + R_X86_64_PC32, + insn->sec, insn->offset)) { + 
WARN("elf_add_reloc_to_insn: .return_sites"); + return -1; + } + + idx++; + } + + return 0; +} + static int create_ibt_endbr_seal_sections(struct objtool_file *file) { struct instruction *insn; @@ -1083,6 +1130,11 @@ __weak bool arch_is_retpoline(struct symbol *sym) return false; } +__weak bool arch_is_rethunk(struct symbol *sym) +{ + return false; +} + #define NEGATIVE_RELOC ((void *)-1L) static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *insn) @@ -1250,6 +1302,19 @@ static void add_retpoline_call(struct objtool_file *file, struct instruction *in annotate_call_site(file, insn, false); } +static void add_return_call(struct objtool_file *file, struct instruction *insn, bool add) +{ + /* + * Return thunk tail calls are really just returns in disguise, + * so convert them accordingly. + */ + insn->type = INSN_RETURN; + insn->retpoline_safe = true; + + if (add) + list_add_tail(&insn->call_node, &file->return_thunk_list); +} + static bool same_function(struct instruction *insn1, struct instruction *insn2) { return insn1->func->pfunc == insn2->func->pfunc; @@ -1302,6 +1367,9 @@ static int add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->retpoline_thunk) { add_retpoline_call(file, insn); continue; + } else if (reloc->sym->return_thunk) { + add_return_call(file, insn, true); + continue; } else if (insn->func) { /* * External sibling call or internal sibling call with @@ -1320,6 +1388,21 @@ static int add_jump_destinations(struct objtool_file *file) jump_dest = find_insn(file, dest_sec, dest_off); if (!jump_dest) { + struct symbol *sym = find_symbol_by_offset(dest_sec, dest_off); + + /* + * This is a special case for zen_untrain_ret(). + * It jumps to __x86_return_thunk(), but objtool + * can't find the thunk's starting RET + * instruction, because the RET is also in the + * middle of another instruction. Objtool only + * knows about the outer instruction. 
+ */ + if (sym && sym->return_thunk) { + add_return_call(file, insn, false); + continue; + } + WARN_FUNC("can't find jump dest instruction at %s+0x%lx", insn->sec, insn->offset, dest_sec->name, dest_off); @@ -1949,16 +2032,35 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; - if (opts.ibt && hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { + if (hint->type == UNWIND_HINT_TYPE_SAVE) { + insn->hint = false; + insn->save = true; + continue; + } + + if (hint->type == UNWIND_HINT_TYPE_RESTORE) { + insn->restore = true; + continue; + } + + if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset); - if (sym && sym->bind == STB_GLOBAL && - insn->type != INSN_ENDBR && !insn->noendbr) { - WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR", - insn->sec, insn->offset); + if (sym && sym->bind == STB_GLOBAL) { + if (opts.ibt && insn->type != INSN_ENDBR && !insn->noendbr) { + WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR", + insn->sec, insn->offset); + } + + insn->entry = 1; } } + if (hint->type == UNWIND_HINT_TYPE_ENTRY) { + hint->type = UNWIND_HINT_TYPE_CALL; + insn->entry = 1; + } + if (hint->type == UNWIND_HINT_TYPE_FUNC) { insn->cfi = &func_cfi; continue; @@ -2032,8 +2134,10 @@ static int read_retpoline_hints(struct objtool_file *file) } if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC) { - WARN_FUNC("retpoline_safe hint not an indirect jump/call", + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN && + insn->type != INSN_NOP) { + WARN_FUNC("retpoline_safe hint not an indirect jump/call/ret/nop", insn->sec, insn->offset); return -1; } @@ -2184,6 +2288,9 @@ static int classify_symbols(struct objtool_file *file) if (arch_is_retpoline(func)) func->retpoline_thunk = true; + if (arch_is_rethunk(func)) + func->return_thunk = true; + if (!strcmp(func->name, "__fentry__")) func->fentry = true; @@ -3218,8 +3325,8 @@ static int validate_branch(struct 
objtool_file *file, struct symbol *func, return 1; } - visited = 1 << state.uaccess; - if (insn->visited) { + visited = VISITED_BRANCH << state.uaccess; + if (insn->visited & VISITED_BRANCH_MASK) { if (!insn->hint && !insn_cfi_match(insn, &state.cfi)) return 1; @@ -3233,6 +3340,35 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, state.instr += insn->instr; if (insn->hint) { + if (insn->restore) { + struct instruction *save_insn, *i; + + i = insn; + save_insn = NULL; + + sym_for_each_insn_continue_reverse(file, func, i) { + if (i->save) { + save_insn = i; + break; + } + } + + if (!save_insn) { + WARN_FUNC("no corresponding CFI save for CFI restore", + sec, insn->offset); + return 1; + } + + if (!save_insn->visited) { + WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", + sec, insn->offset); + return 1; + } + + insn->cfi = save_insn->cfi; + nr_cfi_reused++; + } + state.cfi = *insn->cfi; } else { /* XXX track if we actually changed state.cfi */ @@ -3433,6 +3569,145 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec) return warnings; } +/* + * Validate rethunk entry constraint: must untrain RET before the first RET. + * + * Follow every branch (intra-function) and ensure ANNOTATE_UNRET_END comes + * before an actual RET instruction. 
+ */ +static int validate_entry(struct objtool_file *file, struct instruction *insn) +{ + struct instruction *next, *dest; + int ret, warnings = 0; + + for (;;) { + next = next_insn_to_validate(file, insn); + + if (insn->visited & VISITED_ENTRY) + return 0; + + insn->visited |= VISITED_ENTRY; + + if (!insn->ignore_alts && !list_empty(&insn->alts)) { + struct alternative *alt; + bool skip_orig = false; + + list_for_each_entry(alt, &insn->alts, list) { + if (alt->skip_orig) + skip_orig = true; + + ret = validate_entry(file, alt->insn); + if (ret) { + if (opts.backtrace) + BT_FUNC("(alt)", insn); + return ret; + } + } + + if (skip_orig) + return 0; + } + + switch (insn->type) { + + case INSN_CALL_DYNAMIC: + case INSN_JUMP_DYNAMIC: + case INSN_JUMP_DYNAMIC_CONDITIONAL: + WARN_FUNC("early indirect call", insn->sec, insn->offset); + return 1; + + case INSN_JUMP_UNCONDITIONAL: + case INSN_JUMP_CONDITIONAL: + if (!is_sibling_call(insn)) { + if (!insn->jump_dest) { + WARN_FUNC("unresolved jump target after linking?!?", + insn->sec, insn->offset); + return -1; + } + ret = validate_entry(file, insn->jump_dest); + if (ret) { + if (opts.backtrace) { + BT_FUNC("(branch%s)", insn, + insn->type == INSN_JUMP_CONDITIONAL ? "-cond" : ""); + } + return ret; + } + + if (insn->type == INSN_JUMP_UNCONDITIONAL) + return 0; + + break; + } + + /* fallthrough */ + case INSN_CALL: + dest = find_insn(file, insn->call_dest->sec, + insn->call_dest->offset); + if (!dest) { + WARN("Unresolved function after linking!?: %s", + insn->call_dest->name); + return -1; + } + + ret = validate_entry(file, dest); + if (ret) { + if (opts.backtrace) + BT_FUNC("(call)", insn); + return ret; + } + /* + * If a call returns without error, it must have seen UNTRAIN_RET. + * Therefore any non-error return is a success. 
+ */ + return 0; + + case INSN_RETURN: + WARN_FUNC("RET before UNTRAIN", insn->sec, insn->offset); + return 1; + + case INSN_NOP: + if (insn->retpoline_safe) + return 0; + break; + + default: + break; + } + + if (!next) { + WARN_FUNC("teh end!", insn->sec, insn->offset); + return -1; + } + insn = next; + } + + return warnings; +} + +/* + * Validate that all branches starting at 'insn->entry' encounter UNRET_END + * before RET. + */ +static int validate_unret(struct objtool_file *file) +{ + struct instruction *insn; + int ret, warnings = 0; + + for_each_insn(file, insn) { + if (!insn->entry) + continue; + + ret = validate_entry(file, insn); + if (ret < 0) { + WARN_FUNC("Failed UNRET validation", insn->sec, insn->offset); + return ret; + } + warnings += ret; + } + + return warnings; +} + static int validate_retpoline(struct objtool_file *file) { struct instruction *insn; @@ -3440,7 +3715,8 @@ static int validate_retpoline(struct objtool_file *file) for_each_insn(file, insn) { if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC) + insn->type != INSN_CALL_DYNAMIC && + insn->type != INSN_RETURN) continue; if (insn->retpoline_safe) @@ -3455,9 +3731,17 @@ static int validate_retpoline(struct objtool_file *file) if (!strcmp(insn->sec->name, ".init.text") && !opts.module) continue; - WARN_FUNC("indirect %s found in RETPOLINE build", - insn->sec, insn->offset, - insn->type == INSN_JUMP_DYNAMIC ? "jump" : "call"); + if (insn->type == INSN_RETURN) { + if (opts.rethunk) { + WARN_FUNC("'naked' return found in RETHUNK build", + insn->sec, insn->offset); + } else + continue; + } else { + WARN_FUNC("indirect %s found in RETPOLINE build", + insn->sec, insn->offset, + insn->type == INSN_JUMP_DYNAMIC ? "jump" : "call"); + } warnings++; } @@ -3945,6 +4229,17 @@ int check(struct objtool_file *file) warnings += ret; } + if (opts.unret) { + /* + * Must be after validate_branch() and friends, it plays + * further games with insn->visited. 
+ */ + ret = validate_unret(file); + if (ret < 0) + return ret; + warnings += ret; + } + if (opts.ibt) { ret = validate_ibt(file); if (ret < 0) @@ -3973,6 +4268,13 @@ int check(struct objtool_file *file) warnings += ret; } + if (opts.rethunk) { + ret = create_return_sites_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + if (opts.mcount) { ret = create_mcount_loc_sections(file); if (ret < 0) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 9b19cc304195..beb2f3aa94ff 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -89,6 +89,7 @@ const char *arch_ret_insn(int len); int arch_decode_hint_reg(u8 sp_reg, int *base); bool arch_is_retpoline(struct symbol *sym); +bool arch_is_rethunk(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 280ea18b7f2b..42a52f1a0add 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -19,6 +19,8 @@ struct opts { bool noinstr; bool orc; bool retpoline; + bool rethunk; + bool unret; bool sls; bool stackval; bool static_call; diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index f10d7374f388..036129cebeee 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -46,16 +46,19 @@ struct instruction { enum insn_type type; unsigned long immediate; - u8 dead_end : 1, - ignore : 1, - ignore_alts : 1, - hint : 1, - retpoline_safe : 1, - noendbr : 1; - /* 2 bit hole */ + u16 dead_end : 1, + ignore : 1, + ignore_alts : 1, + hint : 1, + save : 1, + restore : 1, + retpoline_safe : 1, + noendbr : 1, + entry : 1; + /* 7 bit hole */ + s8 instr; u8 visited; - /* u8 hole */ struct alt_group *alt_group; struct symbol *call_dest; @@ -69,6 +72,11 @@ struct instruction { struct cfi_state *cfi; }; 
+#define VISITED_BRANCH 0x01 +#define VISITED_BRANCH_UACCESS 0x02 +#define VISITED_BRANCH_MASK 0x03 +#define VISITED_ENTRY 0x04 + static inline bool is_static_jump(struct instruction *insn) { return insn->type == INSN_JUMP_CONDITIONAL || diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index adebfbc2b518..16f4067b82ae 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -57,6 +57,7 @@ struct symbol { u8 uaccess_safe : 1; u8 static_call_tramp : 1; u8 retpoline_thunk : 1; + u8 return_thunk : 1; u8 fentry : 1; u8 profiling_func : 1; struct list_head pv_target; diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index a6e72d916807..7f2d1b095333 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -24,6 +24,7 @@ struct objtool_file { struct list_head insn_list; DECLARE_HASHTABLE(insn_hash, 20); struct list_head retpoline_call_list; + struct list_head return_thunk_list; struct list_head static_call_list; struct list_head mcount_loc_list; struct list_head endbr_list; diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 512669ce064c..a7ecc32e3512 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -102,6 +102,7 @@ struct objtool_file *objtool_open_read(const char *_objname) INIT_LIST_HEAD(&file.insn_list); hash_init(file.insn_hash); INIT_LIST_HEAD(&file.retpoline_call_list); + INIT_LIST_HEAD(&file.return_thunk_list); INIT_LIST_HEAD(&file.static_call_list); INIT_LIST_HEAD(&file.mcount_loc_list); INIT_LIST_HEAD(&file.endbr_list); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 897fc504918b..f075cf37a65e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -4280,6 +4280,7 @@ static int trace__replay(struct trace *trace) goto out; evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter"); + 
trace->syscalls.events.sys_enter = evsel; /* older kernels have syscalls tp versus raw_syscalls */ if (evsel == NULL) evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter"); @@ -4292,6 +4293,7 @@ static int trace__replay(struct trace *trace) } evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit"); + trace->syscalls.events.sys_exit = evsel; if (evsel == NULL) evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit"); if (evsel && diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py index 5f57d9829956..4339692a8d0b 100755 --- a/tools/perf/scripts/python/arm-cs-trace-disasm.py +++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py @@ -61,7 +61,7 @@ def get_optional(perf_dict, field): def get_offset(perf_dict, field): if field in perf_dict: - return f"+0x{perf_dict[field]:x}" + return "+%#x" % perf_dict[field] return "" def get_dso_file_path(dso_name, dso_build_id): @@ -76,7 +76,7 @@ def get_dso_file_path(dso_name, dso_build_id): else: append = "/elf" - dso_path = f"{os.environ['PERF_BUILDID_DIR']}/{dso_name}/{dso_build_id}{append}" + dso_path = os.environ['PERF_BUILDID_DIR'] + "/" + dso_name + "/" + dso_build_id + append; # Replace duplicate slash chars to single slash char dso_path = dso_path.replace('//', '/', 1) return dso_path @@ -94,8 +94,8 @@ def read_disam(dso_fname, dso_start, start_addr, stop_addr): start_addr = start_addr - dso_start; stop_addr = stop_addr - dso_start; disasm = [ options.objdump_name, "-d", "-z", - f"--start-address=0x{start_addr:x}", - f"--stop-address=0x{stop_addr:x}" ] + "--start-address="+format(start_addr,"#x"), + "--stop-address="+format(stop_addr,"#x") ] disasm += [ dso_fname ] disasm_output = check_output(disasm).decode('utf-8').split('\n') disasm_cache[addr_range] = disasm_output @@ -109,12 +109,14 @@ def print_disam(dso_fname, dso_start, start_addr, stop_addr): m = disasm_re.search(line) if m is None: 
continue - print(f"\t{line}") + print("\t" + line) def print_sample(sample): - print(f"Sample = {{ cpu: {sample['cpu']:04} addr: 0x{sample['addr']:016x} " \ - f"phys_addr: 0x{sample['phys_addr']:016x} ip: 0x{sample['ip']:016x} " \ - f"pid: {sample['pid']} tid: {sample['tid']} period: {sample['period']} time: {sample['time']} }}") + print("Sample = { cpu: %04d addr: 0x%016x phys_addr: 0x%016x ip: 0x%016x " \ + "pid: %d tid: %d period: %d time: %d }" % \ + (sample['cpu'], sample['addr'], sample['phys_addr'], \ + sample['ip'], sample['pid'], sample['tid'], \ + sample['period'], sample['time'])) def trace_begin(): print('ARM CoreSight Trace Data Assembler Dump') @@ -131,7 +133,7 @@ def common_start_str(comm, sample): cpu = sample["cpu"] pid = sample["pid"] tid = sample["tid"] - return f"{comm:>16} {pid:>5}/{tid:<5} [{cpu:04}] {sec:9}.{ns:09} " + return "%16s %5u/%-5u [%04u] %9u.%09u " % (comm, pid, tid, cpu, sec, ns) # This code is copied from intel-pt-events.py for printing source code # line and symbols. 
@@ -171,7 +173,7 @@ def print_srccode(comm, param_dict, sample, symbol, dso): glb_line_number = line_number glb_source_file_name = source_file_name - print(f"{start_str}{src_str}") + print(start_str, src_str) def process_event(param_dict): global cache_size @@ -188,7 +190,7 @@ def process_event(param_dict): symbol = get_optional(param_dict, "symbol") if (options.verbose == True): - print(f"Event type: {name}") + print("Event type: %s" % name) print_sample(sample) # If cannot find dso so cannot dump assembler, bail out @@ -197,7 +199,7 @@ def process_event(param_dict): # Validate dso start and end addresses if ((dso_start == '[unknown]') or (dso_end == '[unknown]')): - print(f"Failed to find valid dso map for dso {dso}") + print("Failed to find valid dso map for dso %s" % dso) return if (name[0:12] == "instructions"): @@ -244,15 +246,15 @@ def process_event(param_dict): # Handle CS_ETM_TRACE_ON packet if start_addr=0 and stop_addr=4 if (start_addr == 0 and stop_addr == 4): - print(f"CPU{cpu}: CS_ETM_TRACE_ON packet is inserted") + print("CPU%d: CS_ETM_TRACE_ON packet is inserted" % cpu) return if (start_addr < int(dso_start) or start_addr > int(dso_end)): - print(f"Start address 0x{start_addr:x} is out of range [ 0x{dso_start:x} .. 0x{dso_end:x} ] for dso {dso}") + print("Start address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (start_addr, int(dso_start), int(dso_end), dso)) return if (stop_addr < int(dso_start) or stop_addr > int(dso_end)): - print(f"Stop address 0x{stop_addr:x} is out of range [ 0x{dso_start:x} .. 0x{dso_end:x} ] for dso {dso}") + print("Stop address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (stop_addr, int(dso_start), int(dso_end), dso)) return if (options.objdump_name != None): @@ -267,6 +269,6 @@ def process_event(param_dict): if path.exists(dso_fname): print_disam(dso_fname, dso_vm_start, start_addr, stop_addr) else: - print(f"Failed to find dso {dso} for address range [ 0x{start_addr:x} .. 
0x{stop_addr:x} ]") + print("Failed to find dso %s for address range [ 0x%x .. 0x%x ]" % (dso, start_addr, stop_addr)) print_srccode(comm, param_dict, sample, symbol, dso) diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c index 4ad0dfbc8b21..7c7d20fc503a 100644 --- a/tools/perf/tests/perf-time-to-tsc.c +++ b/tools/perf/tests/perf-time-to-tsc.c @@ -20,8 +20,6 @@ #include "tsc.h" #include "mmap.h" #include "tests.h" -#include "pmu.h" -#include "pmu-hybrid.h" /* * Except x86_64/i386 and Arm64, other archs don't support TSC in perf. Just @@ -106,28 +104,21 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su evlist__config(evlist, &opts, NULL); - evsel = evlist__first(evlist); - - evsel->core.attr.comm = 1; - evsel->core.attr.disabled = 1; - evsel->core.attr.enable_on_exec = 0; - - /* - * For hybrid "cycles:u", it creates two events. - * Init the second evsel here. - */ - if (perf_pmu__has_hybrid() && perf_pmu__hybrid_mounted("cpu_atom")) { - evsel = evsel__next(evsel); + /* For hybrid "cycles:u", it creates two events */ + evlist__for_each_entry(evlist, evsel) { evsel->core.attr.comm = 1; evsel->core.attr.disabled = 1; evsel->core.attr.enable_on_exec = 0; } - if (evlist__open(evlist) == -ENOENT) { - err = TEST_SKIP; + ret = evlist__open(evlist); + if (ret < 0) { + if (ret == -ENOENT) + err = TEST_SKIP; + else + pr_debug("evlist__open() failed\n"); goto out_err; } - CHECK__(evlist__open(evlist)); CHECK__(evlist__mmap(evlist, UINT_MAX)); @@ -167,10 +158,12 @@ static int test__perf_time_to_tsc(struct test_suite *test __maybe_unused, int su goto next_event; if (strcmp(event->comm.comm, comm1) == 0) { + CHECK_NOT_NULL__(evsel = evlist__event2evsel(evlist, event)); CHECK__(evsel__parse_sample(evsel, event, &sample)); comm1_time = sample.time; } if (strcmp(event->comm.comm, comm2) == 0) { + CHECK_NOT_NULL__(evsel = evlist__event2evsel(evlist, event)); CHECK__(evsel__parse_sample(evsel, event, &sample)); 
comm2_time = sample.time; } diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index f8ad581ea247..cdd6463a5b68 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -63,20 +63,16 @@ static struct hashmap *bpf_map_hash; static struct bpf_perf_object * bpf_perf_object__next(struct bpf_perf_object *prev) { - struct bpf_perf_object *next; - - if (!prev) - next = list_first_entry(&bpf_objects_list, - struct bpf_perf_object, - list); - else - next = list_next_entry(prev, list); + if (!prev) { + if (list_empty(&bpf_objects_list)) + return NULL; - /* Empty list is noticed here so don't need checking on entry. */ - if (&next->list == &bpf_objects_list) + return list_first_entry(&bpf_objects_list, struct bpf_perf_object, list); + } + if (list_is_last(&prev->list, &bpf_objects_list)) return NULL; - return next; + return list_next_entry(prev, list); } #define bpf_perf_object__for_each(perf_obj, tmp) \ diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index ecd377938eea..b3be5b1d9dbb 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -233,6 +233,33 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep, return NULL; } +static int elf_read_program_header(Elf *elf, u64 vaddr, GElf_Phdr *phdr) +{ + size_t i, phdrnum; + u64 sz; + + if (elf_getphdrnum(elf, &phdrnum)) + return -1; + + for (i = 0; i < phdrnum; i++) { + if (gelf_getphdr(elf, i, phdr) == NULL) + return -1; + + if (phdr->p_type != PT_LOAD) + continue; + + sz = max(phdr->p_memsz, phdr->p_filesz); + if (!sz) + continue; + + if (vaddr >= phdr->p_vaddr && (vaddr < phdr->p_vaddr + sz)) + return 0; + } + + /* Not found any valid program header */ + return -1; +} + static bool want_demangle(bool is_kernel_sym) { return is_kernel_sym ? 
symbol_conf.demangle_kernel : symbol_conf.demangle; @@ -1209,6 +1236,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, sym.st_value); used_opd = true; } + /* * When loading symbols in a data mapping, ABS symbols (which * has a value of SHN_ABS in its st_shndx) failed at @@ -1227,6 +1255,17 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, gelf_getshdr(sec, &shdr); + /* + * If the attribute bit SHF_ALLOC is not set, the section + * doesn't occupy memory during process execution. + * E.g. ".gnu.warning.*" section is used by linker to generate + * warnings when calling deprecated functions, the symbols in + * the section aren't loaded to memory during process execution, + * so skip them. + */ + if (!(shdr.sh_flags & SHF_ALLOC)) + continue; + secstrs = secstrs_sym; /* @@ -1262,11 +1301,20 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss, goto out_elf_end; } else if ((used_opd && runtime_ss->adjust_symbols) || (!used_opd && syms_ss->adjust_symbols)) { + GElf_Phdr phdr; + + if (elf_read_program_header(syms_ss->elf, + (u64)sym.st_value, &phdr)) { + pr_warning("%s: failed to find program header for " + "symbol: %s st_value: %#" PRIx64 "\n", + __func__, elf_name, (u64)sym.st_value); + continue; + } pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " " - "sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__, - (u64)sym.st_value, (u64)shdr.sh_addr, - (u64)shdr.sh_offset); - sym.st_value -= shdr.sh_addr - shdr.sh_offset; + "p_vaddr: %#" PRIx64 " p_offset: %#" PRIx64 "\n", + __func__, (u64)sym.st_value, (u64)phdr.p_vaddr, + (u64)phdr.p_offset); + sym.st_value -= phdr.p_vaddr - phdr.p_offset; } demangled = demangle_sym(dso, kmodule, elf_name); diff --git a/tools/power/pm-graph/README b/tools/power/pm-graph/README index da468bd510ca..e6020c0d59ec 100644 --- a/tools/power/pm-graph/README +++ b/tools/power/pm-graph/README @@ -6,7 +6,7 @@ |_| |___/ |_| pm-graph: 
suspend/resume/boot timing analysis tools - Version: 5.8 + Version: 5.9 Author: Todd Brandt <todd.e.brandt@intel.com> Home Page: https://01.org/pm-graph @@ -97,8 +97,8 @@ (kernel/pre-3.15/enable_trace_events_suspend_resume.patch) (kernel/pre-3.15/enable_trace_events_device_pm_callback.patch) - If you're using a kernel older than 3.15.0, the following - additional kernel parameters are required: + If you're using bootgraph, or sleepgraph with a kernel older than 3.15.0, + the following additional kernel parameters are required: (e.g. in file /etc/default/grub) GRUB_CMDLINE_LINUX_DEFAULT="... initcall_debug log_buf_len=32M ..." diff --git a/tools/power/pm-graph/bootgraph.py b/tools/power/pm-graph/bootgraph.py index 2823cd3122f7..f96f50e0c336 100755 --- a/tools/power/pm-graph/bootgraph.py +++ b/tools/power/pm-graph/bootgraph.py @@ -69,22 +69,24 @@ class SystemValues(aslib.SystemValues): bootloader = 'grub' blexec = [] def __init__(self): - self.hostname = platform.node() + self.kernel, self.hostname = 'unknown', platform.node() self.testtime = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') if os.path.exists('/proc/version'): fp = open('/proc/version', 'r') - val = fp.read().strip() + self.kernel = self.kernelVersion(fp.read().strip()) fp.close() - self.kernel = self.kernelVersion(val) - else: - self.kernel = 'unknown' self.testdir = datetime.now().strftime('boot-%y%m%d-%H%M%S') def kernelVersion(self, msg): - return msg.split()[2] + m = re.match('^[Ll]inux *[Vv]ersion *(?P<v>\S*) .*', msg) + if m: + return m.group('v') + return 'unknown' def checkFtraceKernelVersion(self): - val = tuple(map(int, self.kernel.split('-')[0].split('.'))) - if val >= (4, 10, 0): - return True + m = re.match('^(?P<x>[0-9]*)\.(?P<y>[0-9]*)\.(?P<z>[0-9]*).*', self.kernel) + if m: + val = tuple(map(int, m.groups())) + if val >= (4, 10, 0): + return True return False def kernelParams(self): cmdline = 'initcall_debug log_buf_len=32M' diff --git 
a/tools/power/pm-graph/config/custom-timeline-functions.cfg b/tools/power/pm-graph/config/custom-timeline-functions.cfg index 962e5768681c..4f80ad7d7275 100644 --- a/tools/power/pm-graph/config/custom-timeline-functions.cfg +++ b/tools/power/pm-graph/config/custom-timeline-functions.cfg @@ -125,7 +125,7 @@ acpi_suspend_begin: suspend_console: acpi_pm_prepare: syscore_suspend: -arch_thaw_secondary_cpus_end: +arch_enable_nonboot_cpus_end: syscore_resume: acpi_pm_finish: resume_console: diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index ffd50953a024..33981adcdd68 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -66,8 +66,13 @@ from threading import Thread from subprocess import call, Popen, PIPE import base64 +debugtiming = False +mystarttime = time.time() def pprint(msg): - print(msg) + if debugtiming: + print('[%09.3f] %s' % (time.time()-mystarttime, msg)) + else: + print(msg) sys.stdout.flush() def ascii(text): @@ -81,13 +86,14 @@ def ascii(text): # store system values and test parameters class SystemValues: title = 'SleepGraph' - version = '5.8' + version = '5.9' ansi = False rs = 0 display = '' gzip = False sync = False wifi = False + netfix = False verbose = False testlog = True dmesglog = True @@ -108,6 +114,7 @@ class SystemValues: cpucount = 0 memtotal = 204800 memfree = 204800 + osversion = '' srgap = 0 cgexp = False testdir = '' @@ -116,6 +123,7 @@ class SystemValues: fpdtpath = '/sys/firmware/acpi/tables/FPDT' epath = '/sys/kernel/debug/tracing/events/power/' pmdpath = '/sys/power/pm_debug_messages' + s0ixpath = '/sys/module/intel_pmc_core/parameters/warn_on_s0ix_failures' acpipath='/sys/module/acpi/parameters/debug_level' traceevents = [ 'suspend_resume', @@ -156,6 +164,7 @@ class SystemValues: ftop = False usetraceevents = False usetracemarkers = True + useftrace = True usekprobes = True usedevsrc = False useprocmon = False @@ -279,10 +288,16 @@ class SystemValues: 
'intel_fbdev_set_suspend': {}, } infocmds = [ + [0, 'sysinfo', 'uname', '-a'], + [0, 'cpuinfo', 'head', '-7', '/proc/cpuinfo'], [0, 'kparams', 'cat', '/proc/cmdline'], [0, 'mcelog', 'mcelog'], [0, 'pcidevices', 'lspci', '-tv'], - [0, 'usbdevices', 'lsusb', '-t'], + [0, 'usbdevices', 'lsusb', '-tv'], + [0, 'acpidevices', 'sh', '-c', 'ls -l /sys/bus/acpi/devices/*/physical_node'], + [0, 's0ix_require', 'cat', '/sys/kernel/debug/pmc_core/substate_requirements'], + [0, 's0ix_debug', 'cat', '/sys/kernel/debug/pmc_core/slp_s0_debug_status'], + [1, 's0ix_residency', 'cat', '/sys/kernel/debug/pmc_core/slp_s0_residency_usec'], [1, 'interrupts', 'cat', '/proc/interrupts'], [1, 'wakeups', 'cat', '/sys/kernel/debug/wakeup_sources'], [2, 'gpecounts', 'sh', '-c', 'grep -v invalid /sys/firmware/acpi/interrupts/*'], @@ -358,8 +373,19 @@ class SystemValues: self.outputResult({'error':msg}) sys.exit(1) return False - def usable(self, file): - return (os.path.exists(file) and os.path.getsize(file) > 0) + def usable(self, file, ishtml=False): + if not os.path.exists(file) or os.path.getsize(file) < 1: + return False + if ishtml: + try: + fp = open(file, 'r') + res = fp.read(1000) + fp.close() + except: + return False + if '<html>' not in res: + return False + return True def getExec(self, cmd): try: fp = Popen(['which', cmd], stdout=PIPE, stderr=PIPE).stdout @@ -413,12 +439,16 @@ class SystemValues: r = info['bios-release-date'] if 'bios-release-date' in info else '' self.sysstamp = '# sysinfo | man:%s | plat:%s | cpu:%s | bios:%s | biosdate:%s | numcpu:%d | memsz:%d | memfr:%d' % \ (m, p, c, b, r, self.cpucount, self.memtotal, self.memfree) + if self.osversion: + self.sysstamp += ' | os:%s' % self.osversion def printSystemInfo(self, fatal=False): self.rootCheck(True) out = dmidecode(self.mempath, fatal) if len(out) < 1: return fmt = '%-24s: %s' + if self.osversion: + print(fmt % ('os-version', self.osversion)) for name in sorted(out): print(fmt % (name, out[name])) print(fmt % 
('cpucount', ('%d' % self.cpucount))) @@ -426,20 +456,25 @@ class SystemValues: print(fmt % ('memfree', ('%d kB' % self.memfree))) def cpuInfo(self): self.cpucount = 0 - fp = open('/proc/cpuinfo', 'r') - for line in fp: - if re.match('^processor[ \t]*:[ \t]*[0-9]*', line): - self.cpucount += 1 - fp.close() - fp = open('/proc/meminfo', 'r') - for line in fp: - m = re.match('^MemTotal:[ \t]*(?P<sz>[0-9]*) *kB', line) - if m: - self.memtotal = int(m.group('sz')) - m = re.match('^MemFree:[ \t]*(?P<sz>[0-9]*) *kB', line) - if m: - self.memfree = int(m.group('sz')) - fp.close() + if os.path.exists('/proc/cpuinfo'): + with open('/proc/cpuinfo', 'r') as fp: + for line in fp: + if re.match('^processor[ \t]*:[ \t]*[0-9]*', line): + self.cpucount += 1 + if os.path.exists('/proc/meminfo'): + with open('/proc/meminfo', 'r') as fp: + for line in fp: + m = re.match('^MemTotal:[ \t]*(?P<sz>[0-9]*) *kB', line) + if m: + self.memtotal = int(m.group('sz')) + m = re.match('^MemFree:[ \t]*(?P<sz>[0-9]*) *kB', line) + if m: + self.memfree = int(m.group('sz')) + if os.path.exists('/etc/os-release'): + with open('/etc/os-release', 'r') as fp: + for line in fp: + if line.startswith('PRETTY_NAME='): + self.osversion = line[12:].strip().replace('"', '') def initTestOutput(self, name): self.prefix = self.hostname v = open('/proc/version', 'r').read().strip() @@ -698,6 +733,8 @@ class SystemValues: return False return True def fsetVal(self, val, path): + if not self.useftrace: + return False return self.setVal(val, self.tpath+path) def getVal(self, file): res = '' @@ -711,9 +748,11 @@ class SystemValues: pass return res def fgetVal(self, path): + if not self.useftrace: + return '' return self.getVal(self.tpath+path) def cleanupFtrace(self): - if(self.usecallgraph or self.usetraceevents or self.usedevsrc): + if self.useftrace: self.fsetVal('0', 'events/kprobes/enable') self.fsetVal('', 'kprobe_events') self.fsetVal('1024', 'buffer_size_kb') @@ -734,13 +773,14 @@ class SystemValues: return True 
return False def initFtrace(self, quiet=False): + if not self.useftrace: + return if not quiet: sysvals.printSystemInfo(False) pprint('INITIALIZING FTRACE...') # turn trace off self.fsetVal('0', 'tracing_on') self.cleanupFtrace() - self.testVal(self.pmdpath, 'basic', '1') # set the trace clock to global self.fsetVal('global', 'trace_clock') self.fsetVal('nop', 'current_tracer') @@ -766,6 +806,10 @@ class SystemValues: # set trace type self.fsetVal('function_graph', 'current_tracer') self.fsetVal('', 'set_ftrace_filter') + # temporary hack to fix https://bugzilla.kernel.org/show_bug.cgi?id=212761 + fp = open(self.tpath+'set_ftrace_notrace', 'w') + fp.write('native_queued_spin_lock_slowpath\ndev_driver_string') + fp.close() # set trace format options self.fsetVal('print-parent', 'trace_options') self.fsetVal('funcgraph-abstime', 'trace_options') @@ -846,6 +890,8 @@ class SystemValues: fp.write('# turbostat %s\n' % test['turbo']) if 'wifi' in test: fp.write('# wifi %s\n' % test['wifi']) + if 'netfix' in test: + fp.write('# netfix %s\n' % test['netfix']) if test['error'] or len(testdata) > 1: fp.write('# enter_sleep_error %s\n' % test['error']) return fp @@ -865,6 +911,8 @@ class SystemValues: fp.write('error%s: %s\n' % (n, testdata['error'])) else: fp.write('result%s: pass\n' % n) + if 'mode' in testdata: + fp.write('mode%s: %s\n' % (n, testdata['mode'])) for v in ['suspend', 'resume', 'boot', 'lastinit']: if v in testdata: fp.write('%s%s: %.3f\n' % (v, n, testdata[v])) @@ -901,6 +949,8 @@ class SystemValues: fp.write(text) fp.close() def dlog(self, text): + if not self.dmesgfile: + return self.putlog(self.dmesgfile, '# %s\n' % text) def flog(self, text): self.putlog(self.ftracefile, text) @@ -954,34 +1004,31 @@ class SystemValues: dirname = props[dev].syspath if not dirname or not os.path.exists(dirname): continue - with open(dirname+'/power/async') as fp: - text = fp.read() - props[dev].isasync = False - if 'enabled' in text: + props[dev].isasync = False + if 
os.path.exists(dirname+'/power/async'): + fp = open(dirname+'/power/async') + if 'enabled' in fp.read(): props[dev].isasync = True + fp.close() fields = os.listdir(dirname) - if 'product' in fields: - with open(dirname+'/product', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'name' in fields: - with open(dirname+'/name', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'model' in fields: - with open(dirname+'/model', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'description' in fields: - with open(dirname+'/description', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'id' in fields: - with open(dirname+'/id', 'rb') as fp: - props[dev].altname = ascii(fp.read()) - elif 'idVendor' in fields and 'idProduct' in fields: - idv, idp = '', '' - with open(dirname+'/idVendor', 'rb') as fp: - idv = ascii(fp.read()).strip() - with open(dirname+'/idProduct', 'rb') as fp: - idp = ascii(fp.read()).strip() - props[dev].altname = '%s:%s' % (idv, idp) + for file in ['product', 'name', 'model', 'description', 'id', 'idVendor']: + if file not in fields: + continue + try: + with open(os.path.join(dirname, file), 'rb') as fp: + props[dev].altname = ascii(fp.read()) + except: + continue + if file == 'idVendor': + idv, idp = props[dev].altname.strip(), '' + try: + with open(os.path.join(dirname, 'idProduct'), 'rb') as fp: + idp = ascii(fp.read()).strip() + except: + props[dev].altname = '' + break + props[dev].altname = '%s:%s' % (idv, idp) + break if props[dev].altname: out = props[dev].altname.strip().replace('\n', ' ')\ .replace(',', ' ').replace(';', ' ') @@ -1047,7 +1094,7 @@ class SystemValues: self.cmd1[name] = self.dictify(info, delta) elif not debug and delta and name in self.cmd1: before, after = self.cmd1[name], self.dictify(info, delta) - dinfo = ('\t%s\n' % before['@']) if '@' in before else '' + dinfo = ('\t%s\n' % before['@']) if '@' in before and len(before) > 1 else '' prefix = self.commonPrefix(list(before.keys())) 
for key in sorted(before): if key in after and before[key] != after[key]: @@ -1128,6 +1175,22 @@ class SystemValues: val = valline[idx] out.append('%s=%s' % (key, val)) return '|'.join(out) + def netfixon(self, net='both'): + cmd = self.getExec('netfix') + if not cmd: + return '' + fp = Popen([cmd, '-s', net, 'on'], stdout=PIPE, stderr=PIPE).stdout + out = ascii(fp.read()).strip() + fp.close() + return out + def wifiRepair(self): + out = self.netfixon('wifi') + if not out or 'error' in out.lower(): + return '' + m = re.match('WIFI \S* ONLINE (?P<action>\S*)', out) + if not m: + return 'dead' + return m.group('action') def wifiDetails(self, dev): try: info = open('/sys/class/net/%s/device/uevent' % dev, 'r').read().strip() @@ -1144,12 +1207,12 @@ class SystemValues: except: return '' for line in reversed(w.split('\n')): - m = re.match(' *(?P<dev>.*): (?P<stat>[0-9a-f]*) .*', w.split('\n')[-1]) + m = re.match(' *(?P<dev>.*): (?P<stat>[0-9a-f]*) .*', line) if not m or (dev and dev != m.group('dev')): continue return m.group('dev') return '' - def pollWifi(self, dev, timeout=60): + def pollWifi(self, dev, timeout=10): start = time.time() while (time.time() - start) < timeout: w = self.checkWifi(dev) @@ -1157,6 +1220,11 @@ class SystemValues: return '%s reconnected %.2f' % \ (self.wifiDetails(dev), max(0, time.time() - start)) time.sleep(0.01) + if self.netfix: + res = self.wifiRepair() + if res: + timeout = max(0, time.time() - start) + return '%s %s %d' % (self.wifiDetails(dev), res, timeout) return '%s timeout %d' % (self.wifiDetails(dev), timeout) def errorSummary(self, errinfo, msg): found = False @@ -1283,10 +1351,10 @@ sysvals = SystemValues() switchvalues = ['enable', 'disable', 'on', 'off', 'true', 'false', '1', '0'] switchoff = ['disable', 'off', 'false', '0'] suspendmodename = { - 'freeze': 'Freeze (S0)', - 'standby': 'Standby (S1)', - 'mem': 'Suspend (S3)', - 'disk': 'Hibernate (S4)' + 'standby': 'standby (S1)', + 'freeze': 'freeze (S2idle)', + 'mem': 
'suspend (S3)', + 'disk': 'hibernate (S4)' } # Class: DevProps @@ -1376,6 +1444,7 @@ class Data: 'INVALID' : r'(?i).*\bINVALID\b.*', 'CRASH' : r'(?i).*\bCRASHED\b.*', 'TIMEOUT' : r'(?i).*\bTIMEOUT\b.*', + 'ABORT' : r'(?i).*\bABORT\b.*', 'IRQ' : r'.*\bgenirq: .*', 'TASKFAIL': r'.*Freezing of tasks *.*', 'ACPI' : r'.*\bACPI *(?P<b>[A-Za-z]*) *Error[: ].*', @@ -1724,9 +1793,9 @@ class Data: if 'waking' in self.dmesg[lp]: tCnt = self.dmesg[lp]['waking'][0] if self.dmesg[lp]['waking'][1] >= 0.001: - tTry = '-%.0f' % (round(self.dmesg[lp]['waking'][1] * 1000)) + tTry = '%.0f' % (round(self.dmesg[lp]['waking'][1] * 1000)) else: - tTry = '-%.3f' % (self.dmesg[lp]['waking'][1] * 1000) + tTry = '%.3f' % (self.dmesg[lp]['waking'][1] * 1000) text = '%.0f (%s ms waking %d times)' % (tL * 1000, tTry, tCnt) else: text = '%.0f' % (tL * 1000) @@ -2107,6 +2176,30 @@ class Data: # set resume complete to end at end marker if 'resume_complete' in dm: dm['resume_complete']['end'] = time + def initcall_debug_call(self, line, quick=False): + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) .* (?P<f>.*)\: '+\ + 'PM: *calling .* @ (?P<n>.*), parent: (?P<p>.*)', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) .* (?P<f>.*)\: '+\ + 'calling .* @ (?P<n>.*), parent: (?P<p>.*)', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) calling '+\ + '(?P<f>.*)\+ @ (?P<n>.*), parent: (?P<p>.*)', line) + if m: + return True if quick else m.group('t', 'f', 'n', 'p') + return False if quick else ('', '', '', '') + def initcall_debug_return(self, line, quick=False): + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) .* (?P<f>.*)\: PM: '+\ + '.* returned (?P<r>[0-9]*) after (?P<dt>[0-9]*) usecs', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) .* (?P<f>.*)\: '+\ + '.* returned (?P<r>[0-9]*) after (?P<dt>[0-9]*) usecs', line) + if not m: + m = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) call '+\ + '(?P<f>.*)\+ returned .* after (?P<dt>.*) usecs', line) + if m: + return True if quick else 
m.group('t', 'f', 'dt') + return False if quick else ('', '', '') def debugPrint(self): for p in self.sortedPhases(): list = self.dmesg[p]['list'] @@ -2880,10 +2973,11 @@ class TestProps: cmdlinefmt = '^# command \| (?P<cmd>.*)' kparamsfmt = '^# kparams \| (?P<kp>.*)' devpropfmt = '# Device Properties: .*' - pinfofmt = '# platform-(?P<val>[a-z,A-Z,0-9]*): (?P<info>.*)' + pinfofmt = '# platform-(?P<val>[a-z,A-Z,0-9,_]*): (?P<info>.*)' tracertypefmt = '# tracer: (?P<t>.*)' firmwarefmt = '# fwsuspend (?P<s>[0-9]*) fwresume (?P<r>[0-9]*)$' procexecfmt = 'ps - (?P<ps>.*)$' + procmultifmt = '@(?P<n>[0-9]*)\|(?P<ps>.*)$' ftrace_line_fmt_fg = \ '^ *(?P<time>[0-9\.]*) *\| *(?P<cpu>[0-9]*)\)'+\ ' *(?P<proc>.*)-(?P<pid>[0-9]*) *\|'+\ @@ -2893,6 +2987,9 @@ class TestProps: '(?P<flags>\S*) *(?P<time>[0-9\.]*): *'+\ '(?P<msg>.*)' machinesuspend = 'machine_suspend\[.*' + multiproclist = dict() + multiproctime = 0.0 + multiproccnt = 0 def __init__(self): self.stamp = '' self.sysinfo = '' @@ -3063,6 +3160,7 @@ class TestRun: self.ttemp = dict() class ProcessMonitor: + maxchars = 512 def __init__(self): self.proclist = dict() self.running = False @@ -3088,19 +3186,23 @@ class ProcessMonitor: if ujiff > 0 or kjiff > 0: running[pid] = ujiff + kjiff process.wait() - out = '' + out = [''] for pid in running: jiffies = running[pid] val = self.proclist[pid] - if out: - out += ',' - out += '%s-%s %d' % (val['name'], pid, jiffies) - return 'ps - '+out + if len(out[-1]) > self.maxchars: + out.append('') + elif len(out[-1]) > 0: + out[-1] += ',' + out[-1] += '%s-%s %d' % (val['name'], pid, jiffies) + if len(out) > 1: + for line in out: + sysvals.fsetVal('ps - @%d|%s' % (len(out), line), 'trace_marker') + else: + sysvals.fsetVal('ps - %s' % out[0], 'trace_marker') def processMonitor(self, tid): while self.running: - out = self.procstat() - if out: - sysvals.fsetVal(out, 'trace_marker') + self.procstat() def start(self): self.thread = Thread(target=self.processMonitor, args=(0,)) self.running = 
True @@ -3144,7 +3246,6 @@ def doesTraceLogHaveTraceEvents(): # Function: appendIncompleteTraceLog # Description: -# [deprecated for kernel 3.15 or newer] # Adds callgraph data which lacks trace event data. This is only # for timelines generated from 3.15 or older # Arguments: @@ -3246,6 +3347,61 @@ def appendIncompleteTraceLog(testruns): dev['ftrace'] = cg break +# Function: loadTraceLog +# Description: +# load the ftrace file into memory and fix up any ordering issues +# Output: +# TestProps instance and an array of lines in proper order +def loadTraceLog(): + tp, data, lines, trace = TestProps(), dict(), [], [] + tf = sysvals.openlog(sysvals.ftracefile, 'r') + for line in tf: + # remove any latent carriage returns + line = line.replace('\r\n', '') + if tp.stampInfo(line, sysvals): + continue + # ignore all other commented lines + if line[0] == '#': + continue + # ftrace line: parse only valid lines + m = re.match(tp.ftrace_line_fmt, line) + if(not m): + continue + dur = m.group('dur') if tp.cgformat else 'traceevent' + info = (m.group('time'), m.group('proc'), m.group('pid'), + m.group('msg'), dur) + # group the data by timestamp + t = float(info[0]) + if t in data: + data[t].append(info) + else: + data[t] = [info] + # we only care about trace event ordering + if (info[3].startswith('suspend_resume:') or \ + info[3].startswith('tracing_mark_write:')) and t not in trace: + trace.append(t) + tf.close() + for t in sorted(data): + first, last, blk = [], [], data[t] + if len(blk) > 1 and t in trace: + # move certain lines to the start or end of a timestamp block + for i in range(len(blk)): + if 'SUSPEND START' in blk[i][3]: + first.append(i) + elif re.match('.* timekeeping_freeze.*begin', blk[i][3]): + last.append(i) + elif re.match('.* timekeeping_freeze.*end', blk[i][3]): + first.append(i) + elif 'RESUME COMPLETE' in blk[i][3]: + last.append(i) + if len(first) == 1 and len(last) == 0: + blk.insert(0, blk.pop(first[0])) + elif len(last) == 1 and len(first) == 0: + 
blk.append(blk.pop(last[0])) + for info in blk: + lines.append(info) + return (tp, lines) + # Function: parseTraceLog # Description: # Analyze an ftrace log output file generated from this app during @@ -3271,32 +3427,12 @@ def parseTraceLog(live=False): # extract the callgraph and traceevent data s2idle_enter = hwsus = False - tp = TestProps() testruns, testdata = [], [] testrun, data, limbo = 0, 0, True - tf = sysvals.openlog(sysvals.ftracefile, 'r') phase = 'suspend_prepare' - for line in tf: - # remove any latent carriage returns - line = line.replace('\r\n', '') - if tp.stampInfo(line, sysvals): - continue - # ignore all other commented lines - if line[0] == '#': - continue - # ftrace line: parse only valid lines - m = re.match(tp.ftrace_line_fmt, line) - if(not m): - continue + tp, tf = loadTraceLog() + for m_time, m_proc, m_pid, m_msg, m_param3 in tf: # gather the basic message data from the line - m_time = m.group('time') - m_proc = m.group('proc') - m_pid = m.group('pid') - m_msg = m.group('msg') - if(tp.cgformat): - m_param3 = m.group('dur') - else: - m_param3 = 'traceevent' if(m_time and m_pid and m_msg): t = FTraceLine(m_time, m_msg, m_param3) pid = int(m_pid) @@ -3322,14 +3458,29 @@ def parseTraceLog(live=False): if t.type == 'tracing_mark_write': m = re.match(tp.procexecfmt, t.name) if(m): - proclist = dict() - for ps in m.group('ps').split(','): + parts, msg = 1, m.group('ps') + m = re.match(tp.procmultifmt, msg) + if(m): + parts, msg = int(m.group('n')), m.group('ps') + if tp.multiproccnt == 0: + tp.multiproctime = t.time + tp.multiproclist = dict() + proclist = tp.multiproclist + tp.multiproccnt += 1 + else: + proclist = dict() + tp.multiproccnt = 0 + for ps in msg.split(','): val = ps.split() - if not val: + if not val or len(val) != 2: continue name = val[0].replace('--', '-') proclist[name] = int(val[1]) - data.pstl[t.time] = proclist + if parts == 1: + data.pstl[t.time] = proclist + elif parts == tp.multiproccnt: + data.pstl[tp.multiproctime] = 
proclist + tp.multiproccnt = 0 continue # find the end of resume if(t.endMarker()): @@ -3545,7 +3696,6 @@ def parseTraceLog(live=False): testrun.ftemp[key].append(FTraceCallGraph(pid, sysvals)) if(res == -1): testrun.ftemp[key][-1].addLine(t) - tf.close() if len(testdata) < 1: sysvals.vprint('WARNING: ftrace start marker is missing') if data and not data.devicegroups: @@ -3667,7 +3817,13 @@ def parseTraceLog(live=False): if p not in data.dmesg: if not terr: ph = p if 'machine' in p else lp - terr = '%s%s failed in %s phase' % (sysvals.suspendmode, tn, ph) + if p == 'suspend_machine': + sm = sysvals.suspendmode + if sm in suspendmodename: + sm = suspendmodename[sm] + terr = 'test%s did not enter %s power mode' % (tn, sm) + else: + terr = '%s%s failed in %s phase' % (sysvals.suspendmode, tn, ph) pprint('TEST%s FAILED: %s' % (tn, terr)) error.append(terr) if data.tSuspended == 0: @@ -3708,9 +3864,7 @@ def parseTraceLog(live=False): # Function: loadKernelLog # Description: -# [deprecated for kernel 3.15.0 or newer] # load the dmesg file into memory and fix up any ordering issues -# The dmesg filename is taken from sysvals # Output: # An array of empty Data objects with only their dmesgtext attributes set def loadKernelLog(): @@ -3736,7 +3890,8 @@ def loadKernelLog(): if(not m): continue msg = m.group("msg") - if(re.match('PM: Syncing filesystems.*', msg)): + if re.match('PM: Syncing filesystems.*', msg) or \ + re.match('PM: suspend entry.*', msg): if(data): testruns.append(data) data = Data(len(testruns)) @@ -3747,11 +3902,17 @@ def loadKernelLog(): if(m): sysvals.stamp['kernel'] = m.group('k') m = re.match('PM: Preparing system for (?P<m>.*) sleep', msg) - if(m): + if not m: + m = re.match('PM: Preparing system for sleep \((?P<m>.*)\)', msg) + if m: sysvals.stamp['mode'] = sysvals.suspendmode = m.group('m') data.dmesgtext.append(line) lf.close() + if sysvals.suspendmode == 's2idle': + sysvals.suspendmode = 'freeze' + elif sysvals.suspendmode == 'deep': + 
sysvals.suspendmode = 'mem' if data: testruns.append(data) if len(testruns) < 1: @@ -3762,12 +3923,9 @@ def loadKernelLog(): for data in testruns: last = '' for line in data.dmesgtext: - mc = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) calling '+\ - '(?P<f>.*)\+ @ .*, parent: .*', line) - mr = re.match('.*(\[ *)(?P<t>[0-9\.]*)(\]) call '+\ - '(?P<f>.*)\+ returned .* after (?P<dt>.*) usecs', last) - if(mc and mr and (mc.group('t') == mr.group('t')) and - (mc.group('f') == mr.group('f'))): + ct, cf, n, p = data.initcall_debug_call(line) + rt, rf, l = data.initcall_debug_return(last) + if ct and rt and ct == rt and cf == rf: i = data.dmesgtext.index(last) j = data.dmesgtext.index(line) data.dmesgtext[i] = line @@ -3777,7 +3935,6 @@ def loadKernelLog(): # Function: parseKernelLog # Description: -# [deprecated for kernel 3.15.0 or newer] # Analyse a dmesg log output file generated from this app during # the execution phase. Create a set of device structures in memory # for subsequent formatting in the html output file @@ -3796,30 +3953,30 @@ def parseKernelLog(data): # dmesg phase match table dm = { - 'suspend_prepare': ['PM: Syncing filesystems.*'], - 'suspend': ['PM: Entering [a-z]* sleep.*', 'Suspending console.*'], - 'suspend_late': ['PM: suspend of devices complete after.*'], - 'suspend_noirq': ['PM: late suspend of devices complete after.*'], - 'suspend_machine': ['PM: noirq suspend of devices complete after.*'], - 'resume_machine': ['ACPI: Low-level resume complete.*'], - 'resume_noirq': ['ACPI: Waking up from system sleep state.*'], - 'resume_early': ['PM: noirq resume of devices complete after.*'], - 'resume': ['PM: early resume of devices complete after.*'], - 'resume_complete': ['PM: resume of devices complete after.*'], + 'suspend_prepare': ['PM: Syncing filesystems.*', 'PM: suspend entry.*'], + 'suspend': ['PM: Entering [a-z]* sleep.*', 'Suspending console.*', + 'PM: Suspending system .*'], + 'suspend_late': ['PM: suspend of devices complete after.*', + 'PM: 
freeze of devices complete after.*'], + 'suspend_noirq': ['PM: late suspend of devices complete after.*', + 'PM: late freeze of devices complete after.*'], + 'suspend_machine': ['PM: suspend-to-idle', + 'PM: noirq suspend of devices complete after.*', + 'PM: noirq freeze of devices complete after.*'], + 'resume_machine': ['PM: Timekeeping suspended for.*', + 'ACPI: Low-level resume complete.*', + 'ACPI: resume from mwait', + 'Suspended for [0-9\.]* seconds'], + 'resume_noirq': ['PM: resume from suspend-to-idle', + 'ACPI: Waking up from system sleep state.*'], + 'resume_early': ['PM: noirq resume of devices complete after.*', + 'PM: noirq restore of devices complete after.*'], + 'resume': ['PM: early resume of devices complete after.*', + 'PM: early restore of devices complete after.*'], + 'resume_complete': ['PM: resume of devices complete after.*', + 'PM: restore of devices complete after.*'], 'post_resume': ['.*Restarting tasks \.\.\..*'], } - if(sysvals.suspendmode == 'standby'): - dm['resume_machine'] = ['PM: Restoring platform NVS memory'] - elif(sysvals.suspendmode == 'disk'): - dm['suspend_late'] = ['PM: freeze of devices complete after.*'] - dm['suspend_noirq'] = ['PM: late freeze of devices complete after.*'] - dm['suspend_machine'] = ['PM: noirq freeze of devices complete after.*'] - dm['resume_machine'] = ['PM: Restoring platform NVS memory'] - dm['resume_early'] = ['PM: noirq restore of devices complete after.*'] - dm['resume'] = ['PM: early restore of devices complete after.*'] - dm['resume_complete'] = ['PM: restore of devices complete after.*'] - elif(sysvals.suspendmode == 'freeze'): - dm['resume_machine'] = ['ACPI: resume from mwait'] # action table (expected events that occur and show up in dmesg) at = { @@ -3867,12 +4024,13 @@ def parseKernelLog(data): for s in dm[p]: if(re.match(s, msg)): phasechange, phase = True, p + dm[p] = [s] break # hack for determining resume_machine end for freeze if(not sysvals.usetraceevents and sysvals.suspendmode == 
'freeze' \ and phase == 'resume_machine' and \ - re.match('calling (?P<f>.*)\+ @ .*, parent: .*', msg)): + data.initcall_debug_call(line, True)): data.setPhase(phase, ktime, False) phase = 'resume_noirq' data.setPhase(phase, ktime, True) @@ -3945,26 +4103,18 @@ def parseKernelLog(data): # -- device callbacks -- if(phase in data.sortedPhases()): # device init call - if(re.match('calling (?P<f>.*)\+ @ .*, parent: .*', msg)): - sm = re.match('calling (?P<f>.*)\+ @ '+\ - '(?P<n>.*), parent: (?P<p>.*)', msg); - f = sm.group('f') - n = sm.group('n') - p = sm.group('p') - if(f and n and p): - data.newAction(phase, f, int(n), p, ktime, -1, '') - # device init return - elif(re.match('call (?P<f>.*)\+ returned .* after '+\ - '(?P<t>.*) usecs', msg)): - sm = re.match('call (?P<f>.*)\+ returned .* after '+\ - '(?P<t>.*) usecs(?P<a>.*)', msg); - f = sm.group('f') - t = sm.group('t') - list = data.dmesg[phase]['list'] - if(f in list): - dev = list[f] - dev['length'] = int(t) - dev['end'] = ktime + t, f, n, p = data.initcall_debug_call(line) + if t and f and n and p: + data.newAction(phase, f, int(n), p, ktime, -1, '') + else: + # device init return + t, f, l = data.initcall_debug_return(line) + if t and f and l: + list = data.dmesg[phase]['list'] + if(f in list): + dev = list[f] + dev['length'] = int(l) + dev['end'] = ktime # if trace events are not available, these are better than nothing if(not sysvals.usetraceevents): @@ -4006,6 +4156,8 @@ def parseKernelLog(data): # fill in any missing phases phasedef = data.phasedef terr, lp = '', 'suspend_prepare' + if lp not in data.dmesg: + doError('dmesg log format has changed, could not find start of suspend') for p in sorted(phasedef, key=lambda k:phasedef[k]['order']): if p not in data.dmesg: if not terr: @@ -5302,7 +5454,7 @@ def executeSuspend(quiet=False): sv.dlog('read dmesg') sv.initdmesg() # start ftrace - if(sv.usecallgraph or sv.usetraceevents): + if sv.useftrace: if not quiet: pprint('START TRACING') sv.dlog('start ftrace 
tracing') @@ -5334,8 +5486,7 @@ def executeSuspend(quiet=False): sv.dlog('enable RTC wake alarm') sv.rtcWakeAlarmOn() # start of suspend trace marker - if(sv.usecallgraph or sv.usetraceevents): - sv.fsetVal(datetime.now().strftime(sv.tmstart), 'trace_marker') + sv.fsetVal(datetime.now().strftime(sv.tmstart), 'trace_marker') # predelay delay if(count == 1 and sv.predelay > 0): sv.fsetVal('WAIT %d' % sv.predelay, 'trace_marker') @@ -5384,11 +5535,17 @@ def executeSuspend(quiet=False): sv.fsetVal('WAIT END', 'trace_marker') # return from suspend pprint('RESUME COMPLETE') - if(sv.usecallgraph or sv.usetraceevents): - sv.fsetVal(datetime.now().strftime(sv.tmend), 'trace_marker') + sv.fsetVal(datetime.now().strftime(sv.tmend), 'trace_marker') if sv.wifi and wifi: tdata['wifi'] = sv.pollWifi(wifi) sv.dlog('wifi check, %s' % tdata['wifi']) + if sv.netfix: + netfixout = sv.netfixon('wired') + elif sv.netfix: + netfixout = sv.netfixon() + if sv.netfix and netfixout: + tdata['netfix'] = netfixout + sv.dlog('netfix, %s' % tdata['netfix']) if(sv.suspendmode == 'mem' or sv.suspendmode == 'command'): sv.dlog('read the ACPI FPDT') tdata['fw'] = getFPDT(False) @@ -5396,7 +5553,7 @@ def executeSuspend(quiet=False): sv.dlog('run the cmdinfo list after') cmdafter = sv.cmdinfo(False) # stop ftrace - if(sv.usecallgraph or sv.usetraceevents): + if sv.useftrace: if sv.useprocmon: sv.dlog('stop the process monitor') pm.stop() @@ -5407,7 +5564,7 @@ def executeSuspend(quiet=False): sysvals.dlog('EXECUTION TRACE END') sv.getdmesg(testdata) # grab a copy of the ftrace output - if(sv.usecallgraph or sv.usetraceevents): + if sv.useftrace: if not quiet: pprint('CAPTURING TRACE') op = sv.writeDatafileHeader(sv.ftracefile, testdata) @@ -5838,13 +5995,19 @@ def statusCheck(probecheck=False): pprint(' please choose one with -m') # check if ftrace is available - res = sysvals.colorText('NO') - ftgood = sysvals.verifyFtrace() - if(ftgood): - res = 'YES' - elif(sysvals.usecallgraph): - status = 'ftrace 
is not properly supported' - pprint(' is ftrace supported: %s' % res) + if sysvals.useftrace: + res = sysvals.colorText('NO') + sysvals.useftrace = sysvals.verifyFtrace() + efmt = '"{0}" uses ftrace, and it is not properly supported' + if sysvals.useftrace: + res = 'YES' + elif sysvals.usecallgraph: + status = efmt.format('-f') + elif sysvals.usedevsrc: + status = efmt.format('-dev') + elif sysvals.useprocmon: + status = efmt.format('-proc') + pprint(' is ftrace supported: %s' % res) # check if kprobes are available if sysvals.usekprobes: @@ -5857,8 +6020,8 @@ def statusCheck(probecheck=False): pprint(' are kprobes supported: %s' % res) # what data source are we using - res = 'DMESG' - if(ftgood): + res = 'DMESG (very limited, ftrace is preferred)' + if sysvals.useftrace: sysvals.usetraceevents = True for e in sysvals.traceevents: if not os.path.exists(sysvals.epath+e): @@ -5879,7 +6042,7 @@ def statusCheck(probecheck=False): pprint(' optional commands this tool may use for info:') no = sysvals.colorText('MISSING') yes = sysvals.colorText('FOUND', 32) - for c in ['turbostat', 'mcelog', 'lspci', 'lsusb']: + for c in ['turbostat', 'mcelog', 'lspci', 'lsusb', 'netfix']: if c == 'turbostat': res = yes if sysvals.haveTurbostat() else no else: @@ -5971,7 +6134,7 @@ def processData(live=False, quiet=False): if not sysvals.stamp: pprint('ERROR: data does not include the expected stamp') return (testruns, {'error': 'timeline generation failed'}) - shown = ['bios', 'biosdate', 'cpu', 'host', 'kernel', 'man', 'memfr', + shown = ['os', 'bios', 'biosdate', 'cpu', 'host', 'kernel', 'man', 'memfr', 'memsz', 'mode', 'numcpu', 'plat', 'time', 'wifi'] sysvals.vprint('System Info:') for key in sorted(sysvals.stamp): @@ -6052,6 +6215,8 @@ def runTest(n=0, quiet=False): if sysvals.display: ret = sysvals.displayControl('init') sysvals.dlog('xset display init, ret = %d' % ret) + sysvals.testVal(sysvals.pmdpath, 'basic', '1') + sysvals.testVal(sysvals.s0ixpath, 'basic', 'Y') 
sysvals.dlog('initialize ftrace') sysvals.initFtrace(quiet) @@ -6145,9 +6310,12 @@ def data_from_html(file, outpath, issues, fulldetail=False): elist[err[0]] += 1 for i in elist: ilist.append('%sx%d' % (i, elist[i]) if elist[i] > 1 else i) - wifi = find_in_html(html, 'Wifi Resume: ', '</td>') - if wifi: - extra['wifi'] = wifi + line = find_in_html(log, '# wifi ', '\n') + if line: + extra['wifi'] = line + line = find_in_html(log, '# netfix ', '\n') + if line: + extra['netfix'] = line low = find_in_html(html, 'freeze time: <b>', ' ms</b>') for lowstr in ['waking', '+']: if not low: @@ -6243,7 +6411,7 @@ def genHtml(subdir, force=False): sysvals.ftracefile = file sysvals.setOutputFile() if (sysvals.dmesgfile or sysvals.ftracefile) and sysvals.htmlfile and \ - (force or not sysvals.usable(sysvals.htmlfile)): + (force or not sysvals.usable(sysvals.htmlfile, True)): pprint('FTRACE: %s' % sysvals.ftracefile) if sysvals.dmesgfile: pprint('DMESG : %s' % sysvals.dmesgfile) @@ -6533,6 +6701,7 @@ def printHelp(): ' -skiphtml Run the test and capture the trace logs, but skip the timeline (default: disabled)\n'\ ' -result fn Export a results table to a text file for parsing.\n'\ ' -wifi If a wifi connection is available, check that it reconnects after resume.\n'\ + ' -netfix Use netfix to reset the network in the event it fails to resume.\n'\ ' [testprep]\n'\ ' -sync Sync the filesystems before starting the test\n'\ ' -rs on/off Enable/disable runtime suspend for all devices, restore all after test\n'\ @@ -6615,6 +6784,8 @@ if __name__ == '__main__': elif(arg == '-v'): pprint("Version %s" % sysvals.version) sys.exit(0) + elif(arg == '-debugtiming'): + debugtiming = True elif(arg == '-x2'): sysvals.execcount = 2 elif(arg == '-x2delay'): @@ -6657,6 +6828,8 @@ if __name__ == '__main__': sysvals.sync = True elif(arg == '-wifi'): sysvals.wifi = True + elif(arg == '-netfix'): + sysvals.netfix = True elif(arg == '-gzip'): sysvals.gzip = True elif(arg == '-info'): @@ -6819,7 +6992,7 @@ 
if __name__ == '__main__': sysvals.outdir = val sysvals.notestrun = True if(os.path.isdir(val) == False): - doError('%s is not accessible' % val) + doError('%s is not accesible' % val) elif(arg == '-filter'): try: val = next(args) @@ -6942,12 +7115,11 @@ if __name__ == '__main__': time.sleep(sysvals.multitest['delay']) fmt = 'suspend-%y%m%d-%H%M%S' sysvals.testdir = os.path.join(sysvals.outdir, datetime.now().strftime(fmt)) - ret = runTest(i+1, True) + ret = runTest(i+1, not sysvals.verbose) failcnt = 0 if not ret else failcnt + 1 if sysvals.maxfail > 0 and failcnt >= sysvals.maxfail: pprint('Maximum fail count of %d reached, aborting multitest' % (sysvals.maxfail)) break - time.sleep(5) sysvals.resetlog() sysvals.multistat(False, i, finish) if 'time' in sysvals.multitest and datetime.now() >= finish: diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 1e7d3de55a94..c7b26a3603af 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -92,40 +92,66 @@ displays the statistics gathered since it was forked. .SH ROW DESCRIPTIONS The system configuration dump (if --quiet is not used) is followed by statistics. The first row of the statistics labels the content of each column (below). The second row of statistics is the system summary line. The system summary line has a '-' in the columns for the Package, Core, and CPU. The contents of the system summary line depends on the type of column. Columns that count items (eg. IRQ) show the sum across all CPUs in the system. Columns that show a percentage show the average across all CPUs in the system. Columns that dump raw MSR values simply show 0 in the summary. After the system summary row, each row describes a specific Package/Core/CPU. 
Note that if the --cpu parameter is used to limit which specific CPUs are displayed, turbostat will still collect statistics for all CPUs in the system and will still show the system summary for all CPUs in the system. .SH COLUMN DESCRIPTIONS -.nf +.PP \fBusec\fP For each CPU, the number of microseconds elapsed during counter collection, including thread migration -- if any. This counter is disabled by default, and is enabled with "--enable usec", or --debug. On the summary row, usec refers to the total elapsed time to collect the counters on all cpus. +.PP \fBTime_Of_Day_Seconds\fP For each CPU, the gettimeofday(2) value (seconds.subsec since Epoch) when the counters ending the measurement interval were collected. This column is disabled by default, and can be enabled with "--enable Time_Of_Day_Seconds" or "--debug". On the summary row, Time_Of_Day_Seconds refers to the timestamp following collection of counters on the last CPU. +.PP \fBCore\fP processor core number. Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading Technology (HT). +.PP \fBCPU\fP Linux CPU (logical processor) number. Yes, it is okay that on many systems the CPUs are not listed in numerical order -- for efficiency reasons, turbostat runs in topology order, so HT siblings appear together. +.PP \fBPackage\fP processor package number -- not present on systems with a single processor package. +.PP \fBAvg_MHz\fP number of cycles executed divided by time elapsed. Note that this includes idle-time when 0 instructions are executed. +.PP \fBBusy%\fP percent of the measurement interval that the CPU executes instructions, aka. % of time in "C0" state. +.PP \fBBzy_MHz\fP average clock rate while the CPU was not idle (ie. in "c0" state). +.PP \fBTSC_MHz\fP average MHz that the TSC ran during the entire interval. +.PP \fBIRQ\fP The number of interrupts serviced by that CPU during the measurement interval. The system total line is the sum of interrupts serviced across all CPUs. 
turbostat parses /proc/interrupts to generate this summary. +.PP \fBSMI\fP The number of System Management Interrupts serviced CPU during the measurement interval. While this counter is actually per-CPU, SMI are triggered on all processors, so the number should be the same for all CPUs. +.PP \fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval. The system summary line shows the sum for all CPUs. These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name. While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system. +.PP \fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3.... The system summary is the average of all CPUs in the system. Note that these are software, reflecting what was requested. The hardware counters reflect what was actually achieved. +.PP \fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. These numbers are from hardware residency counters. +.PP \fBCoreTmp\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. +.PP \fBPkgTmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. +.PP \fBGFX%rc6\fP The percentage of time the GPU is in the "render C6" state, rc6, during the measurement interval. From /sys/class/drm/card0/power/rc6_residency_ms. +.PP \fBGFXMHz\fP Instantaneous snapshot of what sysfs presents at the end of the measurement interval. From /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz. +.PP \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. These numbers are from hardware residency counters. +.PP \fBPkgWatt\fP Watts consumed by the whole package. +.PP \fBCorWatt\fP Watts consumed by the core part of the package. 
+.PP \fBGFXWatt\fP Watts consumed by the Graphics part of the package -- available only on client processors. +.PP \fBRAMWatt\fP Watts consumed by the DRAM DIMMS -- available only on server processors. +.PP \fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. Note that the system summary is the sum of the package throttling time, and thus may be higher than 100% on a multi-package system. Note that the meaning of this field is model specific. For example, some hardware increments this counter when RAPL responds to thermal limits, but does not increment this counter when RAPL responds to power limits. Comparing PkgWatt and PkgTmp to system limits is necessary. +.PP \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. -.fi +.PP +\fBUncMHz\fP uncore MHz, instantaneous sample. .SH TOO MUCH INFORMATION EXAMPLE By default, turbostat dumps all possible information -- a system configuration header, followed by columns for all counters. This is ideal for remote debugging, use the "--out" option to save everything to a text file, and get that file to the expert helping you debug. .PP When you are not interested in all that information, and there are several ways to see only what you want. First the "--quiet" option will skip the configuration information, and turbostat will show only the counter columns. Second, you can reduce the columns with the "--hide" and "--show" options. If you use the "--show" option, then turbostat will show only the columns you list. If you use the "--hide" option, turbostat will show all columns, except the ones you list. .PP -To find out what columns are available for --show and --hide, the "--list" option is available. For convenience, the special strings "sysfs" can be used to refer to all of the sysfs C-state counters at once: +To find out what columns are available for --show and --hide, the "--list" option is available. 
Usually, the CATEGORY names above are used to refer to groups of counters. Also, for convenience, the special string "sysfs" can be used to refer to all of the sysfs C-state counters at once: +.PP .nf sudo ./turbostat --show sysfs --quiet sleep 10 10.003837 sec @@ -158,32 +184,29 @@ Without a command to fork, turbostat displays statistics ever 5 seconds. Periodic output goes to stdout, by default, unless --out is used to specify an output file. The 5-second interval can be changed with the "-i sec" option. .nf -sudo ./turbostat --quiet --hide sysfs,IRQ,SMI,CoreTmp,PkgTmp,GFX%rc6,GFXMHz,PkgWatt,CorWatt,GFXWatt - Core CPU Avg_MHz Busy% Bzy_MHz TSC_MHz CPU%c1 CPU%c3 CPU%c6 CPU%c7 - - - 488 12.52 3900 3498 12.50 0.00 0.00 74.98 - 0 0 5 0.13 3900 3498 99.87 0.00 0.00 0.00 - 0 4 3897 99.99 3900 3498 0.01 - 1 1 0 0.00 3856 3498 0.01 0.00 0.00 99.98 - 1 5 0 0.00 3861 3498 0.01 - 2 2 1 0.02 3889 3498 0.03 0.00 0.00 99.95 - 2 6 0 0.00 3863 3498 0.05 - 3 3 0 0.01 3869 3498 0.02 0.00 0.00 99.97 - 3 7 0 0.00 3878 3498 0.03 - Core CPU Avg_MHz Busy% Bzy_MHz TSC_MHz CPU%c1 CPU%c3 CPU%c6 CPU%c7 - - - 491 12.59 3900 3498 12.42 0.00 0.00 74.99 - 0 0 27 0.69 3900 3498 99.31 0.00 0.00 0.00 - 0 4 3898 99.99 3900 3498 0.01 - 1 1 0 0.00 3883 3498 0.01 0.00 0.00 99.99 - 1 5 0 0.00 3898 3498 0.01 - 2 2 0 0.01 3889 3498 0.02 0.00 0.00 99.98 - 2 6 0 0.00 3889 3498 0.02 - 3 3 0 0.00 3856 3498 0.01 0.00 0.00 99.99 - 3 7 0 0.00 3897 3498 0.01 +sudo turbostat --quiet --show CPU,frequency + Core CPU Avg_MHz Busy% Bzy_MHz TSC_MHz CPU%c7 UncMhz + - - 524 12.48 4198 3096 74.53 3800 + 0 0 4 0.09 4081 3096 98.88 3800 + 0 4 1 0.02 4063 3096 + 1 1 2 0.06 4063 3096 99.60 + 1 5 2 0.05 4070 3096 + 2 2 4178 99.52 4199 3096 0.00 + 2 6 3 0.08 4159 3096 + 3 3 1 0.04 4046 3096 99.66 + 3 7 0 0.01 3989 3096 + Core CPU Avg_MHz Busy% Bzy_MHz TSC_MHz CPU%c7 UncMhz + - - 525 12.52 4198 3096 74.54 3800 + 0 0 4 0.10 4051 3096 99.49 3800 + 0 4 2 0.04 3993 3096 + 1 1 3 0.07 4054 3096 99.56 + 1 5 4 0.10 4018 3096 + 2 2 
4178 99.51 4199 3096 0.00 + 2 6 4 0.09 4143 3096 + 3 3 2 0.06 4026 3096 99.10 + 3 7 7 0.17 4074 3096 .fi -This example also shows the use of the --hide option to skip columns that are not wanted. -Note that cpu4 in this example is 99.99% busy, while the other CPUs are all under 1% busy. -Notice that cpu4's HT sibling is cpu0, which is under 1% busy, but can get into CPU%c1 only, -because its cpu4's activity on shared hardware keeps it from entering a deeper C-state. +This example also shows the use of the --show option to show only the desired columns. .SH SYSTEM CONFIGURATION INFORMATION EXAMPLE @@ -191,61 +214,86 @@ By default, turbostat always dumps system configuration information before taking measurements. In the example above, "--quiet" is used to suppress that output. Here is an example of the configuration information: .nf -turbostat version 2017.02.15 - Len Brown <lenb@kernel.org> -CPUID(0): GenuineIntel 13 CPUID levels; family:model:stepping 0x6:3c:3 (6:60:3) -CPUID(1): SSE3 MONITOR - EIST TM2 TSC MSR ACPI-TM TM -CPUID(6): APERF, TURBO, DTS, PTM, No-HWP, No-HWPnotify, No-HWPwindow, No-HWPepp, No-HWPpkg, EPB -cpu4: MSR_IA32_MISC_ENABLE: 0x00850089 (TCC EIST No-MWAIT PREFETCH TURBO) -CPUID(7): No-SGX -cpu4: MSR_MISC_PWR_MGMT: 0x00400000 (ENable-EIST_Coordination DISable-EPB DISable-OOB) -RAPL: 3121 sec. Joule Counter Range, at 84 Watts -cpu4: MSR_PLATFORM_INFO: 0x80838f3012300 +turbostat version 2022.04.16 - Len Brown <lenb@kernel.org> +Kernel command line: BOOT_IMAGE=/boot/vmlinuz-5.18.0-rc6-00001-ge6891250e3b5 ... 
+CPUID(0): GenuineIntel 0x16 CPUID levels +CPUID(1): family:model:stepping 0x6:9e:9 (6:158:9) microcode 0xea +CPUID(0x80000000): max_extended_levels: 0x80000008 +CPUID(1): SSE3 MONITOR - EIST TM2 TSC MSR ACPI-TM HT TM +CPUID(6): APERF, TURBO, DTS, PTM, HWP, HWPnotify, HWPwindow, HWPepp, No-HWPpkg, EPB +cpu7: MSR_IA32_MISC_ENABLE: 0x00850089 (TCC EIST MWAIT PREFETCH TURBO) +CPUID(7): SGX +cpu7: MSR_IA32_FEATURE_CONTROL: 0x00000005 (Locked ) +CPUID(0x15): eax_crystal: 2 ebx_tsc: 258 ecx_crystal_hz: 0 +TSC: 3096 MHz (24000000 Hz * 258 / 2 / 1000000) +CPUID(0x16): base_mhz: 3100 max_mhz: 4200 bus_mhz: 100 +cpu7: MSR_MISC_PWR_MGMT: 0x00401cc0 (ENable-EIST_Coordination DISable-EPB DISable-OOB) +RAPL: 5825 sec. Joule Counter Range, at 45 Watts +cpu7: MSR_PLATFORM_INFO: 0x80839f1011f00 8 * 100.0 = 800.0 MHz max efficiency frequency -35 * 100.0 = 3500.0 MHz base frequency -cpu4: MSR_IA32_POWER_CTL: 0x0004005d (C1E auto-promotion: DISabled) -cpu4: MSR_TURBO_RATIO_LIMIT: 0x25262727 -37 * 100.0 = 3700.0 MHz max turbo 4 active cores -38 * 100.0 = 3800.0 MHz max turbo 3 active cores -39 * 100.0 = 3900.0 MHz max turbo 2 active cores -39 * 100.0 = 3900.0 MHz max turbo 1 active cores -cpu4: MSR_CONFIG_TDP_NOMINAL: 0x00000023 (base_ratio=35) -cpu4: MSR_CONFIG_TDP_LEVEL_1: 0x00000000 () -cpu4: MSR_CONFIG_TDP_LEVEL_2: 0x00000000 () -cpu4: MSR_CONFIG_TDP_CONTROL: 0x80000000 ( lock=1) -cpu4: MSR_TURBO_ACTIVATION_RATIO: 0x00000000 (MAX_NON_TURBO_RATIO=0 lock=0) -cpu4: MSR_PKG_CST_CONFIG_CONTROL: 0x1e000400 (UNdemote-C3, UNdemote-C1, demote-C3, demote-C1, UNlocked: pkg-cstate-limit=0: pc0) -cpu4: POLL: CPUIDLE CORE POLL IDLE -cpu4: C1: MWAIT 0x00 -cpu4: C1E: MWAIT 0x01 -cpu4: C3: MWAIT 0x10 -cpu4: C6: MWAIT 0x20 -cpu4: C7s: MWAIT 0x32 -cpu4: MSR_MISC_FEATURE_CONTROL: 0x00000000 (L2-Prefetch L2-Prefetch-pair L1-Prefetch L1-IP-Prefetch) -cpu0: MSR_IA32_ENERGY_PERF_BIAS: 0x00000006 (balanced) -cpu0: MSR_CORE_PERF_LIMIT_REASONS, 0x31200000 (Active: ) (Logged: Transitions, MultiCoreTurbo, 
Amps, Auto-HWP, ) -cpu0: MSR_GFX_PERF_LIMIT_REASONS, 0x00000000 (Active: ) (Logged: ) -cpu0: MSR_RING_PERF_LIMIT_REASONS, 0x0d000000 (Active: ) (Logged: Amps, PkgPwrL1, PkgPwrL2, ) +31 * 100.0 = 3100.0 MHz base frequency +cpu7: MSR_IA32_POWER_CTL: 0x002c005d (C1E auto-promotion: DISabled) +cpu7: MSR_TURBO_RATIO_LIMIT: 0x2728292a +39 * 100.0 = 3900.0 MHz max turbo 4 active cores +40 * 100.0 = 4000.0 MHz max turbo 3 active cores +41 * 100.0 = 4100.0 MHz max turbo 2 active cores +42 * 100.0 = 4200.0 MHz max turbo 1 active cores +cpu7: MSR_CONFIG_TDP_NOMINAL: 0x0000001f (base_ratio=31) +cpu7: MSR_CONFIG_TDP_LEVEL_1: 0x00000000 () +cpu7: MSR_CONFIG_TDP_LEVEL_2: 0x00000000 () +cpu7: MSR_CONFIG_TDP_CONTROL: 0x80000000 ( lock=1) +cpu7: MSR_TURBO_ACTIVATION_RATIO: 0x00000000 (MAX_NON_TURBO_RATIO=0 lock=0) +cpu7: MSR_PKG_CST_CONFIG_CONTROL: 0x1e008008 (UNdemote-C3, UNdemote-C1, demote-C3, demote-C1, locked, pkg-cstate-limit=8 (unlimited)) +Uncore Frequency pkg0 die0: 800 - 3900 MHz (800 - 3900 MHz) +/dev/cpu_dma_latency: 2000000000 usec (default) +current_driver: intel_idle +current_governor: menu +current_governor_ro: menu +cpu7: POLL: CPUIDLE CORE POLL IDLE +cpu7: C1: MWAIT 0x00 +cpu7: C1E: MWAIT 0x01 +cpu7: C3: MWAIT 0x10 +cpu7: C6: MWAIT 0x20 +cpu7: C7s: MWAIT 0x33 +cpu7: C8: MWAIT 0x40 +cpu7: C9: MWAIT 0x50 +cpu7: C10: MWAIT 0x60 +cpu7: cpufreq driver: intel_pstate +cpu7: cpufreq governor: performance +cpufreq intel_pstate no_turbo: 0 +cpu7: MSR_MISC_FEATURE_CONTROL: 0x00000000 (L2-Prefetch L2-Prefetch-pair L1-Prefetch L1-IP-Prefetch) +cpu0: MSR_PM_ENABLE: 0x00000001 (HWP) +cpu0: MSR_HWP_CAPABILITIES: 0x01101f53 (high 83 guar 31 eff 16 low 1) +cpu0: MSR_HWP_REQUEST: 0x00005353 (min 83 max 83 des 0 epp 0x0 window 0x0 pkg 0x0) +cpu0: MSR_HWP_INTERRUPT: 0x00000001 (EN_Guaranteed_Perf_Change, Dis_Excursion_Min) +cpu0: MSR_HWP_STATUS: 0x00000004 (No-Guaranteed_Perf_Change, No-Excursion_Min) +cpu0: EPB: 6 (balanced) cpu0: MSR_RAPL_POWER_UNIT: 0x000a0e03 (0.125000 Watts, 
0.000061 Joules, 0.000977 sec.) -cpu0: MSR_PKG_POWER_INFO: 0x000002a0 (84 W TDP, RAPL 0 - 0 W, 0.000000 sec.) -cpu0: MSR_PKG_POWER_LIMIT: 0x428348001a82a0 (UNlocked) -cpu0: PKG Limit #1: ENabled (84.000000 Watts, 8.000000 sec, clamp DISabled) -cpu0: PKG Limit #2: ENabled (105.000000 Watts, 0.002441* sec, clamp DISabled) +cpu0: MSR_PKG_POWER_INFO: 0x00000168 (45 W TDP, RAPL 0 - 0 W, 0.000000 sec.) +cpu0: MSR_PKG_POWER_LIMIT: 0x42820800218208 (UNlocked) +cpu0: PKG Limit #1: ENabled (65.000 Watts, 64.000000 sec, clamp ENabled) +cpu0: PKG Limit #2: ENabled (65.000 Watts, 0.002441* sec, clamp DISabled) +cpu0: MSR_VR_CURRENT_CONFIG: 0x00000000 +cpu0: PKG Limit #4: 0.000000 Watts (UNlocked) +cpu0: MSR_DRAM_POWER_LIMIT: 0x5400de00000000 (UNlocked) +cpu0: DRAM Limit: DISabled (0.000 Watts, 0.000977 sec, clamp DISabled) cpu0: MSR_PP0_POLICY: 0 cpu0: MSR_PP0_POWER_LIMIT: 0x00000000 (UNlocked) -cpu0: Cores Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) +cpu0: Cores Limit: DISabled (0.000 Watts, 0.000977 sec, clamp DISabled) cpu0: MSR_PP1_POLICY: 0 cpu0: MSR_PP1_POWER_LIMIT: 0x00000000 (UNlocked) -cpu0: GFX Limit: DISabled (0.000000 Watts, 0.000977 sec, clamp DISabled) -cpu0: MSR_IA32_TEMPERATURE_TARGET: 0x00641400 (100 C) -cpu0: MSR_IA32_PACKAGE_THERM_STATUS: 0x884c0800 (24 C) -cpu0: MSR_IA32_THERM_STATUS: 0x884c0000 (24 C +/- 1) -cpu1: MSR_IA32_THERM_STATUS: 0x88510000 (19 C +/- 1) -cpu2: MSR_IA32_THERM_STATUS: 0x884e0000 (22 C +/- 1) -cpu3: MSR_IA32_THERM_STATUS: 0x88510000 (19 C +/- 1) -cpu4: MSR_PKGC3_IRTL: 0x00008842 (valid, 67584 ns) -cpu4: MSR_PKGC6_IRTL: 0x00008873 (valid, 117760 ns) -cpu4: MSR_PKGC7_IRTL: 0x00008891 (valid, 148480 ns) +cpu0: GFX Limit: DISabled (0.000 Watts, 0.000977 sec, clamp DISabled) +cpu0: MSR_IA32_TEMPERATURE_TARGET: 0x00640000 (100 C) (100 default - 0 offset) +cpu0: MSR_IA32_PACKAGE_THERM_STATUS: 0x88200800 (68 C) +cpu0: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x00000003 (100 C, 100 C) +cpu7: MSR_PKGC3_IRTL: 0x0000884e (valid, 79872 
ns) +cpu7: MSR_PKGC6_IRTL: 0x00008876 (valid, 120832 ns) +cpu7: MSR_PKGC7_IRTL: 0x00008894 (valid, 151552 ns) +cpu7: MSR_PKGC8_IRTL: 0x000088fa (valid, 256000 ns) +cpu7: MSR_PKGC9_IRTL: 0x0000894c (valid, 339968 ns) +cpu7: MSR_PKGC10_IRTL: 0x00008bf2 (valid, 1034240 ns) .fi +.PP The \fBmax efficiency\fP frequency, a.k.a. Low Frequency Mode, is the frequency available at the minimum package voltage. The \fBTSC frequency\fP is the base frequency of the processor -- this should match the brand string diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ede31a4287a0..831dc32d45fa 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -126,6 +126,7 @@ struct msr_counter bic[] = { { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, + { 0x0, "UncMHz", "", 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -183,10 +184,11 @@ struct msr_counter bic[] = { #define BIC_GFXACTMHz (1ULL << 51) #define BIC_IPC (1ULL << 52) #define BIC_CORE_THROT_CNT (1ULL << 53) +#define BIC_UNCORE_MHZ (1ULL << 54) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) -#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz ) +#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_UNCORE_MHZ) #define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | 
BIC_CoreTmp | BIC_IPC) @@ -228,6 +230,7 @@ unsigned int do_slm_cstates; unsigned int use_c1_residency_msr; unsigned int has_aperf; unsigned int has_epb; +unsigned int is_hybrid; unsigned int do_irtl_snb; unsigned int do_irtl_hsw; unsigned int units = 1000000; /* MHz etc */ @@ -393,6 +396,7 @@ struct pkg_data { unsigned long long rapl_pkg_perf_status; /* MSR_PKG_PERF_STATUS */ unsigned long long rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; + unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_COUNTERS]; } *package_even, *package_odd; @@ -988,6 +992,9 @@ void print_header(char *delim) if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : "")); } + if (DO_BIC(BIC_UNCORE_MHZ)) + outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : "")); + for (mp = sys.pp; mp; mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 64) @@ -1370,6 +1377,9 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, fmt8, (printed++ ? delim : ""), 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + /* UncMHz */ + if (DO_BIC(BIC_UNCORE_MHZ)) + outp += sprintf(outp, "%s%d", (printed++ ? 
delim : ""), p->uncore_mhz); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) { @@ -1471,6 +1481,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) else old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; + old->uncore_mhz = new->uncore_mhz; old->gfx_mhz = new->gfx_mhz; old->gfx_act_mhz = new->gfx_act_mhz; @@ -1689,6 +1700,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->pkg_temp_c = 0; p->gfx_rc6_ms = 0; + p->uncore_mhz = 0; p->gfx_mhz = 0; p->gfx_act_mhz = 0; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) @@ -1788,6 +1800,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.energy_gfx += p->energy_gfx; average.packages.gfx_rc6_ms = p->gfx_rc6_ms; + average.packages.uncore_mhz = p->uncore_mhz; average.packages.gfx_mhz = p->gfx_mhz; average.packages.gfx_act_mhz = p->gfx_act_mhz; @@ -1948,6 +1961,16 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) return 0; } +unsigned long long get_uncore_mhz(int package, int die) +{ + char path[128]; + + sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_0%d_die_0%d/current_freq_khz", package, + die); + + return (snapshot_sysfs_counter(path) / 1000); +} + int get_epb(int cpu) { char path[128 + PATH_BYTES]; @@ -2035,9 +2058,9 @@ int get_core_throt_cnt(int cpu, unsigned long long *cnt) if (!fp) return -1; ret = fscanf(fp, "%lld", &tmp); + fclose(fp); if (ret != 1) return -1; - fclose(fp); *cnt = tmp; return 0; @@ -2297,6 +2320,10 @@ retry: if (DO_BIC(BIC_GFX_rc6)) p->gfx_rc6_ms = gfx_cur_rc6_ms; + /* n.b. 
assume die0 uncore frequency applies to whole package */ + if (DO_BIC(BIC_UNCORE_MHZ)) + p->uncore_mhz = get_uncore_mhz(p->package_id, 0); + if (DO_BIC(BIC_GFXMHz)) p->gfx_mhz = gfx_cur_mhz; @@ -2494,6 +2521,7 @@ int has_turbo_ratio_group_limits(int family, int model) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_ATOM_GOLDMONT_D: case INTEL_FAM6_ATOM_TREMONT_D: return 1; @@ -2502,13 +2530,14 @@ int has_turbo_ratio_group_limits(int family, int model) } } -static void dump_turbo_ratio_limits(int family, int model) +static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model) { unsigned long long msr, core_counts; - unsigned int ratio, group_size; + int shift; - get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr); - fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); + get_msr(base_cpu, trl_msr_offset, &msr); + fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n", + base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? 
"SECONDARY" : "", msr); if (has_turbo_ratio_group_limits(family, model)) { get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts); @@ -2517,53 +2546,16 @@ static void dump_turbo_ratio_limits(int family, int model) core_counts = 0x0807060504030201; } - ratio = (msr >> 56) & 0xFF; - group_size = (core_counts >> 56) & 0xFF; - if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); + for (shift = 56; shift >= 0; shift -= 8) { + unsigned int ratio, group_size; - ratio = (msr >> 48) & 0xFF; - group_size = (core_counts >> 48) & 0xFF; - if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); - - ratio = (msr >> 40) & 0xFF; - group_size = (core_counts >> 40) & 0xFF; - if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); - - ratio = (msr >> 32) & 0xFF; - group_size = (core_counts >> 32) & 0xFF; - if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); - - ratio = (msr >> 24) & 0xFF; - group_size = (core_counts >> 24) & 0xFF; - if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); - - ratio = (msr >> 16) & 0xFF; - group_size = (core_counts >> 16) & 0xFF; - if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); - - ratio = (msr >> 8) & 0xFF; - group_size = (core_counts >> 8) & 0xFF; - if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); + ratio = (msr >> shift) & 0xFF; + group_size = (core_counts >> shift) & 0xFF; + if (ratio) + fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", + ratio, bclk, ratio * bclk, group_size); + } - ratio = (msr >> 0) & 0xFF; - group_size = (core_counts >> 0) & 0xFF; - if (ratio) 
- fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n", - ratio, bclk, ratio * bclk, group_size); return; } @@ -2976,7 +2968,7 @@ int get_thread_siblings(struct cpu_topology *thiscpu) } } } - } while (!strncmp(&character, ",", 1)); + } while (character == ','); fclose(filep); return CPU_COUNT_S(size, thiscpu->put_ids); @@ -3742,6 +3734,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) has_misc_feature_control = 1; break; case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ pkg_cstate_limits = skx_pkg_cstate_limits; has_misc_feature_control = 1; break; @@ -3871,6 +3864,22 @@ int is_icx(unsigned int family, unsigned int model) return 0; } +int is_spr(unsigned int family, unsigned int model) +{ + + if (!genuine_intel) + return 0; + + if (family != 6) + return 0; + + switch (model) { + case INTEL_FAM6_SAPPHIRERAPIDS_X: + return 1; + } + return 0; +} + int is_ehl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3988,6 +3997,7 @@ int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: return 1; default: return 0; @@ -4015,7 +4025,7 @@ int has_config_tdp(unsigned int family, unsigned int model) case INTEL_FAM6_CANNONLAKE_L: /* CNL */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_ICELAKE_X: /* ICX */ - + case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ return 1; default: @@ -4083,8 +4093,12 @@ static void dump_cstate_pstate_config_info(unsigned int family, unsigned int mod if (has_ivt_turbo_ratio_limit(family, model)) dump_ivt_turbo_ratio_limits(); - if (has_turbo_ratio_limit(family, model)) - dump_turbo_ratio_limits(family, model); + if (has_turbo_ratio_limit(family, model)) { + dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT, family, model); + + if (is_hybrid) + 
dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT, family, model); + } if (has_atom_turbo_ratio_limit(family, model)) dump_atom_turbo_ratio_limits(); @@ -4098,6 +4112,24 @@ static void dump_cstate_pstate_config_info(unsigned int family, unsigned int mod dump_nhm_cst_cfg(); } +static int read_sysfs_int(char *path) /* returns the int stored in a sysfs file, or -1 if the file cannot be opened; exits if it cannot be parsed */ +{ + FILE *input; + int retval = -1; + + input = fopen(path, "r"); + if (input == NULL) { + if (debug) + fprintf(outf, "NSFOD %s\n", path); + return (-1); + } + if (fscanf(input, "%d", &retval) != 1) + err(1, "%s: failed to read int from file", path); + fclose(input); + + return (retval); +} + static void dump_sysfs_file(char *path) { FILE *input; @@ -4116,6 +4148,48 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } +static void intel_uncore_frequency_probe(void) /* enables the UncMHz column when sysfs exposes it and, unless quiet, prints each die's uncore frequency limits */ +{ + int i, j; + char path[128]; /* directory ids are two-digit, zero-padded (cf. package_00_die_00 below), hence %02d; "0%d" yields a wrong name for ids >= 10 */ + + if (!genuine_intel) + return; + + if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) + return; + + if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) + BIC_PRESENT(BIC_UNCORE_MHZ); + + if (quiet) + return; + + for (i = 0; i < topo.num_packages; ++i) { + for (j = 0; j < topo.num_die; ++j) { + int k, l; + + snprintf(path, sizeof(path), "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/min_freq_khz", + i, j); + k = read_sysfs_int(path); + snprintf(path, sizeof(path), "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/max_freq_khz", + i, j); + l = read_sysfs_int(path); + fprintf(outf, "Uncore Frequency pkg%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000); + + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/initial_min_freq_khz", + i, j); + k = read_sysfs_int(path); + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/initial_max_freq_khz", + i, j); + l = read_sysfs_int(path); + fprintf(outf, "(%d - %d MHz)\n", k / 1000, l / 1000); + } + } +} + 
static void dump_sysfs_cstate_config(void) { char path[64]; @@ -4486,6 +4560,7 @@ static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ case INTEL_FAM6_ICELAKE_X: /* ICX */ + case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ return (rapl_dram_energy_units = 15.3 / 1000000); default: return (rapl_energy_units); @@ -4575,6 +4650,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_BROADWELL_X: /* BDX */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_ICELAKE_X: /* ICX */ + case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | @@ -4740,13 +4816,19 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model) void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { - if (is_skx(family, model) || is_bdx(family, model) || is_icx(family, model)) + if (family != 6) + return; + + switch (model) { + case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SKYLAKE_X: has_automatic_cstate_conversion = 1; + } } void prewake_cstate_probe(unsigned int family, unsigned int model) { - if (is_icx(family, model)) + if (is_icx(family, model) || is_spr(family, model)) dis_cstate_prewake = 1; } @@ -4975,6 +5057,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_CANNONLAKE_L: /* CNL */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_ICELAKE_X: /* ICX */ + case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ @@ -5361,13 +5444,15 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_LAKEFIELD: case INTEL_FAM6_ALDERLAKE: case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: + case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: 
return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_L: return INTEL_FAM6_ATOM_TREMONT; case INTEL_FAM6_ICELAKE_D: - case INTEL_FAM6_SAPPHIRERAPIDS_X: return INTEL_FAM6_ICELAKE_X; } return model; @@ -5398,7 +5483,7 @@ void print_dev_latency(void) } /* - * Linux-perf manages the the HW instructions-retired counter + * Linux-perf manages the HW instructions-retired counter * by enabling when requested, and hiding rollover */ void linux_perf_init(void) @@ -5543,7 +5628,10 @@ void process_cpuid() __cpuid_count(0x7, 0, eax, ebx, ecx, edx); has_sgx = ebx & (1 << 2); - fprintf(outf, "CPUID(7): %sSGX\n", has_sgx ? "" : "No-"); + + is_hybrid = edx & (1 << 15); + + fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-"); if (has_sgx) decode_feature_control_msr(); @@ -5654,7 +5742,7 @@ void process_cpuid() BIC_NOT_PRESENT(BIC_Pkgpc7); use_c1_residency_msr = 1; } - if (is_skx(family, model) || is_icx(family, model)) { + if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) { BIC_NOT_PRESENT(BIC_CPU_c3); BIC_NOT_PRESENT(BIC_Pkgpc3); BIC_NOT_PRESENT(BIC_CPU_c7); @@ -5699,6 +5787,7 @@ void process_cpuid() if (!quiet) dump_cstate_pstate_config_info(family, model); + intel_uncore_frequency_probe(); if (!quiet) print_dev_latency(); @@ -6128,7 +6217,30 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2022.04.16 - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2022.07.28 - Len Brown <lenb@kernel.org>\n"); +} + +#define COMMAND_LINE_SIZE 2048 + +void print_bootcmd(void) +{ + char bootcmd[COMMAND_LINE_SIZE]; + FILE *fp; + int ret; + + memset(bootcmd, 0, COMMAND_LINE_SIZE); + fp = fopen("/proc/cmdline", "r"); + if (!fp) + return; + + ret = fread(bootcmd, sizeof(char), COMMAND_LINE_SIZE - 1, fp); + if (ret) { + bootcmd[ret] = '\0'; + /* the last character is already '\n' */ + fprintf(outf, "Kernel command line: %s", bootcmd); + } + + fclose(fp); } int 
add_counter(unsigned int msr_num, char *path, char *name, @@ -6602,8 +6714,10 @@ int main(int argc, char **argv) outf = stderr; cmdline(argc, argv); - if (!quiet) + if (!quiet) { print_version(); + print_bootcmd(); + } probe_sysfs(); diff --git a/tools/spi/spidev_test.c b/tools/spi/spidev_test.c index 83844f8b862a..b0ca44c70e83 100644 --- a/tools/spi/spidev_test.c +++ b/tools/spi/spidev_test.c @@ -417,6 +417,7 @@ int main(int argc, char *argv[]) { int ret = 0; int fd; + uint32_t request; parse_opts(argc, argv); @@ -430,13 +431,23 @@ int main(int argc, char *argv[]) /* * spi mode */ + /* WR is make a request to assign 'mode' */ + request = mode; ret = ioctl(fd, SPI_IOC_WR_MODE32, &mode); if (ret == -1) pabort("can't set spi mode"); + /* RD is read what mode the device actually is in */ ret = ioctl(fd, SPI_IOC_RD_MODE32, &mode); if (ret == -1) pabort("can't get spi mode"); + /* Drivers can reject some mode bits without returning an error. + * Read the current value to identify what mode it is in, and if it + * differs from the requested mode, warn the user. 
+ */ + if (request != mode) + printf("WARNING device does not support requested mode 0x%x\n", + request); /* * bits per word diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index d811cff73597..0a26c243e6e9 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -140,12 +140,12 @@ int use_after_invalid(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(read_data), 0, &ptr); - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); return 0; } @@ -338,7 +338,7 @@ int invalid_helper2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0); return 0; } @@ -377,7 +377,7 @@ int invalid_write2(void *ctx) memcpy((void *)&ptr + 8, &x, sizeof(x)); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); bpf_ringbuf_submit_dynptr(&ptr, 0); @@ -473,7 +473,7 @@ int invalid_read2(void *ctx) get_map_val_dynptr(&ptr); /* this should fail */ - bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0); + bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 1, 0, 0); return 0; } diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index d67be48df4b2..a3a6103c8569 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -43,10 +43,10 @@ int test_read_write(void *ctx) bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr); /* Write data into 
the dynptr */ - err = err ?: bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data)); + err = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); /* Read the data that was written into the dynptr */ - err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0); + err = err ?: bpf_dynptr_read(read_data, sizeof(read_data), &ptr, 0, 0); /* Ensure the data we read matches the data we wrote */ for (i = 0; i < sizeof(read_data); i++) { diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 71b306602368..616ed4019655 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -3,6 +3,6 @@ TEST_PROGS := gpio-mockup.sh gpio-sim.sh TEST_FILES := gpio-mockup-sysfs.sh TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev gpio-chip-info gpio-line-name -CFLAGS += -O2 -g -Wall -I../../../../usr/include/ +CFLAGS += -O2 -g -Wall -I../../../../usr/include/ $(KHDR_INCLUDES) include ../lib.mk diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c index 4158da0da2bb..2237d1aac801 100644 --- a/tools/testing/selftests/kvm/rseq_test.c +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -82,8 +82,9 @@ static int next_cpu(int cpu) return cpu; } -static void *migration_worker(void *ign) +static void *migration_worker(void *__rseq_tid) { + pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid; cpu_set_t allowed_mask; int r, i, cpu; @@ -106,7 +107,7 @@ static void *migration_worker(void *ign) * stable, i.e. while changing affinity is in-progress. 
*/ smp_wmb(); - r = sched_setaffinity(0, sizeof(allowed_mask), &allowed_mask); + r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask); TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)", errno, strerror(errno)); smp_wmb(); @@ -231,7 +232,8 @@ int main(int argc, char *argv[]) vm = vm_create_default(VCPU_ID, 0, guest_code); ucall_init(vm, NULL); - pthread_create(&migration_thread, NULL, migration_worker, 0); + pthread_create(&migration_thread, NULL, migration_worker, + (void *)(unsigned long)gettid()); for (i = 0; !done; i++) { vcpu_run(vm, VCPU_ID); diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index a29f79618934..ffc35a22e914 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -36,4 +36,5 @@ test_unix_oob gro ioam6_parser toeplitz +tun cmsg_sender diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ddad703ace34..9a4b30bd3a9e 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -11,7 +11,7 @@ TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh -TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh +TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh fib_nexthop_nongw.sh TEST_PROGS += altnames.sh icmp.sh icmp_redirect.sh ip6_gre_headroom.sh TEST_PROGS += route_localnet.sh TEST_PROGS += reuseaddr_ports_exhausted.sh @@ -59,6 +59,7 @@ TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh +TEST_GEN_FILES += io_uring_zerocopy_tx TEST_FILES := settings diff --git 
a/tools/testing/selftests/net/fib_nexthop_nongw.sh b/tools/testing/selftests/net/fib_nexthop_nongw.sh new file mode 100755 index 000000000000..b7b928b38ce4 --- /dev/null +++ b/tools/testing/selftests/net/fib_nexthop_nongw.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# ns: h1 | ns: h2 +# 192.168.0.1/24 | +# eth0 | +# | 192.168.1.1/32 +# veth0 <---|---> veth1 +# Validate source address selection for route without gateway + +PAUSE_ON_FAIL=no +VERBOSE=0 +ret=0 + +################################################################################ +# helpers + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + [ "$VERBOSE" = "1" ] && echo +} + +run_cmd() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + out=$(eval $cmd 2>&1) + rc=$? 
+ if [ "$VERBOSE" = "1" -a -n "$out" ]; then + echo "$out" + fi + + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +################################################################################ +# config +setup() +{ + ip netns add h1 + ip -n h1 link set lo up + ip netns add h2 + ip -n h2 link set lo up + + # Add a fake eth0 to support an ip address + ip -n h1 link add name eth0 type dummy + ip -n h1 link set eth0 up + ip -n h1 address add 192.168.0.1/24 dev eth0 + + # Configure veths (same @mac, arp off) + ip -n h1 link add name veth0 type veth peer name veth1 netns h2 + ip -n h1 link set veth0 up + + ip -n h2 link set veth1 up + + # Configure @IP in the peer netns + ip -n h2 address add 192.168.1.1/32 dev veth1 + ip -n h2 route add default dev veth1 + + # Add a nexthop without @gw and use it in a route + ip -n h1 nexthop add id 1 dev veth0 + ip -n h1 route add 192.168.1.1 nhid 1 +} + +cleanup() +{ + ip netns del h1 2>/dev/null + ip netns del h2 2>/dev/null +} + +trap cleanup EXIT + +################################################################################ +# main + +while getopts :pv o +do + case $o in + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=1;; + esac +done + +cleanup +setup + +run_cmd ip -netns h1 route get 192.168.1.1 +log_test $? 0 "nexthop: get route with nexthop without gw" +run_cmd ip netns exec h1 ping -c1 192.168.1.1 +log_test $? 
0 "nexthop: ping through nexthop without gw" + +exit $ret diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 8f481218a492..57b84e0c879e 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -37,6 +37,7 @@ TEST_PROGS = bridge_igmp.sh \ ipip_hier_gre_key.sh \ ipip_hier_gre_keys.sh \ ipip_hier_gre.sh \ + local_termination.sh \ loopback.sh \ mirror_gre_bound.sh \ mirror_gre_bridge_1d.sh \ @@ -52,6 +53,7 @@ TEST_PROGS = bridge_igmp.sh \ mirror_gre_vlan_bridge_1q.sh \ mirror_gre_vlan.sh \ mirror_vlan.sh \ + no_forwarding.sh \ pedit_dsfield.sh \ pedit_ip.sh \ pedit_l4port.sh \ diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.c b/tools/testing/selftests/net/io_uring_zerocopy_tx.c new file mode 100644 index 000000000000..9d64c560a2d6 --- /dev/null +++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.c @@ -0,0 +1,605 @@ +/* SPDX-License-Identifier: MIT */ +/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */ +#include <assert.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <limits.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <arpa/inet.h> +#include <linux/errqueue.h> +#include <linux/if_packet.h> +#include <linux/io_uring.h> +#include <linux/ipv6.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/un.h> +#include <sys/wait.h> + +#define NOTIF_TAG 0xfffffffULL +#define NONZC_TAG 0 +#define ZC_TAG 1 + +enum { + MODE_NONZC = 0, + MODE_ZC = 
1, + MODE_ZC_FIXED = 2, + MODE_MIXED = 3, +}; + +static bool cfg_flush = false; +static bool cfg_cork = false; +static int cfg_mode = MODE_ZC_FIXED; +static int cfg_nr_reqs = 8; +static int cfg_family = PF_UNSPEC; +static int cfg_payload_len; +static int cfg_port = 8000; +static int cfg_runtime_ms = 4200; + +static socklen_t cfg_alen; +static struct sockaddr_storage cfg_dst_addr; + +static char payload[IP_MAXPACKET] __attribute__((aligned(4096))); + +struct io_sq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + unsigned *flags; + unsigned *array; +}; + +struct io_cq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + struct io_uring_cqe *cqes; +}; + +struct io_uring_sq { + unsigned *khead; + unsigned *ktail; + unsigned *kring_mask; + unsigned *kring_entries; + unsigned *kflags; + unsigned *kdropped; + unsigned *array; + struct io_uring_sqe *sqes; + + unsigned sqe_head; + unsigned sqe_tail; + + size_t ring_sz; +}; + +struct io_uring_cq { + unsigned *khead; + unsigned *ktail; + unsigned *kring_mask; + unsigned *kring_entries; + unsigned *koverflow; + struct io_uring_cqe *cqes; + + size_t ring_sz; +}; + +struct io_uring { + struct io_uring_sq sq; + struct io_uring_cq cq; + int ring_fd; +}; + +#ifdef __alpha__ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 535 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 536 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 537 +# endif +#else /* !__alpha__ */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 425 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 426 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 427 +# endif +#endif + +#if defined(__x86_64) || defined(__i386__) +#define read_barrier() __asm__ __volatile__("":::"memory") +#define write_barrier() __asm__ __volatile__("":::"memory") +#else + +#define read_barrier() 
__sync_synchronize() +#define write_barrier() __sync_synchronize() +#endif + +static int io_uring_setup(unsigned int entries, struct io_uring_params *p) +{ + return syscall(__NR_io_uring_setup, entries, p); +} + +static int io_uring_enter(int fd, unsigned int to_submit, + unsigned int min_complete, + unsigned int flags, sigset_t *sig) +{ + return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, + flags, sig, _NSIG / 8); +} + +static int io_uring_register_buffers(struct io_uring *ring, + const struct iovec *iovecs, + unsigned nr_iovecs) +{ + int ret; + + ret = syscall(__NR_io_uring_register, ring->ring_fd, + IORING_REGISTER_BUFFERS, iovecs, nr_iovecs); + return (ret < 0) ? -errno : ret; +} + +static int io_uring_register_notifications(struct io_uring *ring, + unsigned nr, + struct io_uring_notification_slot *slots) +{ + int ret; + struct io_uring_notification_register r = { + .nr_slots = nr, + .data = (unsigned long)slots, + }; + + ret = syscall(__NR_io_uring_register, ring->ring_fd, + IORING_REGISTER_NOTIFIERS, &r, sizeof(r)); + return (ret < 0) ? 
-errno : ret; +} + +static int io_uring_mmap(int fd, struct io_uring_params *p, + struct io_uring_sq *sq, struct io_uring_cq *cq) +{ + size_t size; + void *ptr; + int ret; + + sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); + ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (ptr == MAP_FAILED) + return -errno; + sq->khead = ptr + p->sq_off.head; + sq->ktail = ptr + p->sq_off.tail; + sq->kring_mask = ptr + p->sq_off.ring_mask; + sq->kring_entries = ptr + p->sq_off.ring_entries; + sq->kflags = ptr + p->sq_off.flags; + sq->kdropped = ptr + p->sq_off.dropped; + sq->array = ptr + p->sq_off.array; + + size = p->sq_entries * sizeof(struct io_uring_sqe); + sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (sq->sqes == MAP_FAILED) { + ret = -errno; +err: + munmap(sq->khead, sq->ring_sz); + return ret; + } + + cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); + ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); + if (ptr == MAP_FAILED) { + ret = -errno; + munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); + goto err; + } + cq->khead = ptr + p->cq_off.head; + cq->ktail = ptr + p->cq_off.tail; + cq->kring_mask = ptr + p->cq_off.ring_mask; + cq->kring_entries = ptr + p->cq_off.ring_entries; + cq->koverflow = ptr + p->cq_off.overflow; + cq->cqes = ptr + p->cq_off.cqes; + return 0; +} + +static int io_uring_queue_init(unsigned entries, struct io_uring *ring, + unsigned flags) +{ + struct io_uring_params p; + int fd, ret; + + memset(ring, 0, sizeof(*ring)); + memset(&p, 0, sizeof(p)); + p.flags = flags; + + fd = io_uring_setup(entries, &p); + if (fd < 0) + return fd; + ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq); + if (!ret) + ring->ring_fd = fd; + else + close(fd); + return ret; +} + +static int io_uring_submit(struct io_uring *ring) +{ + struct 
io_uring_sq *sq = &ring->sq; + const unsigned mask = *sq->kring_mask; + unsigned ktail, submitted, to_submit; + int ret; + + read_barrier(); + if (*sq->khead != *sq->ktail) { + submitted = *sq->kring_entries; + goto submit; + } + if (sq->sqe_head == sq->sqe_tail) + return 0; + + ktail = *sq->ktail; + to_submit = sq->sqe_tail - sq->sqe_head; + for (submitted = 0; submitted < to_submit; submitted++) { + read_barrier(); + sq->array[ktail++ & mask] = sq->sqe_head++ & mask; + } + if (!submitted) + return 0; + + if (*sq->ktail != ktail) { + write_barrier(); + *sq->ktail = ktail; + write_barrier(); + } +submit: + ret = io_uring_enter(ring->ring_fd, submitted, 0, + IORING_ENTER_GETEVENTS, NULL); + return ret < 0 ? -errno : ret; +} + +static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd, + const void *buf, size_t len, int flags) +{ + memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = (__u8) IORING_OP_SEND; + sqe->fd = sockfd; + sqe->addr = (unsigned long) buf; + sqe->len = len; + sqe->msg_flags = (__u32) flags; +} + +static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd, + const void *buf, size_t len, int flags, + unsigned slot_idx, unsigned zc_flags) +{ + io_uring_prep_send(sqe, sockfd, buf, len, flags); + sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF; + sqe->notification_idx = slot_idx; + sqe->ioprio = zc_flags; +} + +static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) +{ + struct io_uring_sq *sq = &ring->sq; + + if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries) + return NULL; + return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask]; +} + +static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) +{ + struct io_uring_cq *cq = &ring->cq; + const unsigned mask = *cq->kring_mask; + unsigned head = *cq->khead; + int ret; + + *cqe_ptr = NULL; + do { + read_barrier(); + if (head != *cq->ktail) { + *cqe_ptr = &cq->cqes[head & mask]; + break; + } + ret = io_uring_enter(ring->ring_fd, 0, 1, + 
IORING_ENTER_GETEVENTS, NULL); + if (ret < 0) + return -errno; + } while (1); + + return 0; +} + +static inline void io_uring_cqe_seen(struct io_uring *ring) +{ + *(&ring->cq)->khead += 1; + write_barrier(); +} + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static void do_setsockopt(int fd, int level, int optname, int val) +{ + if (setsockopt(fd, level, optname, &val, sizeof(val))) + error(1, errno, "setsockopt %d.%d: %d", level, optname, val); +} + +static int do_setup_tx(int domain, int type, int protocol) +{ + int fd; + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket t"); + + do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); + + if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) + error(1, errno, "connect"); + return fd; +} + +static void do_tx(int domain, int type, int protocol) +{ + struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}}; + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + unsigned long packets = 0, bytes = 0; + struct io_uring ring; + struct iovec iov; + uint64_t tstop; + int i, fd, ret; + int compl_cqes = 0; + + fd = do_setup_tx(domain, type, protocol); + + ret = io_uring_queue_init(512, &ring, 0); + if (ret) + error(1, ret, "io_uring: queue init"); + + ret = io_uring_register_notifications(&ring, 1, b); + if (ret) + error(1, ret, "io_uring: tx ctx registration"); + + iov.iov_base = payload; + iov.iov_len = cfg_payload_len; + + ret = io_uring_register_buffers(&ring, &iov, 1); + if (ret) + error(1, ret, "io_uring: buffer registration"); + + tstop = gettimeofday_ms() + cfg_runtime_ms; + do { + if (cfg_cork) + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1); + + for (i = 0; i < cfg_nr_reqs; i++) { + unsigned zc_flags = 0; + unsigned buf_idx = 0; + unsigned slot_idx = 0; + unsigned mode = cfg_mode; + unsigned msg_flags = 0; + + if (cfg_mode == MODE_MIXED) + mode = rand() % 3; + + sqe = 
io_uring_get_sqe(&ring); + + if (mode == MODE_NONZC) { + io_uring_prep_send(sqe, fd, payload, + cfg_payload_len, msg_flags); + sqe->user_data = NONZC_TAG; + } else { + if (cfg_flush) { + zc_flags |= IORING_RECVSEND_NOTIF_FLUSH; + compl_cqes++; + } + io_uring_prep_sendzc(sqe, fd, payload, + cfg_payload_len, + msg_flags, slot_idx, zc_flags); + if (mode == MODE_ZC_FIXED) { + sqe->ioprio |= IORING_RECVSEND_FIXED_BUF; + sqe->buf_index = buf_idx; + } + sqe->user_data = ZC_TAG; + } + } + + ret = io_uring_submit(&ring); + if (ret != cfg_nr_reqs) + error(1, ret < 0 ? -ret : 0, "submit"); /* ret is -errno on failure, a short count otherwise; error() wants a positive errno */ + + for (i = 0; i < cfg_nr_reqs; i++) { + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret) + error(1, -ret, "wait cqe"); + + if (cqe->user_data == NOTIF_TAG) { + compl_cqes--; + i--; + } else if (cqe->user_data != NONZC_TAG && + cqe->user_data != ZC_TAG) { + error(1, cqe->res, "invalid user_data"); + } else if (cqe->res <= 0 && cqe->res != -EAGAIN) { + error(1, -cqe->res, "send failed"); + } else { + if (cqe->res > 0) { + packets++; + bytes += cqe->res; + } + /* failed requests don't flush */ + if (cfg_flush && + cqe->res <= 0 && + cqe->user_data == ZC_TAG) + compl_cqes--; + } + io_uring_cqe_seen(&ring); + } + if (cfg_cork) + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); + } while (gettimeofday_ms() < tstop); + + if (close(fd)) + error(1, errno, "close"); + + fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n", + packets, bytes >> 20, + packets * 1000 / cfg_runtime_ms, /* ms-based rate: no divide-by-zero when the runtime is under 1 s */ + (bytes >> 20) * 1000 / cfg_runtime_ms); + + while (compl_cqes) { /* drain notification CQEs still outstanding from flushed zerocopy sends */ + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret) + error(1, -ret, "wait cqe"); + io_uring_cqe_seen(&ring); + compl_cqes--; + } +} + +static void do_test(int domain, int type, int protocol) +{ + int i; + + for (i = 0; i < IP_MAXPACKET; i++) + payload[i] = 'a' + (i % 26); + do_tx(domain, type, protocol); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s [-f] [-n<N>] [-z0] [-s<payload size>] " + "(-4|-6) [-t<time s>] -D<dst_ip> udp", filepath); +} 
+
+/*
+ * Parse the command line into the cfg_* settings and fill in cfg_dst_addr.
+ *
+ * Options: -4 / -6 address family (exactly one), -D destination address,
+ * -p port, -s payload length, -t runtime in seconds (200 ms are added),
+ * -n number of requests, -f flush, -c cork, -m mode.  Exactly one non-option
+ * argument must follow the options.  Bad input is reported through usage()
+ * or error().
+ */
+static void parse_opts(int argc, char **argv)
+{
+	/* payload[] size minus IPv6 header, TCP header and 40 bytes of options */
+	const int max_payload_len = sizeof(payload) -
+				    sizeof(struct ipv6hdr) -
+				    sizeof(struct tcphdr) -
+				    40 /* max tcp options */;
+	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
+	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
+	char *daddr = NULL;
+	int c;
+
+	if (argc <= 1)
+		usage(argv[0]);
+	cfg_payload_len = max_payload_len;
+
+	while ((c = getopt(argc, argv, "46D:p:s:t:n:fc:m:")) != -1) {
+		switch (c) {
+		case '4':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET;
+			cfg_alen = sizeof(struct sockaddr_in);
+			break;
+		case '6':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET6;
+			cfg_alen = sizeof(struct sockaddr_in6);
+			break;
+		case 'D':
+			daddr = optarg;
+			break;
+		case 'p':
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 's':
+			cfg_payload_len = strtoul(optarg, NULL, 0);
+			break;
+		case 't':
+			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
+			break;
+		case 'n':
+			cfg_nr_reqs = strtoul(optarg, NULL, 0);
+			break;
+		case 'f':
+			cfg_flush = 1;
+			break;
+		case 'c':
+			cfg_cork = strtol(optarg, NULL, 0);
+			break;
+		case 'm':
+			cfg_mode = strtol(optarg, NULL, 0);
+			break;
+		default:
+			/* getopt() returned '?': unknown option or missing argument */
+			usage(argv[0]);
+			break;
+		}
+	}
+
+	/* The destination address is only parsed when -D was given. */
+	switch (cfg_family) {
+	case PF_INET:
+		memset(addr4, 0, sizeof(*addr4));
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(cfg_port);
+		if (daddr &&
+		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
+			error(1, 0, "ipv4 parse error: %s", daddr);
+		break;
+	case PF_INET6:
+		memset(addr6, 0, sizeof(*addr6));
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(cfg_port);
+		if (daddr &&
+		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
+			error(1, 0, "ipv6 parse error: %s", daddr);
+		break;
+	default:
+		error(1, 0, "illegal domain");
+	}
+
+	if (cfg_payload_len > max_payload_len)
+		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
+	if (cfg_mode == MODE_NONZC && cfg_flush)
+		error(1, 0, "-f: only zerocopy modes support notifications");
+	if (optind != argc - 1)
+		usage(argv[0]);
+}
+
+/*
+ * The last argument selects the test: "tcp" runs do_test() with SOCK_STREAM,
+ * "udp" with SOCK_DGRAM; anything else is reported as an error.
+ */
+int main(int argc, char **argv)
+{
+	const char *cfg_test = argv[argc - 1];
+
+	parse_opts(argc, argv);
+
+	if (!strcmp(cfg_test, "tcp"))
+		do_test(cfg_family, SOCK_STREAM, 0);
+	else if (!strcmp(cfg_test, "udp"))
+		do_test(cfg_family, SOCK_DGRAM, 0);
+	else
+		error(1, 0, "unknown cfg_test %s", cfg_test);
+	return 0;
+}
diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.sh b/tools/testing/selftests/net/io_uring_zerocopy_tx.sh
new file mode 100755
index 000000000000..6a65e4437640
--- /dev/null
+++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+#
+# Send data between two processes across namespaces
+# Run twice: once without and once with zerocopy
+
+set -e
+
+readonly DEV="veth0"
+readonly DEV_MTU=65535
+readonly BIN_TX="./io_uring_zerocopy_tx"
+readonly BIN_RX="./msg_zerocopy"
+
+readonly RAND="$(mktemp -u XXXXXX)"
+readonly NSPREFIX="ns-${RAND}"
+readonly NS1="${NSPREFIX}1"
+readonly NS2="${NSPREFIX}2"
+
+readonly SADDR4='192.168.1.1'
+readonly DADDR4='192.168.1.2'
+readonly SADDR6='fd::1'
+readonly DADDR6='fd::2'
+
+readonly path_sysctl_mem="net.core.optmem_max"
+
+# No arguments: automated test
+if [[ "$#" -eq "0" ]]; then
+	IPs=( "4" "6" )
+	protocols=( "tcp" "udp" )
+
+	for IP in "${IPs[@]}"; do
+		for proto in "${protocols[@]}"; do
+			for mode in $(seq 1 3); do
+				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32
+				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -f
+				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -c -f	# NOTE(review): the sender's getopt string has "c:", so "-f" is taken as the argument of -c here - confirm intent
+			done
+		done
+	done
+
+	echo "OK. 
All tests passed"
+	exit 0
+fi
+
+# Argument parsing: the sender binary only accepts "tcp" or "udp"
+if [[ "$#" -lt "2" ]]; then
+	echo "Usage: $0 [4|6] [tcp|udp] <args>"
+	exit 1
+fi
+
+readonly IP="$1"
+shift
+readonly TXMODE="$1"
+shift
+readonly EXTRA_ARGS="$@"
+
+# Argument parsing: configure addresses
+if [[ "${IP}" == "4" ]]; then
+	readonly SADDR="${SADDR4}"
+	readonly DADDR="${DADDR4}"
+elif [[ "${IP}" == "6" ]]; then
+	readonly SADDR="${SADDR6}"
+	readonly DADDR="${DADDR6}"
+else
+	echo "Invalid IP version ${IP}"
+	exit 1
+fi
+
+# Argument parsing: select receive mode
+#
+# This differs from send mode for
+# - packet: use raw recv, because packet receives skb clones
+# - raw_hdrincl: use raw recv, because hdrincl is a tx-only option
+case "${TXMODE}" in
+'packet' | 'packet_dgram' | 'raw_hdrincl')
+	RXMODE='raw'
+	;;
+*)
+	RXMODE="${TXMODE}"
+	;;
+esac
+
+# Start of state changes: install cleanup handler
+save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})"
+
+cleanup() {	# runs on EXIT with "set -e" still active
+	ip netns del "${NS2}" || true	# keep going so the sysctl below is always restored
+	ip netns del "${NS1}" || true
+	sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}"
+}
+
+trap cleanup EXIT
+
+# Configure system settings
+sysctl -w -q "${path_sysctl_mem}=1000000"
+
+# Create virtual ethernet pair between network namespaces
+ip netns add "${NS1}"
+ip netns add "${NS2}"
+
+ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \
+  peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}"
+
+# Bring the devices up
+ip -netns "${NS1}" link set "${DEV}" up
+ip -netns "${NS2}" link set "${DEV}" up
+
+# Set fixed MAC addresses on the devices
+ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02
+ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06
+
+# Add fixed IP addresses to the devices
+ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}"
+ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}"
+ip -netns "${NS1}" addr add       fd::1/64 dev "${DEV}" nodad
+ip -netns "${NS2}" addr add       fd::2/64 dev "${DEV}" nodad
+
+# Optionally disable sg or csum offload to test edge cases
+# ip netns exec "${NS1}" ethtool -K "${DEV}" sg off
+
+do_test() {	# $1: extra arguments handed to the sender; receiver runs in NS2, sender in NS1
+	local -r ARGS="$1"
+
+	echo "ipv${IP} ${TXMODE} ${ARGS}"
+	ip netns exec "${NS2}" "${BIN_RX}" "-${IP}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" -r "${RXMODE}" &
+	sleep 0.2
+	ip netns exec "${NS1}" "${BIN_TX}" "-${IP}" -t 1 -D "${DADDR}" ${ARGS} "${TXMODE}"
+	wait
+}
+
+do_test "${EXTRA_ARGS}"
+echo ok
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index f905d5358e68..48a99e1453e1 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -6,7 +6,7 @@ KSFT_KHDR_INSTALL := 1 CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INCLUDES) TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ - simult_flows.sh mptcp_sockopt.sh + simult_flows.sh mptcp_sockopt.sh userspace_pm.sh TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h index b86642f90d7f..3a391c9bf468 100644 --- a/tools/testing/selftests/rseq/rseq-riscv.h +++ b/tools/testing/selftests/rseq/rseq-riscv.h @@ -86,7 +86,7 @@ do { \ #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \ RSEQ_INJECT_ASM(1) \ - "la "RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \ + "la " RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \ REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(rseq_cs) "]\n" \ __rseq_str(label) ":\n" @@ -103,17 +103,17 @@ do { \ #define RSEQ_ASM_OP_CMPEQ(var, expect, label) \ REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ - "bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ + "bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ __rseq_str(label) "\n" #define RSEQ_ASM_OP_CMPEQ32(var, expect, label) \ - "lw "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ - "bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ + "lw " 
RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ + "bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ __rseq_str(label) "\n" #define RSEQ_ASM_OP_CMPNE(var, expect, label) \ REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \ - "beq "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ + "beq " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \ __rseq_str(label) "\n" #define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \ @@ -127,12 +127,12 @@ do { \ REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" #define RSEQ_ASM_OP_R_LOAD_OFF(offset) \ - "add "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \ + "add " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \ RSEQ_ASM_TMP_REG_1 "\n" \ REG_L RSEQ_ASM_TMP_REG_1 ", (" RSEQ_ASM_TMP_REG_1 ")\n" #define RSEQ_ASM_OP_R_ADD(count) \ - "add "RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \ + "add " RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \ ", %[" __rseq_str(count) "]\n" #define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label) \ @@ -194,8 +194,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [expect] "r" (expect), [newv] "r" (newv) @@ -251,8 +251,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [expectnot] "r" (expectnot), [load] "m" (*load), @@ -301,8 +301,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto 
does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [count] "r" (count) RSEQ_INJECT_INPUT @@ -352,8 +352,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -411,8 +411,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -472,8 +472,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [v] "m" (*v), [expect] "r" (expect), [v2] "m" (*v2), @@ -532,8 +532,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -593,8 +593,8 @@ int 
rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect, RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [expect] "r" (expect), [v] "m" (*v), [newv] "r" (newv), @@ -651,8 +651,8 @@ int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (__rseq_abi.cpu_id), - [rseq_cs] "m" (__rseq_abi.rseq_cs), + [current_cpu_id] "m" (rseq_get_abi()->cpu_id), + [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [ptr] "r" (ptr), [off] "er" (off), [inc] "er" (inc) diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 986b9458efb2..4177f9507bbe 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -111,7 +111,8 @@ void rseq_init(void) libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); - if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p) { + if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p && + *libc_rseq_size_p != 0) { /* rseq registration owned by glibc */ rseq_offset = *libc_rseq_offset_p; rseq_size = *libc_rseq_size_p; diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 136df5b76319..4ae6c8991307 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -809,7 +809,7 @@ void kill_thread_or_group(struct __test_metadata *_metadata, .len = (unsigned short)ARRAY_SIZE(filter_thread), .filter = filter_thread, }; - int kill = kill_how == KILL_PROCESS ? 
SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAAA; + int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA; struct sock_filter filter_process[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index 3a5936cc10ab..f0d51d4d2c87 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -1,4 +1,4 @@ -TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex +TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex vfork_exec TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread
diff --git a/tools/testing/selftests/timens/vfork_exec.c b/tools/testing/selftests/timens/vfork_exec.c
new file mode 100644
index 000000000000..e6ccd900f30a
--- /dev/null
+++ b/tools/testing/selftests/timens/vfork_exec.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Time namespace selftest for vfork() followed by execve().
+ *
+ * Without arguments the program records CLOCK_MONOTONIC, calls the
+ * unshare_timens() and _settime() helpers with OFFSET, vfork()s and re-executes
+ * itself from the child.  With one argument (the expected CLOCK_MONOTONIC
+ * seconds) it only checks that the clock is within 5 seconds of that value.
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "log.h"
+#include "timens.h"
+
+/* Value handed to _settime() for CLOCK_MONOTONIC, in seconds. */
+#define OFFSET (36000)
+
+int main(int argc, char *argv[])
+{
+	struct timespec now, tst;
+	int status, i;
+	pid_t pid;
+
+	/* Re-executed child: argv[1] is the expected CLOCK_MONOTONIC seconds. */
+	if (argc > 1) {
+		if (sscanf(argv[1], "%ld", &now.tv_sec) != 1)
+			return pr_perror("sscanf");
+
+		for (i = 0; i < 2; i++) {
+			_gettime(CLOCK_MONOTONIC, &tst, i);
+			/* tv_sec is handled as long (%ld) here, so labs() rather than abs() */
+			if (labs(tst.tv_sec - now.tv_sec) > 5)
+				return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec);
+		}
+		return 0;
+	}
+
+	nscheck();
+
+	ksft_set_plan(1);
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	if (unshare_timens())
+		return 1;
+
+	if (_settime(CLOCK_MONOTONIC, OFFSET))
+		return 1;
+
+	/* This process is still expected to read the pre-offset clock. */
+	for (i = 0; i < 2; i++) {
+		_gettime(CLOCK_MONOTONIC, &tst, i);
+		if (labs(tst.tv_sec - now.tv_sec) > 5)
+			return pr_fail("%ld %ld\n",
+					now.tv_sec, tst.tv_sec);
+	}
+
+	pid = vfork();
+	if (pid < 0)
+		return pr_perror("vfork");
+
+	if (pid == 0) {
+		char now_str[64];
+		char *cargv[] = {"exec", now_str, NULL};
+		char *cenv[] = {NULL};
+
+		/*
+		 * After vfork() the child must not return from the function
+		 * that called vfork(), so every exit path here uses _exit().
+		 */
+		// Check that we are still in the source timens.
+		for (i = 0; i < 2; i++) {
+			_gettime(CLOCK_MONOTONIC, &tst, i);
+			if (labs(tst.tv_sec - now.tv_sec) > 5)
+				_exit(pr_fail("%ld %ld\n",
+					      now.tv_sec, tst.tv_sec));
+		}
+
+		/* Check for proper vvar offsets after execve. */
+		snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET);
+		execve("/proc/self/exe", cargv, cenv);
+		_exit(pr_perror("execve"));
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return pr_perror("waitpid");
+
+	if (status)
+		ksft_exit_fail();
+
+	ksft_test_result_pass("exec\n");
+	ksft_exit_pass();
+	return 0;
+}
diff --git a/tools/thermal/tmon/pid.c b/tools/thermal/tmon/pid.c index 296f69c00c57..da20088285bd 100644 --- a/tools/thermal/tmon/pid.c +++ b/tools/thermal/tmon/pid.c @@ -27,7 +27,7 @@ /************************************************************************** * PID (Proportional-Integral-Derivative) controller is commonly used in - * linear control system, consider the the process. + * linear control system, consider the process. 
* G(s) = U(s)/E(s) * kp = proportional gain * ki = integral gain diff --git a/tools/thermal/tmon/tmon.h b/tools/thermal/tmon/tmon.h index c9066ec104dd..44d16d778f04 100644 --- a/tools/thermal/tmon/tmon.h +++ b/tools/thermal/tmon/tmon.h @@ -27,6 +27,9 @@ #define NR_LINES_TZDATA 1 #define TMON_LOG_FILE "/var/tmp/tmon.log" +#include <sys/time.h> +#include <pthread.h> + extern unsigned long ticktime; extern double time_elapsed; extern unsigned long target_temp_user;
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 9b68658b6bb8..5b98f3ee58a5 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -233,6 +233,32 @@ static unsigned long read_slab_obj(struct slabinfo *s, const char *name)
 	return l;
 }
 
+/*
+ * Read /sys/kernel/debug/slab/<s->name>/<name> into the shared `buffer`
+ * declared outside this function.
+ *
+ * Returns the number of bytes read, or 0 with an empty buffer when the file
+ * cannot be opened.  At most sizeof(buffer) - 1 bytes are read so that the
+ * terminating NUL always lands inside buffer.
+ */
+static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name)
+{
+	char x[128];
+	FILE *f;
+	size_t l;
+
+	snprintf(x, sizeof(x), "/sys/kernel/debug/slab/%s/%s", s->name, name);
+	f = fopen(x, "r");
+	if (!f) {
+		buffer[0] = 0;
+		l = 0;
+	} else {
+		/* leave one byte for the terminator written below */
+		l = fread(buffer, 1, sizeof(buffer) - 1, f);
+		buffer[l] = 0;
+		fclose(f);
+	}
+	return l;
+}
 
 /*
  * Put a size string together
@@ -409,14 +435,18 @@ static void show_tracking(struct slabinfo *s) { printf("\n%s: Kernel object allocation\n", s->name); printf("-----------------------------------------------------------------------\n"); - if (read_slab_obj(s, "alloc_calls")) + if (read_debug_slab_obj(s, "alloc_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "alloc_calls")) printf("%s", buffer); else printf("No Data\n"); printf("\n%s: Kernel object freeing\n", s->name); printf("------------------------------------------------------------------------\n"); - if (read_slab_obj(s, "free_calls")) + if (read_debug_slab_obj(s, "free_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "free_calls")) printf("%s", buffer); else printf("No Data\n"); |