From 86dd6c04ef9f213e14d60c9f64bce1cc019f816e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 8 Mar 2024 12:18:08 +0100 Subject: sched/balancing: Rename scheduler_tick() => sched_tick() - Standardize on prefixing scheduler-internal functions defined in with sched_*() prefix. scheduler_tick() was the only function using the scheduler_ prefix. Harmonize it. - The other reason to rename it is the NOHZ scheduler tick handling functions are already named sched_tick_*(). Make the 'git grep sched_tick' more meaningful. Signed-off-by: Ingo Molnar Acked-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20240308111819.1101550-3-mingo@kernel.org --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 17cb0761ff65..7eb7f31af796 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -301,7 +301,7 @@ enum { TASK_COMM_LEN = 16, }; -extern void scheduler_tick(void); +extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX -- cgit v1.2.3-59-g8ed1b From 22d5607400c62c72da9b60e3324744be83e147a4 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 24 Mar 2024 00:45:50 +0000 Subject: sched/fair: Check if a task has a fitting CPU when updating misfit If a misfit task is affined to a subset of the possible CPUs, we need to verify that one of these CPUs can fit it. Otherwise the load balancer code will continuously trigger needlessly leading the balance_interval to increase in return and eventually end up with a situation where real imbalances take a long time to address because of this impossible imbalance situation. This can happen in Android world where it's common for background tasks to be restricted to little cores. Similarly if we can't fit the biggest core, triggering misfit is pointless as it is the best we can ever get on this system. To be able to detect that; we use asym_cap_list to iterate through capacities in the system to see if the task is able to run at a higher capacity level based on its p->cpus_ptr. We do that when the affinity change, a fair task is forked, or when a task switched to fair policy. We store the max_allowed_capacity in task_struct to allow for cheap comparison in the fast path. Improve check_misfit_status() function by removing redundant checks. misfit_task_load will be 0 if the task can't move to a bigger CPU. And nohz_balancer_kick() already checks for cpu_check_capacity() before calling check_misfit_status(). Test: ===== Add trace_printk("balance_interval = %lu\n", interval) in get_sd_balance_interval(). run if [ "$MASK" != "0" ]; then adb shell "taskset -a $MASK cat /dev/zero > /dev/null" fi sleep 10 // parse ftrace buffer counting the occurrence of each valaue Where MASK is either: * 0: no busy task running * 1: busy task is pinned to 1 cpu; handled today to not cause misfit * f: busy task pinned to little cores, simulates busy background task, demonstrates the problem to be fixed Results: ======== Note how occurrence of balance_interval = 128 overshoots for MASK = f. 
BEFORE ------ MASK=0 1 balance_interval = 175 120 balance_interval = 128 846 balance_interval = 64 55 balance_interval = 63 215 balance_interval = 32 2 balance_interval = 31 2 balance_interval = 16 4 balance_interval = 8 1870 balance_interval = 4 65 balance_interval = 2 MASK=1 27 balance_interval = 175 37 balance_interval = 127 840 balance_interval = 64 167 balance_interval = 63 449 balance_interval = 32 84 balance_interval = 31 304 balance_interval = 16 1156 balance_interval = 8 2781 balance_interval = 4 428 balance_interval = 2 MASK=f 1 balance_interval = 175 1328 balance_interval = 128 44 balance_interval = 64 101 balance_interval = 63 25 balance_interval = 32 5 balance_interval = 31 23 balance_interval = 16 23 balance_interval = 8 4306 balance_interval = 4 177 balance_interval = 2 AFTER ----- Note how the high values almost disappear for all MASK values. The system has background tasks that could trigger the problem without simulate it even with MASK=0. MASK=0 103 balance_interval = 63 19 balance_interval = 31 194 balance_interval = 8 4827 balance_interval = 4 179 balance_interval = 2 MASK=1 131 balance_interval = 63 1 balance_interval = 31 87 balance_interval = 8 3600 balance_interval = 4 7 balance_interval = 2 MASK=f 8 balance_interval = 127 182 balance_interval = 63 3 balance_interval = 31 9 balance_interval = 16 415 balance_interval = 8 3415 balance_interval = 4 21 balance_interval = 2 Signed-off-by: Qais Yousef Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20240324004552.999936-3-qyousef@layalina.io --- include/linux/sched.h | 1 + init/init_task.c | 1 + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 16 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ed40e9f6155..c75fd46506df 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct task_struct { #endif unsigned int policy; + unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; diff --git a/init/init_task.c b/init/init_task.c index 4daee6d761c8..2558b719e053 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .cpus_ptr = &init_task.cpus_mask, .user_cpus_ptr = NULL, .cpus_mask = CPU_MASK_ALL, + .max_allowed_capacity = SCHED_CAPACITY_SCALE, .nr_cpus_allowed= NR_CPUS, .mm = NULL, .active_mm = &init_mm, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8270e2e15cb..c47c4f2e28f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5098,15 +5098,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { + int cpu = cpu_of(rq); + if (!sched_asym_cpucap_active()) return; - if (!p || p->nr_cpus_allowed == 1) { - rq->misfit_task_load = 0; - return; - } + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { - if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -8253,6 +8257,36 @@ static void task_dead_fair(struct task_struct *p) remove_entity_load_avg(&p->se); } +/* + * Set the max capacity the task is allowed to run at for misfit detection. 
+ */ +static void set_task_max_allowed_capacity(struct task_struct *p) +{ + struct asym_cap_data *entry; + + if (!sched_asym_cpucap_active()) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; + + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); +} + +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) +{ + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); +} + static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -8261,6 +8295,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} #endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) @@ -9610,16 +9646,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -11923,7 +11953,7 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq, sd)) { + if (check_misfit_status(rq)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12648,6 +12678,8 @@ static void task_fork_fair(struct task_struct *p) rq_lock(rq, &rf); update_rq_clock(rq); + set_task_max_allowed_capacity(p); + cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; if (curr) @@ -12771,6 +12803,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) { attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so @@ -13142,7 +13176,7 @@ DEFINE_SCHED_CLASS(fair) = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, -- cgit v1.2.3-59-g8ed1b From 22d407b164ff79de42d21f37d99f9ee7abdd51c8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 21 Mar 2024 09:36:35 -0700 Subject: lib: add allocation tagging support for memory allocation profiling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily instrument memory allocators. It registers an "alloc_tags" codetag type with /proc/allocinfo interface to output allocation tag information when the feature is enabled. CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory allocation profiling instrumentation. Memory allocation profiling can be enabled or disabled at runtime using /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n. 
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation profiling by default. [surenb@google.com: Documentation/filesystems/proc.rst: fix allocinfo title] Link: https://lkml.kernel.org/r/20240326073813.727090-1-surenb@google.com [surenb@google.com: do limited memory accounting for modules with ARCH_NEEDS_WEAK_PER_CPU] Link: https://lkml.kernel.org/r/20240402180933.1663992-2-surenb@google.com [klarasmodin@gmail.com: explicitly include irqflags.h in alloc_tag.h] Link: https://lkml.kernel.org/r/20240407133252.173636-1-klarasmodin@gmail.com [surenb@google.com: fix alloc_tag_init() to prevent passing NULL to PTR_ERR()] Link: https://lkml.kernel.org/r/20240417003349.2520094-1-surenb@google.com Link: https://lkml.kernel.org/r/20240321163705.3067592-14-surenb@google.com Signed-off-by: Suren Baghdasaryan Co-developed-by: Kent Overstreet Signed-off-by: Kent Overstreet Signed-off-by: Klara Modin Tested-by: Kees Cook Cc: Alexander Viro Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Boqun Feng Cc: Christoph Lameter Cc: Dennis Zhou Cc: Gary Guo Cc: Miguel Ojeda Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/vm.rst | 16 ++++ Documentation/filesystems/proc.rst | 29 ++++++ include/asm-generic/codetag.lds.h | 14 +++ include/asm-generic/vmlinux.lds.h | 3 + include/linux/alloc_tag.h | 156 ++++++++++++++++++++++++++++++++ include/linux/sched.h | 24 +++++ lib/Kconfig.debug | 25 +++++ lib/Makefile | 2 + lib/alloc_tag.c | 152 +++++++++++++++++++++++++++++++ scripts/module.lds.S | 7 ++ 10 files changed, 428 insertions(+) create mode 100644 include/asm-generic/codetag.lds.h create mode 100644 include/linux/alloc_tag.h create mode 100644 lib/alloc_tag.c (limited to 'include/linux/sched.h') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index c59889de122b..e86c968a7a0e 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: - legacy_va_layout - lowmem_reserve_ratio - max_map_count +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) - memory_failure_early_kill - memory_failure_recovery - min_free_kbytes @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. The default value is 65530. +mem_profiling +============== + +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) + +1: Enable memory profiling. + +0: Disable memory profiling. + +Enabling memory profiling introduces a small performance overhead for all +memory allocations. + +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. + + memory_failure_early_kill: ========================== diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index c6a6b9df2104..245269dd6e02 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -688,6 +688,7 @@ files are there, and which are missing. 
============ =============================================================== File Content ============ =============================================================== + allocinfo Memory allocations profiling information apm Advanced power management info bootconfig Kernel command line obtained from boot config, and, if there were kernel parameters from the @@ -953,6 +954,34 @@ also be allocatable although a lot of filesystem metadata may have to be reclaimed to achieve this. +allocinfo +~~~~~~~~~ + +Provides information about memory allocations at all locations in the code +base. Each allocation in the code is identified by its source file, line +number, module (if originates from a loadable module) and the function calling +the allocation. The number of bytes allocated and number of calls at each +location are reported. + +Example output. + +:: + + > sort -rn /proc/allocinfo + 127664128 31168 mm/page_ext.c:270 func:alloc_page_ext + 56373248 4737 mm/slub.c:2259 func:alloc_slab_page + 14880768 3633 mm/readahead.c:247 func:page_cache_ra_unbounded + 14417920 3520 mm/mm_init.c:2530 func:alloc_large_system_hash + 13377536 234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs + 11718656 2861 mm/filemap.c:1919 func:__filemap_get_folio + 9192960 2800 kernel/fork.c:307 func:alloc_thread_stack_node + 4206592 4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable + 4136960 1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start + 3940352 962 mm/memory.c:4214 func:alloc_anon_folio + 2894464 22613 fs/kernfs/dir.c:615 func:__kernfs_new_node + ... + + meminfo ~~~~~~~ diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h new file mode 100644 index 000000000000..64f536b80380 --- /dev/null +++ b/include/asm-generic/codetag.lds.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_GENERIC_CODETAG_LDS_H +#define __ASM_GENERIC_CODETAG_LDS_H + +#define SECTION_WITH_BOUNDARIES(_name) \ + . = ALIGN(8); \ + __start_##_name = .; \ + KEEP(*(_name)) \ + __stop_##_name = .; + +#define CODETAG_SECTIONS() \ + SECTION_WITH_BOUNDARIES(alloc_tags) + +#endif /* __ASM_GENERIC_CODETAG_LDS_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f7749d0f2562..3e4497b5135a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -50,6 +50,8 @@ * [__nosave_begin, __nosave_end] for the nosave data */ +#include + #ifndef LOAD_OFFSET #define LOAD_OFFSET 0 #endif @@ -366,6 +368,7 @@ . = ALIGN(8); \ BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ + CODETAG_SECTIONS() \ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h new file mode 100644 index 000000000000..13561223fdab --- /dev/null +++ b/include/linux/alloc_tag.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * allocation tagging + */ +#ifndef _LINUX_ALLOC_TAG_H +#define _LINUX_ALLOC_TAG_H + +#include +#include +#include +#include +#include +#include +#include +#include + +struct alloc_tag_counters { + u64 bytes; + u64 calls; +}; + +/* + * An instance of this structure is created in a special ELF section at every + * allocation callsite. At runtime, the special section is treated as + * an array of these. Embedded codetag utilizes codetag framework. 
+ */ +struct alloc_tag { + struct codetag ct; + struct alloc_tag_counters __percpu *counters; +} __aligned(8); + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) +{ + return container_of(ct, struct alloc_tag, ct); +} + +#ifdef ARCH_NEEDS_WEAK_PER_CPU +/* + * When percpu variables are required to be defined as weak, static percpu + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). + * Instead we will accound all module allocations to a single counter. + */ +DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); + +#define DEFINE_ALLOC_TAG(_alloc_tag) \ + static struct alloc_tag _alloc_tag __used __aligned(8) \ + __section("alloc_tags") = { \ + .ct = CODE_TAG_INIT, \ + .counters = &_shared_alloc_tag }; + +#else /* ARCH_NEEDS_WEAK_PER_CPU */ + +#define DEFINE_ALLOC_TAG(_alloc_tag) \ + static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ + static struct alloc_tag _alloc_tag __used __aligned(8) \ + __section("alloc_tags") = { \ + .ct = CODE_TAG_INIT, \ + .counters = &_alloc_tag_cntr }; + +#endif /* ARCH_NEEDS_WEAK_PER_CPU */ + +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + mem_alloc_profiling_key); + +static inline bool mem_alloc_profiling_enabled(void) +{ + return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + &mem_alloc_profiling_key); +} + +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) +{ + struct alloc_tag_counters v = { 0, 0 }; + struct alloc_tag_counters *counter; + int cpu; + + for_each_possible_cpu(cpu) { + counter = per_cpu_ptr(tag->counters, cpu); + v.bytes += counter->bytes; + v.calls += counter->calls; + } + + return v; +} + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG +static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) +{ + WARN_ONCE(ref && ref->ct, + "alloc_tag was not cleared (got tag for %s:%u)\n", + ref->ct->filename, ref->ct->lineno); + + WARN_ONCE(!tag, "current->alloc_tag not set"); +} + +static inline void alloc_tag_sub_check(union codetag_ref *ref) +{ + WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); +} +#else +static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {} +static inline void alloc_tag_sub_check(union codetag_ref *ref) {} +#endif + +/* Caller should verify both ref and tag to be valid */ +static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +{ + ref->ct = &tag->ct; + /* + * We need in increment the call counter every time we have a new + * allocation or when we split a large allocation into smaller ones. + * Each new reference for every sub-allocation needs to increment call + * counter because when we free each part the counter will be decremented. 
+ */ + this_cpu_inc(tag->counters->calls); +} + +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) +{ + alloc_tag_add_check(ref, tag); + if (!ref || !tag) + return; + + __alloc_tag_ref_set(ref, tag); + this_cpu_add(tag->counters->bytes, bytes); +} + +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) +{ + struct alloc_tag *tag; + + alloc_tag_sub_check(ref); + if (!ref || !ref->ct) + return; + + tag = ct_to_alloc_tag(ref->ct); + + this_cpu_sub(tag->counters->bytes, bytes); + this_cpu_dec(tag->counters->calls); + + ref->ct = NULL; +} + +#else /* CONFIG_MEM_ALLOC_PROFILING */ + +#define DEFINE_ALLOC_TAG(_alloc_tag) +static inline bool mem_alloc_profiling_enabled(void) { return false; } +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, + size_t bytes) {} +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + +#endif /* _LINUX_ALLOC_TAG_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c2abbc587b4..4118b3f959c3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -770,6 +770,10 @@ struct task_struct { unsigned int flags; unsigned int ptrace; +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct alloc_tag *alloc_tag; +#endif + #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; @@ -810,6 +814,7 @@ struct task_struct { struct task_group *sched_task_group; #endif + #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. @@ -2187,4 +2192,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); +#ifdef CONFIG_MEM_ALLOC_PROFILING +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) +{ + swap(current->alloc_tag, tag); + return tag; +} + +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) +{ +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); +#endif + current->alloc_tag = old; +} +#else +#define alloc_tag_save(_tag) NULL +#define alloc_tag_restore(_tag, _old) do {} while (0) +#endif + #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 015fc6ee9849..fa7aa32ba11a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -972,6 +972,31 @@ config CODE_TAGGING bool select KALLSYMS +config MEM_ALLOC_PROFILING + bool "Enable memory allocation profiling" + default n + depends on PROC_FS + depends on !DEBUG_FORCE_WEAK_PER_CPU + select CODE_TAGGING + help + Track allocation source code and record total allocation size + initiated at that code location. The mechanism can be used to track + memory leaks with a low performance and memory impact. + +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT + bool "Enable memory allocation profiling by default" + default y + depends on MEM_ALLOC_PROFILING + +config MEM_ALLOC_PROFILING_DEBUG + bool "Memory allocation profiler debugging" + default n + depends on MEM_ALLOC_PROFILING + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT + help + Adds warnings with helpful error messages for memory allocation + profiling. 
+ source "lib/Kconfig.kasan" source "lib/Kconfig.kfence" source "lib/Kconfig.kmsan" diff --git a/lib/Makefile b/lib/Makefile index 910335da8f13..2f4e17bfb299 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -234,6 +234,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-$(CONFIG_CODE_TAGGING) += codetag.o +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o + lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c new file mode 100644 index 000000000000..331dd17650f3 --- /dev/null +++ b/lib/alloc_tag.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +static struct codetag_type *alloc_tag_cttype; + +DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); +EXPORT_SYMBOL(_shared_alloc_tag); + +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + mem_alloc_profiling_key); + +static void *allocinfo_start(struct seq_file *m, loff_t *pos) +{ + struct codetag_iterator *iter; + struct codetag *ct; + loff_t node = *pos; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + m->private = iter; + if (!iter) + return NULL; + + codetag_lock_module_list(alloc_tag_cttype, true); + *iter = codetag_get_ct_iter(alloc_tag_cttype); + while ((ct = codetag_next_ct(iter)) != NULL && node) + node--; + + return ct ? iter : NULL; +} + +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)arg; + struct codetag *ct = codetag_next_ct(iter); + + (*pos)++; + if (!ct) + return NULL; + + return iter; +} + +static void allocinfo_stop(struct seq_file *m, void *arg) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)m->private; + + if (iter) { + codetag_lock_module_list(alloc_tag_cttype, false); + kfree(iter); + } +} + +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) +{ + struct alloc_tag *tag = ct_to_alloc_tag(ct); + struct alloc_tag_counters counter = alloc_tag_read(tag); + s64 bytes = counter.bytes; + + seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls); + codetag_to_text(out, ct); + seq_buf_putc(out, ' '); + seq_buf_putc(out, '\n'); +} + +static int allocinfo_show(struct seq_file *m, void *arg) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)arg; + char *bufp; + size_t n = seq_get_buf(m, &bufp); + struct seq_buf buf; + + seq_buf_init(&buf, bufp, n); + alloc_tag_to_text(&buf, iter->ct); + seq_commit(m, seq_buf_used(&buf)); + return 0; +} + +static const struct seq_operations allocinfo_seq_op = { + .start = allocinfo_start, + .next = allocinfo_next, + .stop = allocinfo_stop, + .show = allocinfo_show, +}; + +static void __init procfs_init(void) +{ + proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); +} + +static bool alloc_tag_module_unload(struct codetag_type *cttype, + struct codetag_module *cmod) +{ + struct codetag_iterator iter = codetag_get_ct_iter(cttype); + struct alloc_tag_counters counter; + bool module_unused = true; + struct alloc_tag *tag; + struct codetag *ct; + + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { + if (iter.cmod != cmod) + continue; + + tag = ct_to_alloc_tag(ct); + counter = alloc_tag_read(tag); + + if (WARN(counter.bytes, + "%s:%u module %s func:%s has %llu allocated at module unload", + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) + module_unused = false; + } + + 
return module_unused; +} + +static struct ctl_table memory_allocation_profiling_sysctls[] = { + { + .procname = "mem_profiling", + .data = &mem_alloc_profiling_key, +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + .mode = 0444, +#else + .mode = 0644, +#endif + .proc_handler = proc_do_static_key, + }, + { } +}; + +static int __init alloc_tag_init(void) +{ + const struct codetag_type_desc desc = { + .section = "alloc_tags", + .tag_size = sizeof(struct alloc_tag), + .module_unload = alloc_tag_module_unload, + }; + + alloc_tag_cttype = codetag_register_type(&desc); + if (IS_ERR(alloc_tag_cttype)) + return PTR_ERR(alloc_tag_cttype); + + register_sysctl_init("vm", memory_allocation_profiling_sysctls); + procfs_init(); + + return 0; +} +module_init(alloc_tag_init); diff --git a/scripts/module.lds.S b/scripts/module.lds.S index bf5bcf2836d8..45c67a0994f3 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -9,6 +9,8 @@ #define DISCARD_EH_FRAME *(.eh_frame) #endif +#include + SECTIONS { /DISCARD/ : { *(.discard) @@ -47,12 +49,17 @@ SECTIONS { .data : { *(.data .data.[0-9a-zA-Z_]*) *(.data..L*) + CODETAG_SECTIONS() } .rodata : { *(.rodata .rodata.[0-9a-zA-Z_]*) *(.rodata..L*) } +#else + .data : { + CODETAG_SECTIONS() + } #endif } -- cgit v1.2.3-59-g8ed1b From 76edc534cc289308130272a2ac28694fc9b72a03 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 9 May 2024 03:26:28 +0000 Subject: memcg, oom: cleanup unused memcg_oom_gfp_mask and memcg_oom_order Since commit 857f21397f71 ("memcg, oom: remove unnecessary check in mem_cgroup_oom_synchronize()"), memcg_oom_gfp_mask and memcg_oom_order are no longer used any more. Link: https://lkml.kernel.org/r/20240509032628.1217652-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Acked-by: Michal Hocko Acked-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Muchun Song Cc: Benjamin Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt (Google) Cc: Valentin Schneider Cc: Vincent Guittot Signed-off-by: Andrew Morton --- include/linux/sched.h | 2 -- mm/memcontrol.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4118b3f959c3..427de5e4754b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1448,8 +1448,6 @@ struct task_struct { #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; - gfp_t memcg_oom_gfp_mask; - int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eef02a59b8c9..7fad15b2290c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2188,8 +2188,6 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) if (current->in_user_fault) { css_get(&memcg->css); current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; } return false; } -- cgit v1.2.3-59-g8ed1b
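
As additional context for the memory allocation profiling patch above: it only introduces the alloc_tag helpers; the allocator call sites are wired up by later patches in the series that are not part of this log. The following is a rough, hypothetical sketch (assuming CONFIG_MEM_ALLOC_PROFILING=y on an arch without ARCH_NEEDS_WEAK_PER_CPU) of how those helpers are intended to fit together. my_obj, my_alloc(), my_free() and my_alloc_traced(), along with the inline codetag_ref, are made-up illustrations; real allocators keep the reference in per-page or per-object metadata instead.

	#include <linux/alloc_tag.h>
	#include <linux/codetag.h>
	#include <linux/sched.h>
	#include <linux/slab.h>

	/* Hypothetical object with an embedded codetag_ref. */
	struct my_obj {
		union codetag_ref ref;	/* records which call site allocated this */
		unsigned long data;
	};

	/* Allocator side: charge the allocation to the caller's current tag. */
	static struct my_obj *my_alloc(void)
	{
		struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

		if (obj && mem_alloc_profiling_enabled())
			alloc_tag_add(&obj->ref, current->alloc_tag, sizeof(*obj));
		return obj;
	}

	static void my_free(struct my_obj *obj)
	{
		if (obj && mem_alloc_profiling_enabled())
			alloc_tag_sub(&obj->ref, sizeof(*obj));	/* undo accounting */
		kfree(obj);
	}

	/*
	 * Call-site side: define a tag for this location and make it "current"
	 * around the allocation, so /proc/allocinfo attributes it here.
	 */
	static struct my_obj *my_alloc_traced(void)
	{
		DEFINE_ALLOC_TAG(_alloc_tag);
		struct alloc_tag *old = alloc_tag_save(&_alloc_tag);
		struct my_obj *obj = my_alloc();

		alloc_tag_restore(&_alloc_tag, old);
		return obj;
	}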