diff options
author | 2018-11-06 13:57:02 +0100 | |
---|---|---|
committer | 2018-11-06 13:57:02 +0100 | |
commit | 0c7244209588630a9b45e52490ef1390e04499a6 (patch) | |
tree | d60f5610f9b36ff05ed5202396c811628bacdb61 /kernel | |
parent | HID: i2c-hid: add Direkt-Tek DTLAPY133-1 to descriptor override (diff) | |
parent | Merge tag 'platform-drivers-x86-v4.20-1' of git://git.infradead.org/linux-platform-drivers-x86 (diff) | |
download | linux-dev-0c7244209588630a9b45e52490ef1390e04499a6.tar.xz linux-dev-0c7244209588630a9b45e52490ef1390e04499a6.zip |
Merge branch 'master' into for-4.20/upstream-fixes
Pull in a merge commit that brought in 3b692c55e58d ("HID: asus: only
support backlight when it's not driven by WMI") so that fixup could be
applied on top of it.
Diffstat (limited to 'kernel')
55 files changed, 2778 insertions, 1716 deletions
diff --git a/kernel/bounds.c b/kernel/bounds.c index c373e887c066..9795d75b09b2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -13,7 +13,7 @@ #include <linux/log2.h> #include <linux/spinlock_types.h> -void foo(void) +int main(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); @@ -23,4 +23,6 @@ void foo(void) #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); /* End of constants */ + + return 0; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 378cef70341c..ee4c82667d65 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2067,56 +2067,47 @@ static int btf_check_sec_info(struct btf_verifier_env *env, return 0; } -static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, - u32 btf_data_size) +static int btf_parse_hdr(struct btf_verifier_env *env) { + u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; - u32 hdr_len, hdr_copy; - /* - * Minimal part of the "struct btf_header" that - * contains the hdr_len. - */ - struct btf_min_header { - u16 magic; - u8 version; - u8 flags; - u32 hdr_len; - } __user *min_hdr; struct btf *btf; int err; btf = env->btf; - min_hdr = btf_data; + btf_data_size = btf->data_size; - if (btf_data_size < sizeof(*min_hdr)) { + if (btf_data_size < + offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } - if (get_user(hdr_len, &min_hdr->hdr_len)) - return -EFAULT; - + hdr = btf->data; + hdr_len = hdr->hdr_len; if (btf_data_size < hdr_len) { btf_verifier_log(env, "btf_header not found"); return -EINVAL; } - err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); - if (err) { - if (err == -E2BIG) - btf_verifier_log(env, "Unsupported btf_header"); - return err; + /* Ensure the unsupported header fields are zero */ + if (hdr_len > sizeof(btf->hdr)) { + u8 *expected_zero = btf->data + sizeof(btf->hdr); + u8 *end = btf->data + hdr_len; + + for (; expected_zero < end; expected_zero++) { + if (*expected_zero) { + btf_verifier_log(env, "Unsupported btf_header"); + return -E2BIG; + } + } } hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); - if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) - return -EFAULT; + memcpy(&btf->hdr, btf->data, hdr_copy); hdr = &btf->hdr; - if (hdr->hdr_len != hdr_len) - return -EINVAL; - btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { @@ -2186,10 +2177,6 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, } env->btf = btf; - err = btf_parse_hdr(env, btf_data, btf_data_size); - if (err) - goto errout; - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; @@ -2198,13 +2185,18 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, btf->data = data; btf->data_size = btf_data_size; - btf->nohdr_data = btf->data + btf->hdr.hdr_len; if (copy_from_user(data, btf_data, btf_data_size)) { err = -EFAULT; goto errout; } + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + err = btf_parse_str_sec(env); if (err) goto errout; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c7eeea8cffc..6377225b2082 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -365,10 +365,13 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) } #ifdef CONFIG_BPF_JIT +# define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000) + /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_harden __read_mostly; int bpf_jit_kallsyms __read_mostly; +int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, @@ -577,27 +580,64 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +static atomic_long_t bpf_jit_current; + +#if defined(MODULES_VADDR) +static int __init bpf_jit_charge_init(void) +{ + /* Only used as heuristic here to derive limit. */ + bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, + PAGE_SIZE), INT_MAX); + return 0; +} +pure_initcall(bpf_jit_charge_init); +#endif + +static int bpf_jit_charge_modmem(u32 pages) +{ + if (atomic_long_add_return(pages, &bpf_jit_current) > + (bpf_jit_limit >> PAGE_SHIFT)) { + if (!capable(CAP_SYS_ADMIN)) { + atomic_long_sub(pages, &bpf_jit_current); + return -EPERM; + } + } + + return 0; +} + +static void bpf_jit_uncharge_modmem(u32 pages) +{ + atomic_long_sub(pages, &bpf_jit_current); +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - unsigned int size, hole, start; + u32 size, hole, start, pages; /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + pages = size / PAGE_SIZE; + + if (bpf_jit_charge_modmem(pages)) + return NULL; hdr = module_alloc(size); - if (hdr == NULL) + if (!hdr) { + bpf_jit_uncharge_modmem(pages); return NULL; + } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = size / PAGE_SIZE; + hdr->pages = pages; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -610,7 +650,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { + u32 pages = hdr->pages; + module_memfree(hdr); + bpf_jit_uncharge_modmem(pages); } /* This symbol is only overridden by archs that have different diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 141710b82a6c..191b79948424 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -512,8 +512,7 @@ static int dev_map_notification(struct notifier_block *notifier, struct bpf_dtab_netdev *dev, *odev; dev = READ_ONCE(dtab->netdev_map[i]); - if (!dev || - dev->dev->ifindex != netdev->ifindex) + if (!dev || netdev != dev->dev) continue; odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); if (dev == odev) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ab0d5e3f9892..a74972b07e74 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -99,7 +99,6 @@ BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, @@ -113,7 +112,6 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, - .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 12a93fb37449..8bbd72d3a121 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -122,6 +122,7 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } @@ -151,6 +152,7 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) raw_spin_lock_irqsave(&qs->lock, flags); if (queue_stack_map_is_empty(qs)) { + memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98fa0be35370..171a2c88e77d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1387,21 +1387,24 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, enum bpf_access_type t) { switch (env->prog->type) { + /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_REUSEPORT: - /* dst_input() and dst_output() can't write for now */ + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; /* fallthrough */ + + /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -5706,7 +5709,11 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) bool is_narrower_load; u32 target_size; - if (ops->gen_prologue) { + if (ops->gen_prologue || env->seen_direct_write) { + if (!ops->gen_prologue) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..8b79318810ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/sched/cputime.h> +#include <linux/psi.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - rcu_assign_pointer(task->cgroups, to_cset); + cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } @@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; } +#ifdef CONFIG_PSI +static int cgroup_io_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +} +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +} +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +} +#endif + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, +#ifdef CONFIG_PSI + { + .name = "io.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_io_pressure_show, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_memory_pressure_show, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_cpu_pressure_show, + }, +#endif { } /* terminate */ }; @@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); @@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; - ret = cgroup_bpf_inherit(cgrp); + + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_idr_free; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_psi_free: + psi_cgroup_free(cgrp); out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: diff --git a/kernel/compat.c b/kernel/compat.c index 8e40efc2928a..089d00d0da9c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -93,28 +93,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) return 0; } -static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..bb4fe4e1a601 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac2824f0b..2a12b988c717 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; + d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; + d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_count); } +void __delayacct_thrashing_start(void) +{ + current->delays->thrashing_start = ktime_get_ns(); +} + +void __delayacct_thrashing_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->thrashing_start, + ¤t->delays->thrashing_delay, + ¤t->delays->thrashing_count); +} diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 87a6bc2a96c0..22a12ab5a5e9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -4,7 +4,7 @@ * * DMA operations that map physical memory directly without using an IOMMU. */ -#include <linux/bootmem.h> /* for max_pfn */ +#include <linux/memblock.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> #include <linux/dma-direct.h> @@ -14,8 +14,6 @@ #include <linux/pfn.h> #include <linux/set_memory.h> -#define DIRECT_MAPPING_ERROR 0 - /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but * some use it for entirely different regions: diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 4f8a6dbf0b60..5731daa09a32 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -21,6 +21,7 @@ #include <linux/cache.h> #include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -39,7 +40,7 @@ #include <asm/dma.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/iommu-helper.h> #define CREATE_TRACE_POINTS @@ -73,13 +74,6 @@ static phys_addr_t io_tlb_start, io_tlb_end; static unsigned long io_tlb_nslabs; /* - * When the IOMMU overflows we return a fallback buffer. This sets the size. - */ -static unsigned long io_tlb_overflow = 32*1024; - -static phys_addr_t io_tlb_overflow_buffer; - -/* * This is a free list describing the number of free entries available from * each index */ @@ -126,7 +120,6 @@ setup_io_tlb_npages(char *str) return 0; } early_param("swiotlb", setup_io_tlb_npages); -/* make io_tlb_overflow tunable too? */ unsigned long swiotlb_nr_tbl(void) { @@ -194,16 +187,10 @@ void __init swiotlb_update_mem_attributes(void) bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); memset(vaddr, 0, bytes); - - vaddr = phys_to_virt(io_tlb_overflow_buffer); - bytes = PAGE_ALIGN(io_tlb_overflow); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - void *v_overflow_buffer; unsigned long i, bytes; bytes = nslabs << IO_TLB_SHIFT; @@ -213,25 +200,14 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_end = io_tlb_start + bytes; /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = memblock_virt_alloc_low_nopanic( - PAGE_ALIGN(io_tlb_overflow), - PAGE_SIZE); - if (!v_overflow_buffer) - return -ENOMEM; - - io_tlb_overflow_buffer = __pa(v_overflow_buffer); - - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = memblock_virt_alloc( + io_tlb_list = memblock_alloc( PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), PAGE_SIZE); - io_tlb_orig_addr = memblock_virt_alloc( + io_tlb_orig_addr = memblock_alloc( PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { @@ -266,7 +242,7 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; @@ -330,7 +306,6 @@ int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { unsigned long i, bytes; - unsigned char *v_overflow_buffer; bytes = nslabs << IO_TLB_SHIFT; @@ -342,19 +317,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) memset(tlb, 0, bytes); /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!v_overflow_buffer) - goto cleanup2; - - set_memory_decrypted((unsigned long)v_overflow_buffer, - io_tlb_overflow >> PAGE_SHIFT); - memset(v_overflow_buffer, 0, io_tlb_overflow); - io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); - - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. @@ -390,10 +352,6 @@ cleanup4: sizeof(int))); io_tlb_list = NULL; cleanup3: - free_pages((unsigned long)v_overflow_buffer, - get_order(io_tlb_overflow)); - io_tlb_overflow_buffer = 0; -cleanup2: io_tlb_end = 0; io_tlb_start = 0; io_tlb_nslabs = 0; @@ -407,8 +365,6 @@ void __init swiotlb_exit(void) return; if (late_alloc) { - free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), - get_order(io_tlb_overflow)); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -416,8 +372,6 @@ void __init swiotlb_exit(void) free_pages((unsigned long)phys_to_virt(io_tlb_start), get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - memblock_free_late(io_tlb_overflow_buffer, - PAGE_ALIGN(io_tlb_overflow)); memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); memblock_free_late(__pa(io_tlb_list), @@ -429,7 +383,7 @@ void __init swiotlb_exit(void) max_segment = 0; } -int is_swiotlb_buffer(phys_addr_t paddr) +static int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= io_tlb_start && paddr < io_tlb_end; } @@ -591,26 +545,6 @@ found: } /* - * Allocates bounce buffer and returns its physical address. - */ -static phys_addr_t -map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t start_dma_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) { - dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", - &phys); - return SWIOTLB_MAP_ERROR; - } - - start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); - return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, - dir, attrs); -} - -/* * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, @@ -689,104 +623,32 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } } -static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, - size_t size) -{ - u64 mask = DMA_BIT_MASK(32); - - if (dev && dev->coherent_dma_mask) - mask = dev->coherent_dma_mask; - return addr + size - 1 <= mask; -} - -static void * -swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned long attrs) +static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - goto out_warn; - - phys_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), - 0, size, DMA_FROM_DEVICE, attrs); - if (phys_addr == SWIOTLB_MAP_ERROR) - goto out_warn; - - *dma_handle = __phys_to_dma(dev, phys_addr); - if (!dma_coherent_ok(dev, *dma_handle, size)) - goto out_unmap; - - memset(phys_to_virt(phys_addr), 0, size); - return phys_to_virt(phys_addr); + dma_addr_t dma_addr; -out_unmap: - dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dev->coherent_dma_mask, - (unsigned long long)*dma_handle); - - /* - * DMA_TO_DEVICE to avoid memcpy in unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); -out_warn: - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { - dev_warn(dev, - "swiotlb: coherent allocation failed, size=%zu\n", - size); - dump_stack(); + if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { + dev_warn_ratelimited(dev, + "Cannot do DMA to address %pa\n", phys); + return DIRECT_MAPPING_ERROR; } - return NULL; -} - -static bool swiotlb_free_buffer(struct device *dev, size_t size, - dma_addr_t dma_addr) -{ - phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); - WARN_ON_ONCE(irqs_disabled()); - - if (!is_swiotlb_buffer(phys_addr)) - return false; - - /* - * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - return true; -} - -static void -swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, - int do_panic) -{ - if (swiotlb_force == SWIOTLB_NO_FORCE) - return; - - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * unless they check for dma_mapping_error (most don't) - * When the mapping is small enough return a static buffer to limit - * the damage, or panic when the transfer is too big. - */ - dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", - size); + /* Oh well, have to allocate and map a bounce buffer. */ + *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), + *phys, size, dir, attrs); + if (*phys == SWIOTLB_MAP_ERROR) + return DIRECT_MAPPING_ERROR; - if (size <= io_tlb_overflow || !do_panic) - return; + /* Ensure that the address returned is DMA'ble */ + dma_addr = __phys_to_dma(dev, *phys); + if (unlikely(!dma_capable(dev, dma_addr, size))) { + swiotlb_tbl_unmap_single(dev, *phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); + return DIRECT_MAPPING_ERROR; + } - if (dir == DMA_BIDIRECTIONAL) - panic("DMA: Random memory could be DMA accessed\n"); - if (dir == DMA_FROM_DEVICE) - panic("DMA: Random memory could be DMA written\n"); - if (dir == DMA_TO_DEVICE) - panic("DMA: Random memory could be DMA read\n"); + return dma_addr; } /* @@ -801,7 +663,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); BUG_ON(dir == DMA_NONE); @@ -810,28 +672,17 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) - return dev_addr; - - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - - /* Oh well, have to allocate and map a bounce buffer. */ - map = map_single(dev, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - swiotlb_full(dev, size, dir, 1); - return __phys_to_dma(dev, io_tlb_overflow_buffer); + if (!dma_capable(dev, dev_addr, size) || + swiotlb_force == SWIOTLB_FORCE) { + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + dev_addr = swiotlb_bounce_page(dev, &phys, size, dir, attrs); } - dev_addr = __phys_to_dma(dev, map); + if (!dev_is_dma_coherent(dev) && + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + arch_sync_dma_for_device(dev, phys, size, dir); - /* Ensure that the address returned is DMA'ble */ - if (dma_capable(dev, dev_addr, size)) - return dev_addr; - - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - - return __phys_to_dma(dev, io_tlb_overflow_buffer); + return dev_addr; } /* @@ -842,14 +693,18 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ -static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); + if (!dev_is_dma_coherent(hwdev) && + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + arch_sync_dma_for_cpu(hwdev, paddr, size, dir); + if (is_swiotlb_buffer(paddr)) { swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); return; @@ -867,13 +722,6 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, dma_mark_clean(phys_to_virt(paddr), size); } -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - unmap_single(hwdev, dev_addr, size, dir, attrs); -} - /* * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. @@ -893,15 +741,17 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(paddr)) { + if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_CPU) + arch_sync_dma_for_cpu(hwdev, paddr, size, dir); + + if (is_swiotlb_buffer(paddr)) swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; - } - if (dir != DMA_FROM_DEVICE) - return; + if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE) + arch_sync_dma_for_device(hwdev, paddr, size, dir); - dma_mark_clean(phys_to_virt(paddr), size); + if (!is_swiotlb_buffer(paddr) && dir == DMA_FROM_DEVICE) + dma_mark_clean(phys_to_virt(paddr), size); } void @@ -925,48 +775,31 @@ swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, * appropriate dma address and length. They are obtained via * sg_dma_{address,length}(SG). * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * * Device ownership issues as mentioned above for swiotlb_map_page are the * same here. */ int -swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, +swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; - BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); - - if (swiotlb_force == SWIOTLB_FORCE || - !dma_capable(hwdev, dev_addr, sg->length)) { - phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - /* Don't panic here, we expect map_sg users - to do proper error handling. */ - swiotlb_full(hwdev, sg->length, dir, 0); - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, - attrs); - sg_dma_len(sgl) = 0; - return 0; - } - sg->dma_address = __phys_to_dma(hwdev, map); - } else - sg->dma_address = dev_addr; + sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset, + sg->length, dir, attrs); + if (sg->dma_address == DIRECT_MAPPING_ERROR) + goto out_error; sg_dma_len(sg) = sg->length; } + return nelems; + +out_error: + swiotlb_unmap_sg_attrs(dev, sgl, i, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); + sg_dma_len(sgl) = 0; + return 0; } /* @@ -984,7 +817,7 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } @@ -1022,12 +855,6 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); } -int -swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) -{ - return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); -} - /* * Return whether the given device DMA address mask can be supported * properly. For example, if your device can only drive the low 24-bits @@ -1040,39 +867,10 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask) return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } -void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - void *vaddr; - - /* temporary workaround: */ - if (gfp & __GFP_NOWARN) - attrs |= DMA_ATTR_NO_WARN; - - /* - * Don't print a warning when the first allocation attempt fails. - * swiotlb_alloc_coherent() will print a warning when the DMA memory - * allocation ultimately failed. - */ - gfp |= __GFP_NOWARN; - - vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); - if (!vaddr) - vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); - return vaddr; -} - -void swiotlb_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - if (!swiotlb_free_buffer(dev, size, dma_addr)) - dma_direct_free(dev, size, vaddr, dma_addr, attrs); -} - const struct dma_map_ops swiotlb_dma_ops = { - .mapping_error = swiotlb_dma_mapping_error, - .alloc = swiotlb_alloc, - .free = swiotlb_free, + .mapping_error = dma_direct_mapping_error, + .alloc = dma_direct_alloc, + .free = dma_direct_free, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a97f34bc14c..8c490130c4fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8376,30 +8376,39 @@ static struct pmu perf_tracepoint = { * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe + * + * The following values specify a reference counter (or semaphore in the + * terminology of tools like dtrace, systemtap, etc.) Userspace Statically + * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. + * + * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset + * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ + PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, + PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); +#endif -static struct attribute *probe_attrs[] = { +#ifdef CONFIG_KPROBE_EVENTS +static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; -static struct attribute_group probe_format_group = { +static struct attribute_group kprobe_format_group = { .name = "format", - .attrs = probe_attrs, + .attrs = kprobe_attrs, }; -static const struct attribute_group *probe_attr_groups[] = { - &probe_format_group, +static const struct attribute_group *kprobe_attr_groups[] = { + &kprobe_format_group, NULL, }; -#endif -#ifdef CONFIG_KPROBE_EVENTS static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, @@ -8409,7 +8418,7 @@ static struct pmu perf_kprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) @@ -8441,6 +8450,24 @@ static int perf_kprobe_event_init(struct perf_event *event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS +PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); + +static struct attribute *uprobe_attrs[] = { + &format_attr_retprobe.attr, + &format_attr_ref_ctr_offset.attr, + NULL, +}; + +static struct attribute_group uprobe_format_group = { + .name = "format", + .attrs = uprobe_attrs, +}; + +static const struct attribute_group *uprobe_attr_groups[] = { + &uprobe_format_group, + NULL, +}; + static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, @@ -8450,12 +8477,13 @@ static struct pmu perf_uprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; + unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) @@ -8471,7 +8499,8 @@ static int perf_uprobe_event_init(struct perf_event *event) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; - err = perf_uprobe_init(event, is_retprobe); + ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2bf792d22087..96d4bee83489 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -73,6 +73,7 @@ struct uprobe { struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ loff_t offset; + loff_t ref_ctr_offset; unsigned long flags; /* @@ -88,6 +89,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct delayed_uprobe { + struct list_head list; + struct uprobe *uprobe; + struct mm_struct *mm; +}; + +static DEFINE_MUTEX(delayed_uprobe_lock); +static LIST_HEAD(delayed_uprobe_list); + /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -282,6 +292,166 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t return 1; } +static struct delayed_uprobe * +delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + list_for_each_entry(du, &delayed_uprobe_list, list) + if (du->uprobe == uprobe && du->mm == mm) + return du; + return NULL; +} + +static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + if (delayed_uprobe_check(uprobe, mm)) + return 0; + + du = kzalloc(sizeof(*du), GFP_KERNEL); + if (!du) + return -ENOMEM; + + du->uprobe = uprobe; + du->mm = mm; + list_add(&du->list, &delayed_uprobe_list); + return 0; +} + +static void delayed_uprobe_delete(struct delayed_uprobe *du) +{ + if (WARN_ON(!du)) + return; + list_del(&du->list); + kfree(du); +} + +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + + if (!uprobe && !mm) + return; + + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (uprobe && du->uprobe != uprobe) + continue; + if (mm && du->mm != mm) + continue; + + delayed_uprobe_delete(du); + } +} + +static bool valid_ref_ctr_vma(struct uprobe *uprobe, + struct vm_area_struct *vma) +{ + unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); + + return uprobe->ref_ctr_offset && + vma->vm_file && + file_inode(vma->vm_file) == uprobe->inode && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + vma->vm_start <= vaddr && + vma->vm_end > vaddr; +} + +static struct vm_area_struct * +find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *tmp; + + for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + if (valid_ref_ctr_vma(uprobe, tmp)) + return tmp; + + return NULL; +} + +static int +__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) +{ + void *kaddr; + struct page *page; + struct vm_area_struct *vma; + int ret; + short *ptr; + + if (!vaddr || !d) + return -EINVAL; + + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + FOLL_WRITE, &page, &vma, NULL); + if (unlikely(ret <= 0)) { + /* + * We are asking for 1 page. If get_user_pages_remote() fails, + * it may return 0, in that case we have to return error. + */ + return ret == 0 ? -EBUSY : ret; + } + + kaddr = kmap_atomic(page); + ptr = kaddr + (vaddr & ~PAGE_MASK); + + if (unlikely(*ptr + d < 0)) { + pr_warn("ref_ctr going negative. vaddr: 0x%lx, " + "curr val: %d, delta: %d\n", vaddr, *ptr, d); + ret = -EINVAL; + goto out; + } + + *ptr += d; + ret = 0; +out: + kunmap_atomic(kaddr); + put_page(page); + return ret; +} + +static void update_ref_ctr_warn(struct uprobe *uprobe, + struct mm_struct *mm, short d) +{ + pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, + (unsigned long long) uprobe->offset, + (unsigned long long) uprobe->ref_ctr_offset, mm); +} + +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, + short d) +{ + struct vm_area_struct *rc_vma; + unsigned long rc_vaddr; + int ret = 0; + + rc_vma = find_ref_ctr_vma(uprobe, mm); + + if (rc_vma) { + rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); + ret = __update_ref_ctr(mm, rc_vaddr, d); + if (ret) + update_ref_ctr_warn(uprobe, mm, d); + + if (d > 0) + return ret; + } + + mutex_lock(&delayed_uprobe_lock); + if (d > 0) + ret = delayed_uprobe_add(uprobe, mm); + else + delayed_uprobe_remove(uprobe, mm); + mutex_unlock(&delayed_uprobe_lock); + + return ret; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -302,9 +472,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { + struct uprobe *uprobe; struct page *old_page, *new_page; struct vm_area_struct *vma; - int ret; + int ret, is_register, ref_ctr_updated = 0; + + is_register = is_swbp_insn(&opcode); + uprobe = container_of(auprobe, struct uprobe, arch); retry: /* Read the page with vaddr into memory */ @@ -317,6 +491,15 @@ retry: if (ret <= 0) goto put_old; + /* We are going to replace instruction, update ref_ctr. */ + if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); + if (ret) + goto put_old; + + ref_ctr_updated = 1; + } + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -337,6 +520,11 @@ put_old: if (unlikely(ret == -EAGAIN)) goto retry; + + /* Revert back reference counter if instruction update failed. */ + if (ret && is_register && ref_ctr_updated) + update_ref_ctr(uprobe, mm, -1); + return ret; } @@ -378,8 +566,15 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe) static void put_uprobe(struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) + if (atomic_dec_and_test(&uprobe->ref)) { + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + delayed_uprobe_remove(uprobe, NULL); kfree(uprobe); + } } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -484,7 +679,18 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +static void +ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) +{ + pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " + "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", + uprobe->inode->i_ino, (unsigned long long) uprobe->offset, + (unsigned long long) cur_uprobe->ref_ctr_offset, + (unsigned long long) uprobe->ref_ctr_offset); +} + +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; @@ -494,6 +700,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = inode; uprobe->offset = offset; + uprobe->ref_ctr_offset = ref_ctr_offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -501,6 +708,12 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { + if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { + ref_ctr_mismatch_warn(cur_uprobe, uprobe); + put_uprobe(cur_uprobe); + kfree(uprobe); + return ERR_PTR(-EINVAL); + } kfree(uprobe); uprobe = cur_uprobe; } @@ -895,7 +1108,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); * else return 0 (success) */ static int __uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) + loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -912,9 +1125,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; retry: - uprobe = alloc_uprobe(inode, offset); + uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) return -ENOMEM; + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); + /* * We can race with uprobe_unregister()->delete_uprobe(). * Check uprobe_is_active() and retry if it is false. @@ -938,10 +1154,17 @@ static int __uprobe_register(struct inode *inode, loff_t offset, int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { - return __uprobe_register(inode, offset, uc); + return __uprobe_register(inode, offset, 0, uc); } EXPORT_SYMBOL_GPL(uprobe_register); +int uprobe_register_refctr(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset, struct uprobe_consumer *uc) +{ + return __uprobe_register(inode, offset, ref_ctr_offset, uc); +} +EXPORT_SYMBOL_GPL(uprobe_register_refctr); + /* * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. @@ -1060,6 +1283,35 @@ static void build_probe_list(struct inode *inode, spin_unlock(&uprobes_treelock); } +/* @vma contains reference counter, not the probed instruction. */ +static int delayed_ref_ctr_inc(struct vm_area_struct *vma) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + unsigned long vaddr; + int ret = 0, err = 0; + + mutex_lock(&delayed_uprobe_lock); + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (du->mm != vma->vm_mm || + !valid_ref_ctr_vma(du->uprobe, vma)) + continue; + + vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); + ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); + if (ret) { + update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); + if (!err) + err = ret; + } + delayed_uprobe_delete(du); + } + mutex_unlock(&delayed_uprobe_lock); + return err; +} + /* * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * @@ -1072,7 +1324,15 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (no_uprobe_events() || !valid_vma(vma, true)) + if (no_uprobe_events()) + return 0; + + if (vma->vm_file && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + delayed_ref_ctr_inc(vma); + + if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); @@ -1246,6 +1506,10 @@ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(NULL, mm); + mutex_unlock(&delayed_uprobe_lock); + if (!area) return; diff --git a/kernel/fail_function.c b/kernel/fail_function.c index bc80a4e268c0..17f75b545f66 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -173,8 +173,7 @@ static void fei_debugfs_remove_attr(struct fei_attr *attr) struct dentry *dir; dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); - if (dir) - debugfs_remove_recursive(dir); + debugfs_remove_recursive(dir); } static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) diff --git a/kernel/fork.c b/kernel/fork.c index f0b58479534f..8f82a3bdcb8f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } + /* + * Allocated stacks are cached and later reused by new threads, + * so memcg accounting is performed manually on assigning/releasing + * stacks to tasks. Drop __GFP_ACCOUNT. + */ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, + THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK - if (task_stack_vm_area(tsk)) { + struct vm_struct *vm = task_stack_vm_area(tsk); + + if (vm) { int i; + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + -(int)(PAGE_SIZE / 1024)); + + memcg_kmem_uncharge(vm->pages[i], 0); + } + for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) != NULL) @@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) NR_KERNEL_STACK_KB, PAGE_SIZE / 1024 * account); } - - /* All stack pages belong to the same memcg. */ - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } +static int memcg_charge_kernel_stack(struct task_struct *tsk) +{ +#ifdef CONFIG_VMAP_STACK + struct vm_struct *vm = task_stack_vm_area(tsk); + int ret; + + if (vm) { + int i; + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + /* + * If memcg_kmem_charge() fails, page->mem_cgroup + * pointer is NULL, and both memcg_kmem_uncharge() + * and mod_memcg_page_state() in free_thread_stack() + * will ignore this page. So it's safe. + */ + ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); + if (ret) + return ret; + + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + PAGE_SIZE / 1024); + } + } +#endif + return 0; +} + static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(tsk->state != TASK_DEAD)) @@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + if (memcg_charge_kernel_stack(tsk)) + goto free_stack; + stack_vm_area = task_stack_vm_area(tsk); err = arch_dup_task_struct(tsk, orig); @@ -1779,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process( p->default_timer_slack_ns = current->timer_slack_ns; +#ifdef CONFIG_PSI + p->psi_flags = 0; +#endif + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/futex.c b/kernel/futex.c index 3e2de8fc1891..f423f9b6577e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -65,7 +65,7 @@ #include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/freezer.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/fault-inject.h> #include <asm/futex.h> diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83f830acbb5f..410a77a8f6e2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -173,7 +173,7 @@ err_unlock: } COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct compat_timespec __user *, utime, u32 __user *, uaddr2, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { struct timespec ts; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b9132d1269ef..cb8e3e8ac7b9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include <linux/lockdep.h> #include <linux/export.h> #include <linux/sysctl.h> +#include <linux/suspend.h> #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> @@ -242,6 +243,28 @@ void reset_hung_task_detector(void) } EXPORT_SYMBOL_GPL(reset_hung_task_detector); +static bool hung_detector_suspended; + +static int hungtask_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + hung_detector_suspended = true; + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + hung_detector_suspended = false; + break; + default: + break; + } + return NOTIFY_OK; +} + /* * kthread which checks for tasks stuck in D state */ @@ -261,7 +284,8 @@ static int watchdog(void *dummy) interval = min_t(unsigned long, interval, timeout); t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { - if (!atomic_xchg(&reset_hung_task, 0)) + if (!atomic_xchg(&reset_hung_task, 0) && + !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; @@ -275,6 +299,10 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + /* Disable hung task detector on suspend */ + pm_notifier(hungtask_pm_notify, 0); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5d9fc01b60a6..3366d11c3e02 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -183,7 +183,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... */ - name = kstrdup(of_node_full_name(of_node), GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "%pOF", of_node); if (!name) { kfree(domain); return NULL; @@ -867,7 +867,7 @@ void irq_dispose_mapping(unsigned int virq) EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** - * irq_find_mapping() - Find a linux irq from an hw irq number. + * irq_find_mapping() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space */ @@ -1741,6 +1741,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { debugfs_remove(d->debugfs_file); + d->debugfs_file = NULL; } void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fb86146037a7..9dbdccab3b6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -927,6 +927,9 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); local_bh_enable(); return ret; @@ -943,6 +946,9 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, irqreturn_t ret; ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); return ret; } @@ -1020,8 +1026,6 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (action_ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 02a0b01380d8..f3a04994e063 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -37,7 +37,7 @@ extern const u8 kallsyms_names[] __weak; * Tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV). */ -extern const unsigned long kallsyms_num_syms +extern const unsigned int kallsyms_num_syms __attribute__((weak, section(".rodata"))); extern const unsigned long kallsyms_relative_base @@ -46,7 +46,7 @@ __attribute__((weak, section(".rodata"))); extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __weak; +extern const unsigned int kallsyms_markers[] __weak; /* * Expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 0130e488ebfe..8f36c27c1794 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -4,7 +4,7 @@ #endif #include <linux/hash.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/debug_locks.h> /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..9eced2cc9f94 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,47 +1,21 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2015 Intel Corporation. All rights reserved. */ -#include <linux/radix-tree.h> #include <linux/device.h> -#include <linux/types.h> -#include <linux/pfn_t.h> #include <linux/io.h> #include <linux/kasan.h> -#include <linux/mm.h> #include <linux/memory_hotplug.h> +#include <linux/mm.h> +#include <linux/pfn_t.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/types.h> #include <linux/wait_bit.h> +#include <linux/xarray.h> -static DEFINE_MUTEX(pgmap_lock); -static RADIX_TREE(pgmap_radix, GFP_KERNEL); +static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -static unsigned long order_at(struct resource *res, unsigned long pgoff) -{ - unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; - unsigned long nr_pages, mask; - - nr_pages = PHYS_PFN(resource_size(res)); - if (nr_pages == pgoff) - return ULONG_MAX; - - /* - * What is the largest aligned power-of-2 range available from - * this resource pgoff to the end of the resource range, - * considering the alignment of the current pgoff? - */ - mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); - if (!mask) - return ULONG_MAX; - - return find_first_bit(&mask, BITS_PER_LONG); -} - -#define foreach_order_pgoff(res, order, pgoff) \ - for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ - pgoff += 1UL << order, order = order_at((res), pgoff)) - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@ -70,18 +44,10 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) +static void pgmap_array_delete(struct resource *res) { - unsigned long pgoff, order; - - mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) { - if (pgoff >= end_pgoff) - break; - radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); - } - mutex_unlock(&pgmap_lock); - + xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + NULL, GFP_KERNEL); synchronize_rcu(); } @@ -142,7 +108,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res, -1); + pgmap_array_delete(res); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -175,10 +141,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; + struct dev_pagemap *conflict_pgmap; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; - struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -216,20 +181,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgmap->dev = dev; - mutex_lock(&pgmap_lock); - error = 0; - - foreach_order_pgoff(res, order, pgoff) { - error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, pgmap); - if (error) { - dev_err(dev, "%s: failed: %d\n", __func__, error); - break; - } - } - mutex_unlock(&pgmap_lock); + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), + PHYS_PFN(res->end), pgmap, GFP_KERNEL)); if (error) - goto err_radix; + goto err_array; nid = dev_to_node(dev); if (nid < 0) @@ -256,19 +211,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. - */ - list_del(&page->lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); devm_add_action(dev, devm_memremap_pages_release, pgmap); @@ -279,8 +229,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: - err_radix: - pgmap_radix_release(res, pgoff); + pgmap_array_delete(res); + err_array: return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -320,7 +270,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); + pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) pgmap = NULL; rcu_read_unlock(); diff --git a/kernel/panic.c b/kernel/panic.c index 8b2e002d52eb..f6d549a29a5c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -136,7 +136,7 @@ void panic(const char *fmt, ...) { static char buf[1024]; va_list args; - long i, i_next = 0; + long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; @@ -173,8 +173,12 @@ void panic(const char *fmt, ...) console_verbose(); bust_spinlocks(1); va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* @@ -631,7 +635,7 @@ device_initcall(register_warn_debugfs); */ __visible void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted in: %pB\n", + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/pid.c b/kernel/pid.c index cdf63e53a014..b2f6c506035d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -31,7 +31,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/hash.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3d37c279c090..b0308a2c6000 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -23,7 +23,7 @@ #include <linux/pm.h> #include <linux/device.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/nmi.h> #include <linux/syscalls.h> #include <linux/console.h> @@ -963,7 +963,8 @@ void __init __register_nosave_region(unsigned long start_pfn, BUG_ON(!region); } else { /* This allocation cannot fail */ - region = memblock_virt_alloc(sizeof(struct nosave_region), 0); + region = memblock_alloc(sizeof(struct nosave_region), + SMP_CACHE_BYTES); } region->start_pfn = start_pfn; region->end_pfn = end_pfn; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9bf5404397e0..1b2a029360b7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -16,6 +16,8 @@ * 01Mar01 Andrew Morton */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> @@ -29,7 +31,6 @@ #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> -#include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/crash_core.h> @@ -192,16 +193,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } -/* - * Number of registered extended console drivers. - * - * If extended consoles are present, in-kernel cont reassembly is disabled - * and each fragment is stored as a separate log entry with proper - * continuation flag so that every emitted message has full metadata. This - * doesn't change the result for regular consoles or /proc/kmsg. For - * /dev/kmsg, as long as the reader concatenates messages according to - * consecutive continuation flags, the end result should be the same too. - */ +/* Number of registered extended console drivers. */ static int nr_ext_console_drivers; /* @@ -423,6 +415,7 @@ static u32 log_next_idx; /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; +static u64 exclusive_console_stop_seq; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; @@ -437,6 +430,7 @@ static u32 clear_idx; /* record buffer */ #define LOG_ALIGN __alignof__(struct printk_log) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -1037,18 +1031,28 @@ void log_buf_vmcoreinfo_setup(void) static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ -static void __init log_buf_len_update(unsigned size) +static void __init log_buf_len_update(u64 size) { + if (size > (u64)LOG_BUF_LEN_MAX) { + size = (u64)LOG_BUF_LEN_MAX; + pr_err("log_buf over 2G is not supported.\n"); + } + if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) - new_log_buf_len = size; + new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { - unsigned size = memparse(str, &str); + u64 size; + + if (!str) + return -EINVAL; + + size = memparse(str, &str); log_buf_len_update(size); @@ -1093,7 +1097,7 @@ void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; - int free; + unsigned int free; if (log_buf != __log_buf) return; @@ -1106,14 +1110,14 @@ void __init setup_log_buf(int early) if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); + memblock_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + new_log_buf = memblock_alloc_nopanic(new_log_buf_len, LOG_ALIGN); } if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %ld bytes not available\n", + pr_err("log_buf_len: %lu bytes not available\n", new_log_buf_len); return; } @@ -1126,8 +1130,8 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); logbuf_unlock_irqrestore(flags); - pr_info("log_buf_len: %d bytes\n", log_buf_len); - pr_info("early log buf free: %d(%d%%)\n", + pr_info("log_buf_len: %u bytes\n", log_buf_len); + pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -1767,12 +1771,8 @@ static void cont_flush(void) static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { - /* - * If ext consoles are present, flush and skip in-kernel - * continuation. See nr_ext_console_drivers definition. Also, if - * the line gets too long, split it up in separate records. - */ - if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { + /* If the line gets too long, split it up in separate records. */ + if (cont.len + len > sizeof(cont.buf)) { cont_flush(); return false; } @@ -1795,9 +1795,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * cont_flush(); } - if (cont.len > (sizeof(cont.buf) * 80) / 100) - cont_flush(); - return true; } @@ -1889,8 +1886,9 @@ asmlinkage int vprintk_emit(int facility, int level, const char *fmt, va_list args) { int printed_len; - bool in_sched = false; + bool in_sched = false, pending_output; unsigned long flags; + u64 curr_log_seq; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; @@ -1902,11 +1900,13 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); + curr_log_seq = log_next_seq; printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); + pending_output = (curr_log_seq != log_next_seq); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (!in_sched && pending_output) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -1923,7 +1923,8 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - wake_up_klogd(); + if (pending_output) + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2009,6 +2010,7 @@ static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; static u32 console_idx; +static u64 exclusive_console_stop_seq; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; @@ -2351,8 +2353,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, "** %u printk messages dropped **\n", - (unsigned)(log_first_seq - console_seq)); + len = sprintf(text, + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; @@ -2376,6 +2379,12 @@ skip: goto skip; } + /* Output to all consoles once old messages replayed. */ + if (unlikely(exclusive_console && + console_seq >= exclusive_console_stop_seq)) { + exclusive_console = NULL; + } + len += msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG, text + len, @@ -2418,10 +2427,6 @@ skip: console_locked = 0; - /* Release the exclusive_console once it is used */ - if (unlikely(exclusive_console)) - exclusive_console = NULL; - raw_spin_unlock(&logbuf_lock); up_console_sem(); @@ -2688,8 +2693,7 @@ void register_console(struct console *newcon) } if (newcon->flags & CON_EXTENDED) - if (!nr_ext_console_drivers++) - pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); + nr_ext_console_drivers++; if (newcon->flags & CON_PRINTBUFFER) { /* @@ -2699,13 +2703,18 @@ void register_console(struct console *newcon) logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; - logbuf_unlock_irqrestore(flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to * the already-registered consoles. + * + * Set exclusive_console with disabled interrupts to reduce + * race window with eventual console_flush_on_panic() that + * ignores console_lock. */ exclusive_console = newcon; + exclusive_console_stop_seq = console_seq; + logbuf_unlock_irqrestore(flags); } console_unlock(); console_sysfs_notify(); diff --git a/kernel/profile.c b/kernel/profile.c index 9aa2a4445b0d..9c08a2c7cb1d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -16,7 +16,7 @@ #include <linux/export.h> #include <linux/profile.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/notifier.h> #include <linux/mm.h> #include <linux/cpumask.h> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..21fb5a5662b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe0223121883..f12225f26b70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + } p->sched_class->enqueue_task(rq, p, flags); } @@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } p->sched_class->dequeue_task(rq, p, flags); } @@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } @@ -2876,6 +2881,18 @@ unsigned long long nr_context_switches(void) } /* + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. + */ + +unsigned long nr_iowait_cpu(int cpu) +{ + return atomic_read(&cpu_rq(cpu)->nr_iowait); +} + +/* * IO-wait accounting, and how its mostly bollocks (on SMP). * * The idea behind IO-wait account is to account the idle time that we could @@ -2910,31 +2927,11 @@ unsigned long nr_iowait(void) unsigned long i, sum = 0; for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); + sum += nr_iowait_cpu(i); return sum; } -/* - * Consumers of these two interfaces, like for example the cpuidle menu - * governor, are using nonsensical data. Preferring shallow idle state selection - * for a CPU that has IO-wait which might not even end up running the task when - * it does become runnable. - */ - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *rq = this_rq(); - *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->load.weight; -} - #ifdef CONFIG_SMP /* @@ -3051,6 +3048,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq); rq_unlock(rq, &rf); @@ -4933,9 +4931,7 @@ static void do_sched_yield(void) struct rq_flags rf; struct rq *rq; - local_irq_disable(); - rq = this_rq(); - rq_lock(rq, &rf); + rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -5244,7 +5240,7 @@ out_unlock: * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) + struct __kernel_timespec __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5255,16 +5251,16 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return retval; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, - struct compat_timespec __user *, interval) + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); if (retval == 0) - retval = compat_put_timespec64(&t, interval); + retval = put_old_timespec32(&t, interval); return retval; } #endif @@ -6069,6 +6065,8 @@ void __init sched_init(void) init_schedstats(); + psi_init(); + scheduler_running = 1; } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..28a516575c18 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,19 +91,73 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. */ static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) { - unsigned long newload; + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; + return result; +} - return newload / FIXED_1; +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); } #ifdef CONFIG_NO_HZ_COMMON @@ -225,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; } -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 000000000000..7cdecfc010af --- /dev/null +++ b/kernel/sched/psi.c @@ -0,0 +1,759 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner <hannes@cmpxchg.org> + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. + * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible exceution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possibe threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. + * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include <linux/sched/loadavg.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/seqlock.h> +#include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/psi.h> +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. + */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL; + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group *group; + void *iter = NULL; + + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b8c007713b3b..618577fc9aa8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -54,6 +54,7 @@ #include <linux/proc_fs.h> #include <linux/prefetch.h> #include <linux/profile.h> +#include <linux/psi.h> #include <linux/rcupdate_wait.h> #include <linux/security.h> #include <linux/stop_machine.h> @@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED #include <linux/cgroup.h> +#include <linux/psi.h> struct cfs_rq; struct rt_rq; @@ -957,6 +959,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +extern void update_rq_clock(struct rq *rq); + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -1075,6 +1079,98 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1717,8 +1813,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -extern void update_rq_clock(struct rq *rq); - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1783,86 +1877,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) #endif #endif -struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); - -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); - -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -} - -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irq(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -} - -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); -} - -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8aea199a39b4..4904c4677000 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { diff --git a/kernel/signal.c b/kernel/signal.c index dbd2e4db24cf..9a32bc2088c9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -892,7 +892,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) /* * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and - * notify its parent. See get_signal_to_deliver(). + * notify its parent. See get_signal(). */ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; @@ -3165,7 +3165,7 @@ int copy_siginfo_from_user32(struct kernel_siginfo *to, * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, - const struct timespec *ts) + const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; @@ -3173,9 +3173,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, int sig, ret = 0; if (ts) { - if (!timespec_valid(ts)) + if (!timespec64_valid(ts)) return -EINVAL; - timeout = timespec_to_ktime(*ts); + timeout = timespec64_to_ktime(*ts); to = &timeout; } @@ -3223,11 +3223,12 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, - siginfo_t __user *, uinfo, const struct timespec __user *, uts, + siginfo_t __user *, uinfo, + const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; - struct timespec ts; + struct timespec64 ts; kernel_siginfo_t info; int ret; @@ -3239,7 +3240,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (copy_from_user(&ts, uts, sizeof(ts))) + if (get_timespec64(&ts, uts)) return -EFAULT; } @@ -3256,10 +3257,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) + struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; - struct timespec t; + struct timespec64 t; kernel_siginfo_t info; long ret; @@ -3270,7 +3271,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (compat_get_timespec(&t, uts)) + if (get_old_timespec32(&t, uts)) return -EFAULT; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 7a0720a20003..d28813306b2c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -257,9 +257,9 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) int softirq_bit; /* - * Mask out PF_MEMALLOC s current task context is borrowed for the - * softirq. A softirq handled such as network RX might set PF_MEMALLOC - * again if the socket is related to swap + * Mask out PF_MEMALLOC as the current task context is borrowed for the + * softirq. A softirq handled, such as network RX, might set PF_MEMALLOC + * again if the socket is related to swapping. */ current->flags &= ~PF_MEMALLOC; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e1a549c9e399..9cdd74bd2d27 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1660,7 +1660,7 @@ int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: - if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) + if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif @@ -1780,12 +1780,12 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; - if (compat_get_timespec64(&tu, rqtp)) + if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 2c6847d5d69b..989ccf028bde 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -162,20 +162,20 @@ COMPAT_SYS_NI(setitimer); #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec64(&new_tp, tp)) + if (get_old_timespec32(&new_tp, tp)) return -EFAULT; return do_sys_settimeofday64(&new_tp, NULL); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -184,13 +184,13 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, if (ret) return ret; - if (compat_put_timespec64(&kernel_tp, tp)) + if (put_old_timespec32(&kernel_tp, tp)) return -EFAULT; return 0; } COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -201,7 +201,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec64(&rtn_tp, tp)) + if (put_old_timespec32(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -210,8 +210,8 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; @@ -224,7 +224,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index eabb4c22728d..bd62b5eeb5a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -755,13 +755,13 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - if (put_compat_itimerspec64(&cur_setting, setting)) + if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -928,8 +928,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -937,12 +937,12 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!new) return -EINVAL; - if (get_compat_itimerspec64(&new_spec, new)) + if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - if (put_compat_itimerspec64(&old_spec, old)) + if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; @@ -1115,7 +1115,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1123,14 +1123,14 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec64(&ts, tp)) + if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1141,7 +1141,7 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, err = kc->clock_get(which_clock, &ts); - if (!err && compat_put_timespec64(&ts, tp)) + if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; @@ -1180,7 +1180,7 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1190,7 +1190,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, return -EINVAL; err = kc->clock_getres(which_clock, &ts); - if (!err && tp && compat_put_timespec64(&ts, tp)) + if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; @@ -1237,8 +1237,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; @@ -1248,7 +1248,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, if (!kc->nsleep) return -EOPNOTSUPP; - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index aa2094d5dd27..be0aac2b4300 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -400,8 +400,6 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); - if (!tick_device_is_functional(dev)) - break; if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5b33e2f5c0ed..69e673b88474 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -885,7 +885,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (need_resched()) return false; - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + if (unlikely(local_softirq_pending())) { static int ratelimit; if (ratelimit < 10 && diff --git a/kernel/time/time.c b/kernel/time/time.c index ccdb351277ee..e3a7f7fd3abc 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -104,12 +104,12 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_TIME -/* compat_time_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +/* old_time32_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) { - compat_time_t i; + old_time32_t i; - i = (compat_time_t)ktime_get_real_seconds(); + i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -119,7 +119,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -144,9 +144,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { @@ -223,14 +225,15 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { - struct timeval ktv; + struct timespec64 ts; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { @@ -241,7 +244,7 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, return 0; } -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; @@ -342,30 +345,6 @@ unsigned int jiffies_to_usecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_usecs); -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. Always rounds down. gran must - * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* Avoid division in the common cases 1 ns and 1 s. */ - if (gran == 1) { - /* nothing */ - } else if (gran == NSEC_PER_SEC) { - t.tv_nsec = 0; - } else if (gran > 1 && gran < NSEC_PER_SEC) { - t.tv_nsec -= t.tv_nsec % gran; - } else { - WARN(1, "illegal file time granularity: %u", gran); - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - /* * mktime64 - Converts date to seconds. * Converts Gregorian date to seconds since 1970-01-01 00:00:00. @@ -884,10 +863,10 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); -int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) +static int __get_old_timespec32(struct timespec64 *ts64, + const struct old_timespec32 __user *cts) { - struct compat_timespec ts; + struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); @@ -900,33 +879,33 @@ int __compat_get_timespec64(struct timespec64 *ts64, return 0; } -int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) +static int __put_old_timespec32(const struct timespec64 *ts64, + struct old_timespec32 __user *cts) { - struct compat_timespec ts = { + struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_get_timespec64(ts, uts); + return __get_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_get_timespec64); +EXPORT_SYMBOL_GPL(get_old_timespec32); -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_put_timespec64(ts, uts); + return __put_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_put_timespec64); +EXPORT_SYMBOL_GPL(put_old_timespec32); int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -958,23 +937,23 @@ int put_itimerspec64(const struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(put_itimerspec64); -int get_compat_itimerspec64(struct itimerspec64 *its, - const struct compat_itimerspec __user *uits) +int get_old_itimerspec32(struct itimerspec64 *its, + const struct old_itimerspec32 __user *uits) { - if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || - __compat_get_timespec64(&its->it_value, &uits->it_value)) + if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || + __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(get_compat_itimerspec64); +EXPORT_SYMBOL_GPL(get_old_itimerspec32); -int put_compat_itimerspec64(const struct itimerspec64 *its, - struct compat_itimerspec __user *uits) +int put_old_itimerspec32(const struct itimerspec64 *its, + struct old_itimerspec32 __user *uits) { - if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || - __compat_put_timespec64(&its->it_value, &uits->it_value)) + if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || + __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(put_compat_itimerspec64); +EXPORT_SYMBOL_GPL(put_old_itimerspec32); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f3b22f456fac..2d110c948805 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1212,22 +1212,6 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using getnstimeofday() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec64 now; - - getnstimeofday64(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} -EXPORT_SYMBOL(do_gettimeofday); - -/** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * @@ -2174,14 +2158,6 @@ void getboottime64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(getboottime64); -unsigned long get_seconds(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return tk->xtime_sec; -} -EXPORT_SYMBOL(get_seconds); - void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bf6f1d70484d..ff1c4b20cd0a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2727,6 +2727,7 @@ void trace_dump_stack(int skip) __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } +EXPORT_SYMBOL_GPL(trace_dump_stack); static DEFINE_PER_CPU(int, user_stack_count); @@ -4621,13 +4622,18 @@ static const char readme_msg[] = "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" #endif #ifdef CONFIG_UPROBE_EVENTS - "\t place: <path>:<offset>\n" + " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n" #endif "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + "\t $stack<index>, $stack, $retval, $comm, $arg<N>\n" +#else "\t $stack<index>, $stack, $retval, $comm\n" - "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" - "\t b<bit-width>@<bit-offset>/<container-size>\n" +#endif + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" + "\t b<bit-width>@<bit-offset>/<container-size>,\n" + "\t <type>\\[<array-size>\\]\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 69a3fe926e8c..76217bbef815 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -290,7 +290,8 @@ void perf_kprobe_destroy(struct perf_event *p_event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS -int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) +int perf_uprobe_init(struct perf_event *p_event, + unsigned long ref_ctr_offset, bool is_retprobe) { int ret; char *path = NULL; @@ -312,8 +313,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } - tp_event = create_local_trace_uprobe( - path, p_event->attr.probe_offset, is_retprobe); + tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, + ref_ctr_offset, is_retprobe); if (IS_ERR(tp_event)) { ret = PTR_ERR(tp_event); goto out; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d239004aaf29..eb908ef2ecec 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1063,8 +1063,10 @@ static int create_synth_event(int argc, char **argv) event = NULL; ret = -EEXIST; goto out; - } else if (delete_event) + } else if (delete_event) { + ret = -ENOENT; goto out; + } if (argc < 2) { ret = -EINVAL; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c30032367aab..fec67188c4d2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -14,6 +14,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -61,9 +62,23 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { - return !!strchr(trace_kprobe_symbol(tk), ':'); + char *p; + bool ret; + + if (!tk->symbol) + return false; + p = strchr(tk->symbol, ':'); + if (!p) + return true; + *p = '\0'; + mutex_lock(&module_mutex); + ret = !!find_module(tk->symbol); + mutex_unlock(&module_mutex); + *p = ':'; + + return ret; } static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) @@ -120,184 +135,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - - if (sc->addr) - sc->addr += sc->offset; - - return sc->addr; -} - -void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - - return sc; -} - -/* - * Kprobes-specific fetch functions - */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); - -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); - -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_unsafe(dst, addr, maxlen); - - if (ret < 0) { /* Failed to fetch string */ - dst[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); - } -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); - -/* Return the length of string -- including null terminal byte */ -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - mm_segment_t old_fs; - int ret, len = 0; - u8 c; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); - -#define DEFINE_FETCH_symbol(type) \ -void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); - -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - -/* kprobes don't support file_offset fetch methods */ -#define fetch_file_offset_u8 NULL -#define fetch_file_offset_u16 NULL -#define fetch_file_offset_u32 NULL -#define fetch_file_offset_u64 NULL -#define fetch_file_offset_string NULL -#define fetch_file_offset_string_size NULL - -/* Fetch type information table */ -static const struct fetch_type kprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; - /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -540,8 +377,11 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; } - for (i = 0; i < tk->tp.nr_args; i++) - traceprobe_update_arg(&tk->tp.args[i]); + for (i = 0; i < tk->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tk->tp.args[i]); + if (ret) + return ret; + } /* Set/clear disabled flag according to tp->flag */ if (trace_probe_is_enabled(&tk->tp)) @@ -554,19 +394,13 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) + if (ret == 0) { tk->tp.flags |= TP_FLAG_REGISTERED; - else { - if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); - ret = 0; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } + } else if (ret == -EILSEQ) { + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); + ret = -EINVAL; } - return ret; } @@ -629,6 +463,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register k*probe */ ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); + ret = 0; + } + if (ret < 0) unregister_kprobe_event(tk); else @@ -713,13 +552,15 @@ static int create_trace_kprobe(int argc, char **argv) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + unsigned int flags = TPARG_FL_KERNEL; /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = false; - else if (argv[0][0] == 'r') + else if (argv[0][0] == 'r') { is_return = true; - else if (argv[0][0] == '-') + flags |= TPARG_FL_RETURN; + } else if (argv[0][0] == '-') is_delete = true; else { pr_info("Probe definition must be started with 'p', 'r' or" @@ -749,10 +590,13 @@ static int create_trace_kprobe(int argc, char **argv) } if (event) { - if (strchr(event, '/')) { + char *slash; + + slash = strchr(event, '/'); + if (slash) { group = event; - event = strchr(group, '/') + 1; - event[-1] = '\0'; + event = slash + 1; + slash[0] = '\0'; if (strlen(group) == 0) { pr_info("Group name is not specified\n"); return -EINVAL; @@ -802,8 +646,9 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse either an address or a symbol.\n"); return ret; } - if (offset && is_return && - !kprobe_on_func_entry(NULL, symbol, offset)) { + if (kprobe_on_func_entry(NULL, symbol, offset)) + flags |= TPARG_FL_FENTRY; + if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } @@ -873,8 +718,7 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true, - kprobes_fetch_type_table); + flags); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -1028,6 +872,106 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Kprobe specific fetch functions */ + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + u8 *dst = get_loc_data(dest, base); + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_unsafe(dst, (void *)addr, maxlen); + + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); + return ret; +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + return probe_kernel_read(dest, src, size); +} + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + void *base) +{ + unsigned long val; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + /* Kprobe handler */ static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, @@ -1059,7 +1003,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry = ring_buffer_event_data(event); entry->ip = (unsigned long)tk->rp.kp.addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1108,7 +1052,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); event_trigger_unlock_commit_regs(trace_file, buffer, event, entry, irq_flags, pc, regs); @@ -1133,8 +1077,6 @@ print_kprobe_event(struct trace_iterator *iter, int flags, struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1146,11 +1088,9 @@ print_kprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); out: @@ -1164,8 +1104,6 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; - u8 *data; - int i; field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); @@ -1182,11 +1120,9 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, trace_seq_putc(s, ')'); - data = (u8 *)&field[1]; - for (i = 0; i < tp->nr_args; i++) - if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset, field)) - goto out; + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; trace_seq_putc(s, '\n'); @@ -1197,49 +1133,25 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, static int kprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } static int kretprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i; + int ret; struct kretprobe_trace_entry_head field; struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - /* Set argument names as fields */ - for (i = 0; i < tk->tp.nr_args; i++) { - struct probe_arg *parg = &tk->tp.args[i]; - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, - sizeof(field) + parg->offset, - parg->type->size, - parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, sizeof(field), &tk->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1286,7 +1198,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -1322,7 +1234,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -1457,7 +1369,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) init_trace_event_call(tk, call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); if (!ret) { @@ -1514,7 +1426,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk, &tk->tp.call); - if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index b0875b327f5c..c3fd849d4a8f 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -115,7 +115,7 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, * section, then we need to read the link list pointers. The trick is * we pass the address of the string to the seq function just like * we do for the kernel core formats. To get back the structure that - * holds the format, we simply use containerof() and then go to the + * holds the format, we simply use container_of() and then go to the * next format in the list. */ static const char ** diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e99c3ce7aa65..3ef15a6683c0 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -26,14 +26,12 @@ const char *reserved_field_names[] = { /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ -int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ - void *data, void *ent) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, void *data, void *ent)\ { \ - trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + trace_seq_printf(s, fmt, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ -const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") @@ -48,193 +46,52 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") +int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent) +{ + trace_seq_printf(s, "%pS", (void *)*(unsigned long *)data); + return !trace_seq_has_overflowed(s); +} +const char PRINT_TYPE_FMT_NAME(symbol)[] = "%pS"; + /* Print type function for string type */ -int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, - void *data, void *ent) +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) { int len = *(u32 *)data >> 16; if (!len) - trace_seq_printf(s, " %s=(fault)", name); + trace_seq_puts(s, "(fault)"); else - trace_seq_printf(s, " %s=\"%s\"", name, + trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); return !trace_seq_has_overflowed(s); } -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; -#define CHECK_FETCH_FUNCS(method, fn) \ - (((FETCH_FUNC_NAME(method, u8) == fn) || \ - (FETCH_FUNC_NAME(method, u16) == fn) || \ - (FETCH_FUNC_NAME(method, u32) == fn) || \ - (FETCH_FUNC_NAME(method, u64) == fn) || \ - (FETCH_FUNC_NAME(method, string) == fn) || \ - (FETCH_FUNC_NAME(method, string_size) == fn)) \ - && (fn != NULL)) - -/* Data fetch function templates */ -#define DEFINE_FETCH_reg(type) \ -void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_register(regs, \ - (unsigned int)((unsigned long)offset)); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); -DEFINE_BASIC_FETCH_FUNCS(reg) -/* No string on the register */ -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -#define DEFINE_FETCH_retval(type) \ -void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ - void *dummy, void *dest) \ -{ \ - *(type *)dest = (type)regs_return_value(regs); \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); -DEFINE_BASIC_FETCH_FUNCS(retval) -/* No string on the retval */ -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -/* Dereference memory access function */ -struct deref_fetch_param { - struct fetch_param orig; - long offset; - fetch_func_t fetch; - fetch_func_t fetch_size; +/* Fetch type information table */ +static const struct fetch_type probe_fetch_types[] = { + /* Special types */ + __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, + "__data_loc char[]"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), + ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0), + + ASSIGN_FETCH_TYPE_END }; -#define DEFINE_FETCH_deref(type) \ -void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct deref_fetch_param *dprm = data; \ - unsigned long addr; \ - call_fetch(&dprm->orig, regs, &addr); \ - if (addr) { \ - addr += dprm->offset; \ - dprm->fetch(regs, (void *)addr, dest); \ - } else \ - *(type *)dest = 0; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); -DEFINE_BASIC_FETCH_FUNCS(deref) -DEFINE_FETCH_deref(string) - -void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - struct deref_fetch_param *dprm = data; - unsigned long addr; - - call_fetch(&dprm->orig, regs, &addr); - if (addr && dprm->fetch_size) { - addr += dprm->offset; - dprm->fetch_size(regs, (void *)addr, dest); - } else - *(string_size *)dest = 0; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); - -static void update_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} -NOKPROBE_SYMBOL(update_deref_fetch_param); - -static void free_deref_fetch_param(struct deref_fetch_param *data) -{ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - kfree(data); -} -NOKPROBE_SYMBOL(free_deref_fetch_param); - -/* Bitfield fetch function */ -struct bitfield_fetch_param { - struct fetch_param orig; - unsigned char hi_shift; - unsigned char low_shift; -}; - -#define DEFINE_FETCH_bitfield(type) \ -void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ - void *data, void *dest) \ -{ \ - struct bitfield_fetch_param *bprm = data; \ - type buf = 0; \ - call_fetch(&bprm->orig, regs, &buf); \ - if (buf) { \ - buf <<= bprm->hi_shift; \ - buf >>= bprm->low_shift; \ - } \ - *(type *)dest = buf; \ -} \ -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); -DEFINE_BASIC_FETCH_FUNCS(bitfield) -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -static void -update_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - update_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - update_symbol_cache(data->orig.data); -} - -static void -free_bitfield_fetch_param(struct bitfield_fetch_param *data) -{ - /* - * Don't check the bitfield itself, because this must be the - * last fetch function. - */ - if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) - free_deref_fetch_param(data->orig.data); - else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) - free_symbol_cache(data->orig.data); - - kfree(data); -} - -void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, - void *data, void *dest) -{ - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - long ret; - - if (!maxlen) - return; - - ret = strlcpy(dst, current->comm, maxlen); - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); - -void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, - void *data, void *dest) -{ - *(u32 *)dest = strlen(current->comm) + 1; -} -NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); - -static const struct fetch_type *find_fetch_type(const char *type, - const struct fetch_type *ftbl) +static const struct fetch_type *find_fetch_type(const char *type) { int i; @@ -255,58 +112,27 @@ static const struct fetch_type *find_fetch_type(const char *type, switch (bs) { case 8: - return find_fetch_type("u8", ftbl); + return find_fetch_type("u8"); case 16: - return find_fetch_type("u16", ftbl); + return find_fetch_type("u16"); case 32: - return find_fetch_type("u32", ftbl); + return find_fetch_type("u32"); case 64: - return find_fetch_type("u64", ftbl); + return find_fetch_type("u64"); default: goto fail; } } - for (i = 0; ftbl[i].name; i++) { - if (strcmp(type, ftbl[i].name) == 0) - return &ftbl[i]; + for (i = 0; probe_fetch_types[i].name; i++) { + if (strcmp(type, probe_fetch_types[i].name) == 0) + return &probe_fetch_types[i]; } fail: return NULL; } -/* Special function : only accept unsigned long */ -static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = kernel_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_kernel_stack_address); - -static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) -{ - *(unsigned long *)dest = user_stack_pointer(regs); -} -NOKPROBE_SYMBOL(fetch_user_stack_address); - -static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn, - const struct fetch_type *ftbl) -{ - int i; - - if (type != &ftbl[FETCH_TYPE_STRING]) - return NULL; /* Only string type needs size function */ - - for (i = 0; i < FETCH_MTD_END; i++) - if (type->fetch[i] == orig_fn) - return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; - - WARN_ON(1); /* This should not happen */ - - return NULL; -} - /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -331,41 +157,44 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, - bool is_kprobe) + struct fetch_insn *code, unsigned int flags) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) - f->fn = t->fetch[FETCH_MTD_retval]; + if (flags & TPARG_FL_RETURN) + code->op = FETCH_OP_RETVAL; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) - return -EINVAL; - - if (is_kprobe) - f->fn = fetch_kernel_stack_address; - else - f->fn = fetch_user_stack_address; + code->op = FETCH_OP_STACKP; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); - if (ret || (is_kprobe && param > PARAM_MAX_STACK)) + if (ret || ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK)) ret = -EINVAL; else { - f->fn = t->fetch[FETCH_MTD_stack]; - f->data = (void *)param; + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; } } else ret = -EINVAL; } else if (strcmp(arg, "comm") == 0) { - if (strcmp(t->name, "string") != 0 && - strcmp(t->name, "string_size") != 0) + code->op = FETCH_OP_COMM; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + } else if (((flags & TPARG_FL_MASK) == + (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && + strncmp(arg, "arg", 3) == 0) { + if (!isdigit(arg[3])) + return -EINVAL; + ret = kstrtoul(arg + 3, 10, ¶m); + if (ret || !param || param > PARAM_MAX_STACK) return -EINVAL; - f->fn = t->fetch[FETCH_MTD_comm]; + code->op = FETCH_OP_ARG; + code->param = (unsigned int)param - 1; +#endif } else ret = -EINVAL; @@ -373,25 +202,27 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } /* Recursive argument parser */ -static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) +static int +parse_probe_arg(char *arg, const struct fetch_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + unsigned int flags) { + struct fetch_insn *code = *pcode; unsigned long param; - long offset; + long offset = 0; char *tmp; int ret = 0; switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); + ret = parse_probe_vars(arg + 1, type, code, flags); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->fetch[FETCH_MTD_reg]; - f->data = (void *)(unsigned long)ret; + code->op = FETCH_OP_REG; + code->param = (unsigned int)ret; ret = 0; } break; @@ -401,33 +232,42 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, ret = kstrtoul(arg + 1, 0, ¶m); if (ret) break; - - f->fn = t->fetch[FETCH_MTD_memory]; - f->data = (void *)param; + /* load address */ + code->op = FETCH_OP_IMM; + code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (is_kprobe) + if (flags & TPARG_FL_KERNEL) return -EINVAL; ret = kstrtol(arg + 2, 0, &offset); if (ret) break; - f->fn = t->fetch[FETCH_MTD_file_offset]; - f->data = (void *)offset; + code->op = FETCH_OP_FOFFS; + code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!is_kprobe) + if (!(flags & TPARG_FL_KERNEL)) return -EINVAL; - ret = traceprobe_split_symbol_offset(arg + 1, &offset); - if (ret) - break; + /* Preserve symbol for updating */ + code->op = FETCH_NOP_SYMBOL; + code->data = kstrdup(arg + 1, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + if (++code == end) + return -E2BIG; - f->data = alloc_symbol_cache(arg + 1, offset); - if (f->data) - f->fn = t->fetch[FETCH_MTD_symbol]; + code->op = FETCH_OP_IMM; + code->immediate = 0; } + /* These are fetching from memory */ + if (++code == end) + return -E2BIG; + *pcode = code; + code->op = FETCH_OP_DEREF; + code->offset = offset; break; case '+': /* deref memory */ @@ -435,11 +275,10 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, case '-': tmp = strchr(arg, '('); if (!tmp) - break; + return -EINVAL; *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) break; @@ -447,36 +286,27 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, tmp = strrchr(arg, ')'); if (tmp) { - struct deref_fetch_param *dprm; - const struct fetch_type *t2; + const struct fetch_type *t2 = find_fetch_type(NULL); - t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; - dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); - - if (!dprm) - return -ENOMEM; - - dprm->offset = offset; - dprm->fetch = t->fetch[FETCH_MTD_memory]; - dprm->fetch_size = get_fetch_size_function(t, - dprm->fetch, ftbl); - ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe, ftbl); + ret = parse_probe_arg(arg, t2, &code, end, flags); if (ret) - kfree(dprm); - else { - f->fn = t->fetch[FETCH_MTD_deref]; - f->data = (void *)dprm; - } + break; + if (code->op == FETCH_OP_COMM) + return -EINVAL; + if (++code == end) + return -E2BIG; + *pcode = code; + + code->op = FETCH_OP_DEREF; + code->offset = offset; } break; } - if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ - pr_info("%s type has no corresponding fetch method.\n", t->name); + if (!ret && code->op == FETCH_OP_NOP) { + /* Parsed, but do not find fetch method */ ret = -EINVAL; } - return ret; } @@ -485,22 +315,15 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, - struct fetch_param *f) + struct fetch_insn **pcode) { - struct bitfield_fetch_param *bprm; + struct fetch_insn *code = *pcode; unsigned long bw, bo; char *tail; if (*bf != 'b') return 0; - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - return -ENOMEM; - - bprm->orig = *f; - f->fn = t->fetch[FETCH_MTD_bitfield]; - f->data = (void *)bprm; bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ if (bw == 0 || *tail != '@') @@ -511,20 +334,26 @@ static int __parse_bitfield_probe_arg(const char *bf, if (tail == bf || *tail != '/') return -EINVAL; + code++; + if (code->op != FETCH_OP_NOP) + return -E2BIG; + *pcode = code; - bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); - bprm->low_shift = bprm->hi_shift + bo; + code->op = FETCH_OP_MOD_BF; + code->lshift = BYTES_TO_BITS(t->size) - (bw + bo); + code->rshift = BYTES_TO_BITS(t->size) - bw; + code->basesize = t->size; return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; } /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl) + struct probe_arg *parg, unsigned int flags) { - const char *t; - int ret; + struct fetch_insn *code, *scode, *tmp = NULL; + char *t, *t2; + int ret, len; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -535,36 +364,128 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, pr_info("Failed to allocate memory for command '%s'.\n", arg); return -ENOMEM; } - t = strchr(parg->comm, ':'); + t = strchr(arg, ':'); if (t) { - arg[t - parg->comm] = '\0'; - t++; + *t = '\0'; + t2 = strchr(++t, '['); + if (t2) { + *t2 = '\0'; + parg->count = simple_strtoul(t2 + 1, &t2, 0); + if (strcmp(t2, "]") || parg->count == 0) + return -EINVAL; + if (parg->count > MAX_ARRAY_LEN) + return -E2BIG; + } } /* * The default type of $comm should be "string", and it can't be * dereferenced. */ if (!t && strcmp(arg, "$comm") == 0) - t = "string"; - parg->type = find_fetch_type(t, ftbl); + parg->type = find_fetch_type("string"); + else + parg->type = find_fetch_type(t); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; } parg->offset = *size; - *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, - is_kprobe, ftbl); - - if (ret >= 0 && t != NULL) - ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); - - if (ret >= 0) { - parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn, - ftbl); - parg->fetch_size.data = parg->fetch.data; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) + return -ENOMEM; + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } + + code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + if (!code) + return -ENOMEM; + code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + + ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], + flags); + if (ret) + goto fail; + + /* Store operation */ + if (!strcmp(parg->type->name, "string")) { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && + code->op != FETCH_OP_COMM) { + pr_info("string only accepts memory or address.\n"); + ret = -EINVAL; + goto fail; + } + if (code->op != FETCH_OP_DEREF || parg->count) { + /* + * IMM and COMM is pointing actual address, those must + * be kept, and if parg->count != 0, this is an array + * of string pointers instead of string address itself. + */ + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + } + code->op = FETCH_OP_ST_STRING; /* In DEREF case, replace it */ + code->size = parg->type->size; + parg->dynamic = true; + } else if (code->op == FETCH_OP_DEREF) { + code->op = FETCH_OP_ST_MEM; + code->size = parg->type->size; + } else { + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_ST_RAW; + code->size = parg->type->size; + } + scode = code; + /* Modify operation */ + if (t != NULL) { + ret = __parse_bitfield_probe_arg(t, parg->type, &code); + if (ret) + goto fail; } + /* Loop(Array) operation */ + if (parg->count) { + if (scode->op != FETCH_OP_ST_MEM && + scode->op != FETCH_OP_ST_STRING) { + pr_info("array only accepts memory or address\n"); + ret = -EINVAL; + goto fail; + } + code++; + if (code->op != FETCH_OP_NOP) { + ret = -E2BIG; + goto fail; + } + code->op = FETCH_OP_LP_ARRAY; + code->param = parg->count; + } + code++; + code->op = FETCH_OP_END; + + /* Shrink down the code buffer */ + parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + if (!parg->code) + ret = -ENOMEM; + else + memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); + +fail: + if (ret) { + for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + } + kfree(tmp); return ret; } @@ -586,35 +507,63 @@ int traceprobe_conflict_field_name(const char *name, return 0; } -void traceprobe_update_arg(struct probe_arg *arg) -{ - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - update_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - update_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - update_symbol_cache(arg->fetch.data); -} - void traceprobe_free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) - free_bitfield_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) - free_deref_fetch_param(arg->fetch.data); - else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) - free_symbol_cache(arg->fetch.data); + struct fetch_insn *code = arg->code; + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) + kfree(code->data); + code++; + } + kfree(arg->code); kfree(arg->name); kfree(arg->comm); + kfree(arg->fmt); } +int traceprobe_update_arg(struct probe_arg *arg) +{ + struct fetch_insn *code = arg->code; + long offset; + char *tmp; + char c; + int ret = 0; + + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) { + if (code[1].op != FETCH_OP_IMM) + return -EINVAL; + + tmp = strpbrk("+-", code->data); + if (tmp) + c = *tmp; + ret = traceprobe_split_symbol_offset(code->data, + &offset); + if (ret) + return ret; + + code[1].immediate = + (unsigned long)kallsyms_lookup_name(code->data); + if (tmp) + *tmp = c; + if (!code[1].immediate) + return -ENOENT; + code[1].immediate += offset; + } + code++; + } + return 0; +} + +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, bool is_return) { - int i; + struct probe_arg *parg; + int i, j; int pos = 0; - const char *fmt, *arg; if (!is_return) { @@ -625,35 +574,51 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; } - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); + parg = tp->args + i; + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=", parg->name); + if (parg->count) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "{%s", + parg->type->fmt); + for (j = 1; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, ",%s", + parg->type->fmt); + pos += snprintf(buf + pos, LEN_OR_ZERO, "}"); + } else + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", + parg->type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) + parg = tp->args + i; + if (parg->count) { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s[%d])"; + else + fmt = ", REC->%s[%d]"; + for (j = 0; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, + fmt, parg->name, j); + } else { + if (strcmp(parg->type->name, "string") == 0) + fmt = ", __get_str(%s)"; + else + fmt = ", REC->%s"; pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + fmt, parg->name); + } } -#undef LEN_OR_ZERO - /* return the length of print_fmt */ return pos; } +#undef LEN_OR_ZERO -int set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) { int len; char *print_fmt; @@ -670,3 +635,28 @@ int set_print_fmt(struct trace_probe *tp, bool is_return) return 0; } + +int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp) +{ + int ret, i; + + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) { + struct probe_arg *parg = &tp->args[i]; + const char *fmt = parg->type->fmttype; + int size = parg->type->size; + + if (parg->fmt) + fmt = parg->fmt; + if (parg->count) + size *= parg->count; + ret = trace_define_field(event_call, fmt, parg->name, + offset + parg->offset, size, + parg->type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5f52668e165d..974afc1a3e73 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -23,6 +23,7 @@ #include <linux/stringify.h> #include <linux/limits.h> #include <linux/uaccess.h> +#include <linux/bitops.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -30,6 +31,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 +#define MAX_ARRAY_LEN 64 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ @@ -54,50 +56,74 @@ #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 +/* data_loc: data location, compatible with u32 */ +#define make_data_loc(len, offs) \ + (((u32)(len) << 16) | ((u32)(offs) & 0xffff)) +#define get_loc_len(dl) ((u32)(dl) >> 16) +#define get_loc_offs(dl) ((u32)(dl) & 0xffff) -/* data_rloc: data relative location, compatible with u32 */ -#define make_data_rloc(len, roffs) \ - (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) -#define get_rloc_len(dl) ((u32)(dl) >> 16) -#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) - -/* - * Convert data_rloc to data_loc: - * data_rloc stores the offset from data_rloc itself, but data_loc - * stores the offset from event entry. - */ -#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) - -static nokprobe_inline void *get_rloc_data(u32 *dl) +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) { - return (u8 *)dl + get_rloc_offs(*dl); + return (u8 *)ent + get_loc_offs(*dl); } -/* For data_loc conversion */ -static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline u32 update_data_loc(u32 loc, int consumed) { - return (u8 *)ent + get_rloc_offs(*dl); + u32 maxlen = get_loc_len(loc); + u32 offset = get_loc_offs(loc); + + return make_data_loc(maxlen - consumed, offset + consumed); } -/* Data fetch function type */ -typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); - -/* Fetch types */ -enum { - FETCH_MTD_reg = 0, - FETCH_MTD_stack, - FETCH_MTD_retval, - FETCH_MTD_comm, - FETCH_MTD_memory, - FETCH_MTD_symbol, - FETCH_MTD_deref, - FETCH_MTD_bitfield, - FETCH_MTD_file_offset, - FETCH_MTD_END, +typedef int (*print_type_func_t)(struct trace_seq *, void *, void *); + +enum fetch_op { + FETCH_OP_NOP = 0, + // Stage 1 (load) ops + FETCH_OP_REG, /* Register : .param = offset */ + FETCH_OP_STACK, /* Stack : .param = index */ + FETCH_OP_STACKP, /* Stack pointer */ + FETCH_OP_RETVAL, /* Return value */ + FETCH_OP_IMM, /* Immediate : .immediate */ + FETCH_OP_COMM, /* Current comm */ + FETCH_OP_ARG, /* Function argument : .param */ + FETCH_OP_FOFFS, /* File offset: .immediate */ + // Stage 2 (dereference) op + FETCH_OP_DEREF, /* Dereference: .offset */ + // Stage 3 (store) ops + FETCH_OP_ST_RAW, /* Raw: .size */ + FETCH_OP_ST_MEM, /* Mem: .offset, .size */ + FETCH_OP_ST_STRING, /* String: .offset, .size */ + // Stage 4 (modify) op + FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ + // Stage 5 (loop) op + FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_END, + FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; +struct fetch_insn { + enum fetch_op op; + union { + unsigned int param; + struct { + unsigned int size; + int offset; + }; + struct { + unsigned char basesize; + unsigned char lshift; + unsigned char rshift; + }; + unsigned long immediate; + void *data; + }; +}; + +/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ +#define FETCH_INSN_MAX 16 + /* Fetch type information table */ struct fetch_type { const char *name; /* Name of type */ @@ -106,13 +132,6 @@ struct fetch_type { print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ const char *fmttype; /* Name in format file */ - /* Fetch functions */ - fetch_func_t fetch[FETCH_MTD_END]; -}; - -struct fetch_param { - fetch_func_t fn; - void *data; }; /* For defining macros, define string/string_size types */ @@ -124,8 +143,7 @@ typedef u32 string_size; /* Printing in basic type function template */ #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ - void *data, void *ent); \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, void *data, void *ent);\ extern const char PRINT_TYPE_FMT_NAME(type)[] DECLARE_BASIC_PRINT_TYPE_FUNC(u8); @@ -142,57 +160,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x32); DECLARE_BASIC_PRINT_TYPE_FUNC(x64); DECLARE_BASIC_PRINT_TYPE_FUNC(string); - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type - -/* Declare macro for basic types */ -#define DECLARE_FETCH_FUNC(method, type) \ -extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ - void *data, void *dest) - -#define DECLARE_BASIC_FETCH_FUNCS(method) \ -DECLARE_FETCH_FUNC(method, u8); \ -DECLARE_FETCH_FUNC(method, u16); \ -DECLARE_FETCH_FUNC(method, u32); \ -DECLARE_FETCH_FUNC(method, u64) - -DECLARE_BASIC_FETCH_FUNCS(reg); -#define fetch_reg_string NULL -#define fetch_reg_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(retval); -#define fetch_retval_string NULL -#define fetch_retval_string_size NULL - -DECLARE_BASIC_FETCH_FUNCS(symbol); -DECLARE_FETCH_FUNC(symbol, string); -DECLARE_FETCH_FUNC(symbol, string_size); - -DECLARE_BASIC_FETCH_FUNCS(deref); -DECLARE_FETCH_FUNC(deref, string); -DECLARE_FETCH_FUNC(deref, string_size); - -DECLARE_BASIC_FETCH_FUNCS(bitfield); -#define fetch_bitfield_string NULL -#define fetch_bitfield_string_size NULL - -/* comm only makes sense as a string */ -#define fetch_comm_u8 NULL -#define fetch_comm_u16 NULL -#define fetch_comm_u32 NULL -#define fetch_comm_u64 NULL -DECLARE_FETCH_FUNC(comm, string); -DECLARE_FETCH_FUNC(comm, string_size); - -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) +DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) x##t @@ -200,8 +168,9 @@ DEFINE_FETCH_##method(u64) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) +#define __ADDR_FETCH_TYPE(t) u##t +#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t) +#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG) #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ {.name = _name, \ @@ -210,64 +179,23 @@ DEFINE_FETCH_##method(u64) .print = PRINT_TYPE_FUNC_NAME(ptype), \ .fmt = PRINT_TYPE_FMT_NAME(ptype), \ .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(comm, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ -ASSIGN_FETCH_FUNC(file_offset, ftype), \ - } \ } - +#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, #_fmttype) #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype) /* If ptype is an alias of atype, use this macro (show atype in format) */ #define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype) #define ASSIGN_FETCH_TYPE_END {} - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 +#define MAX_ARRAY_LEN 64 #ifdef CONFIG_KPROBE_EVENTS -struct symbol_cache; -unsigned long update_symbol_cache(struct symbol_cache *sc); -void free_symbol_cache(struct symbol_cache *sc); -struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); bool trace_kprobe_on_func_entry(struct trace_event_call *call); bool trace_kprobe_error_injectable(struct trace_event_call *call); #else -/* uprobes do not support symbol fetch methods */ -#define fetch_symbol_u8 NULL -#define fetch_symbol_u16 NULL -#define fetch_symbol_u32 NULL -#define fetch_symbol_u64 NULL -#define fetch_symbol_string NULL -#define fetch_symbol_string_size NULL - -struct symbol_cache { -}; -static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) -{ - return 0; -} - -static inline void __used free_symbol_cache(struct symbol_cache *sc) -{ -} - -static inline struct symbol_cache * __used -alloc_symbol_cache(const char *sym, long offset) -{ - return NULL; -} - static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { return false; @@ -280,11 +208,13 @@ static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { - struct fetch_param fetch; - struct fetch_param fetch_size; + struct fetch_insn *code; + bool dynamic;/* Dynamic array (string) is used */ unsigned int offset; /* Offset from argument entry */ + unsigned int count; /* Array count */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ + char *fmt; /* Format string if needed */ const struct fetch_type *type; /* Type of this argument */ }; @@ -313,12 +243,6 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -static nokprobe_inline void call_fetch(struct fetch_param *fprm, - struct pt_regs *regs, void *dest) -{ - return fprm->fn(regs, fprm->data, dest); -} - /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { @@ -343,67 +267,23 @@ find_event_file_link(struct trace_probe *tp, struct trace_event_file *file) return NULL; } +#define TPARG_FL_RETURN BIT(0) +#define TPARG_FL_KERNEL BIT(1) +#define TPARG_FL_FENTRY BIT(2) +#define TPARG_FL_MASK GENMASK(2, 0) + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe, - const struct fetch_type *ftbl); + struct probe_arg *parg, unsigned int flags); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); -extern void traceprobe_update_arg(struct probe_arg *arg); +extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -/* Sum up total data length for dynamic arraies (strings) */ -static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static nokprobe_inline void -store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - -extern int set_print_fmt(struct trace_probe *tp, bool is_return); +extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * @@ -412,6 +292,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); extern struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return); extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif +extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp); diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h new file mode 100644 index 000000000000..5c56afc17cf8 --- /dev/null +++ b/kernel/trace/trace_probe_tmpl.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Traceprobe fetch helper inlines + */ + +static nokprobe_inline void +fetch_store_raw(unsigned long val, struct fetch_insn *code, void *buf) +{ + switch (code->size) { + case 1: + *(u8 *)buf = (u8)val; + break; + case 2: + *(u16 *)buf = (u16)val; + break; + case 4: + *(u32 *)buf = (u32)val; + break; + case 8: + //TBD: 32bit signed + *(u64 *)buf = (u64)val; + break; + default: + *(unsigned long *)buf = val; + } +} + +static nokprobe_inline void +fetch_apply_bitfield(struct fetch_insn *code, void *buf) +{ + switch (code->basesize) { + case 1: + *(u8 *)buf <<= code->lshift; + *(u8 *)buf >>= code->rshift; + break; + case 2: + *(u16 *)buf <<= code->lshift; + *(u16 *)buf >>= code->rshift; + break; + case 4: + *(u32 *)buf <<= code->lshift; + *(u32 *)buf >>= code->rshift; + break; + case 8: + *(u64 *)buf <<= code->lshift; + *(u64 *)buf >>= code->rshift; + break; + } +} + +/* + * These functions must be defined for each callsite. + * Return consumed dynamic data size (>= 0), or error (< 0). + * If dest is NULL, don't store result and return required dynamic data size. + */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, + void *dest, void *base); +static nokprobe_inline int fetch_store_strlen(unsigned long addr); +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base); +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size); + +/* From the 2nd stage, routine is same */ +static nokprobe_inline int +process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, + void *dest, void *base) +{ + struct fetch_insn *s3 = NULL; + int total = 0, ret = 0, i = 0; + u32 loc = 0; + unsigned long lval = val; + +stage2: + /* 2nd stage: dereference memory if needed */ + while (code->op == FETCH_OP_DEREF) { + lval = val; + ret = probe_mem_read(&val, (void *)val + code->offset, + sizeof(val)); + if (ret) + return ret; + code++; + } + + s3 = code; +stage3: + /* 3rd stage: store value to buffer */ + if (unlikely(!dest)) { + if (code->op == FETCH_OP_ST_STRING) { + ret += fetch_store_strlen(val + code->offset); + code++; + goto array; + } else + return -EILSEQ; + } + + switch (code->op) { + case FETCH_OP_ST_RAW: + fetch_store_raw(val, code, dest); + break; + case FETCH_OP_ST_MEM: + probe_mem_read(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_STRING: + loc = *(u32 *)dest; + ret = fetch_store_string(val + code->offset, dest, base); + break; + default: + return -EILSEQ; + } + code++; + + /* 4th stage: modify stored value if needed */ + if (code->op == FETCH_OP_MOD_BF) { + fetch_apply_bitfield(code, dest); + code++; + } + +array: + /* the last stage: Loop on array */ + if (code->op == FETCH_OP_LP_ARRAY) { + total += ret; + if (++i < code->param) { + code = s3; + if (s3->op != FETCH_OP_ST_STRING) { + dest += s3->size; + val += s3->size; + goto stage3; + } + code--; + val = lval + sizeof(char *); + if (dest) { + dest += sizeof(u32); + *(u32 *)dest = update_data_loc(loc, ret); + } + goto stage2; + } + code++; + ret = total; + } + + return code->op == FETCH_OP_END ? ret : -EILSEQ; +} + +/* Sum up total data length for dynamic arraies (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + len = process_fetch_insn(arg->code, regs, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, + int header_size, int maxlen) +{ + struct probe_arg *arg; + void *base = data - header_size; + void *dyndata = data + tp->size; + u32 *dl; /* Data location */ + int ret, i; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + dl = data + arg->offset; + /* Point the dynamic data area if needed */ + if (unlikely(arg->dynamic)) + *dl = make_data_loc(maxlen, dyndata - base); + ret = process_fetch_insn(arg->code, regs, dl, base); + if (unlikely(ret < 0 && arg->dynamic)) + *dl = make_data_loc(0, dyndata - base); + else + dyndata += ret; + } +} + +static inline int +print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field) +{ + void *p; + int i, j; + + for (i = 0; i < nr_args; i++) { + struct probe_arg *a = args + i; + + trace_seq_printf(s, " %s=", a->name); + if (likely(!a->count)) { + if (!a->type->print(s, data + a->offset, field)) + return -ENOMEM; + continue; + } + trace_seq_putc(s, '{'); + p = data + a->offset; + for (j = 0; j < a->count; j++) { + if (!a->type->print(s, p, field)) + return -ENOMEM; + trace_seq_putc(s, j == a->count - 1 ? '}' : ','); + p += a->type->size; + } + } + return 0; +} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4237eba4ef20..2b0d1ee3241c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,7 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 3; + stack_trace_max.skip = 0; save_stack_trace(&stack_trace_max); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e696667da29a..31ea48eceda1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -15,6 +15,7 @@ #include <linux/rculist.h> #include "trace_probe.h" +#include "trace_probe_tmpl.h" #define UPROBE_EVENT_SYSTEM "uprobes" @@ -47,6 +48,7 @@ struct trace_uprobe { struct inode *inode; char *filename; unsigned long offset; + unsigned long ref_ctr_offset; unsigned long nhit; struct trace_probe tp; }; @@ -98,74 +100,52 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) /* * Uprobes-specific fetch functions */ -#define DEFINE_FETCH_stack(type) \ -static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)get_user_stack_nth(regs, \ - ((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - -#define DEFINE_FETCH_memory(type) \ -static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ - void *addr, void *dest) \ -{ \ - type retval; \ - void __user *vaddr = (void __force __user *) addr; \ - \ - if (copy_from_user(&retval, vaddr, sizeof(type))) \ - *(type *)dest = 0; \ - else \ - *(type *) dest = retval; \ +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + void __user *vaddr = (void __force __user *)src; + + return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; } -DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) { long ret; - u32 rloc = *(u32 *)dest; - int maxlen = get_rloc_len(rloc); - u8 *dst = get_rloc_data(dest); + u32 loc = *(u32 *)dest; + int maxlen = get_loc_len(loc); + u8 *dst = get_loc_data(dest, base); void __user *src = (void __force __user *) addr; - if (!maxlen) - return; + if (unlikely(!maxlen)) + return -ENOMEM; ret = strncpy_from_user(dst, src, maxlen); - if (ret == maxlen) - dst[--ret] = '\0'; - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); - } else { - *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + if (ret >= 0) { + if (ret == maxlen) + dst[ret - 1] = '\0'; + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); } + + return ret; } -static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) { int len; void __user *vaddr = (void __force __user *) addr; len = strnlen_user(vaddr, MAX_STRING_SIZE); - if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; + return (len > MAX_STRING_SIZE) ? 0 : len; } -static unsigned long translate_user_vaddr(void *file_offset) +static unsigned long translate_user_vaddr(unsigned long file_offset) { unsigned long base_addr; struct uprobe_dispatch_data *udd; @@ -173,44 +153,44 @@ static unsigned long translate_user_vaddr(void *file_offset) udd = (void *) current->utask->vaddr; base_addr = udd->bp_addr - udd->tu->offset; - return base_addr + (unsigned long)file_offset; + return base_addr + file_offset; } -#define DEFINE_FETCH_file_offset(type) \ -static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ - void *offset, void *dest)\ -{ \ - void *vaddr = (void *)translate_user_vaddr(offset); \ - \ - FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, + void *base) +{ + unsigned long val; + + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = get_user_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = user_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_FOFFS: + val = translate_user_vaddr(code->immediate); + break; + default: + return -EILSEQ; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); } -DEFINE_BASIC_FETCH_FUNCS(file_offset) -DEFINE_FETCH_file_offset(string) -DEFINE_FETCH_file_offset(string_size) - -/* Fetch type information table */ -static const struct fetch_type uprobes_fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), - ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), - ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), - ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), - ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), - - ASSIGN_FETCH_TYPE_END -}; +NOKPROBE_SYMBOL(process_fetch_insn) static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { @@ -311,6 +291,35 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) return 0; } +/* + * Uprobe with multiple reference counter is not allowed. i.e. + * If inode and offset matches, reference counter offset *must* + * match as well. Though, there is one exception: If user is + * replacing old trace_uprobe with new one(same group/event), + * then we allow same uprobe with new reference counter as far + * as the new one does not conflict with any other existing + * ones. + */ +static struct trace_uprobe *find_old_trace_uprobe(struct trace_uprobe *new) +{ + struct trace_uprobe *tmp, *old = NULL; + struct inode *new_inode = d_real_inode(new->path.dentry); + + old = find_probe_event(trace_event_name(&new->tp.call), + new->tp.call.class->system); + + list_for_each_entry(tmp, &uprobe_list, list) { + if ((old ? old != tmp : true) && + new_inode == d_real_inode(tmp->path.dentry) && + new->offset == tmp->offset && + new->ref_ctr_offset != tmp->ref_ctr_offset) { + pr_warn("Reference counter offset mismatch."); + return ERR_PTR(-EINVAL); + } + } + return old; +} + /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { @@ -320,8 +329,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(trace_event_name(&tu->tp.call), - tu->tp.call.class->system); + old_tu = find_old_trace_uprobe(tu); + if (IS_ERR(old_tu)) { + ret = PTR_ERR(old_tu); + goto end; + } + if (old_tu) { /* delete old event */ ret = unregister_trace_uprobe(old_tu); @@ -352,10 +365,10 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - char *arg, *event, *group, *filename; + char *arg, *event, *group, *filename, *rctr, *rctr_end; char buf[MAX_EVENT_NAME_LEN]; struct path path; - unsigned long offset; + unsigned long offset, ref_ctr_offset; bool is_delete, is_return; int i, ret; @@ -364,6 +377,7 @@ static int create_trace_uprobe(int argc, char **argv) is_return = false; event = NULL; group = NULL; + ref_ctr_offset = 0; /* argc must be >= 1 */ if (argv[0][0] == '-') @@ -438,6 +452,26 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } + /* Parse reference counter offset if specified. */ + rctr = strchr(arg, '('); + if (rctr) { + rctr_end = strchr(rctr, ')'); + if (rctr > rctr_end || *(rctr_end + 1) != 0) { + ret = -EINVAL; + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + + *rctr++ = '\0'; + *rctr_end = '\0'; + ret = kstrtoul(rctr, 0, &ref_ctr_offset); + if (ret) { + pr_info("Invalid reference counter offset.\n"); + goto fail_address_parse; + } + } + + /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); if (ret) goto fail_address_parse; @@ -472,6 +506,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; + tu->ref_ctr_offset = ref_ctr_offset; tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); @@ -522,8 +557,7 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false, - uprobes_fetch_type_table); + is_return ? TPARG_FL_RETURN : 0); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -590,6 +624,9 @@ static int probes_seq_show(struct seq_file *m, void *v) trace_event_name(&tu->tp.call), tu->filename, (int)(sizeof(void *) * 2), tu->offset); + if (tu->ref_ctr_offset) + seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); + for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -833,7 +870,6 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; - int i; entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, tp.call.event); @@ -850,12 +886,8 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - if (!parg->type->print(s, parg->name, data + parg->offset, entry)) - goto out; - } + if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) + goto out; trace_seq_putc(s, '\n'); @@ -905,7 +937,13 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, tu->consumer.filter = filter; tu->inode = d_real_inode(tu->path.dentry); - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (tu->ref_ctr_offset) { + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + } else { + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + } + if (ret) goto err_buffer; @@ -958,7 +996,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) static int uprobe_event_define_fields(struct trace_event_call *event_call) { - int ret, i, size; + int ret, size; struct uprobe_trace_entry_head field; struct trace_uprobe *tu = event_call->data; @@ -970,19 +1008,8 @@ static int uprobe_event_define_fields(struct trace_event_call *event_call) DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); size = SIZEOF_TRACE_ENTRY(false); } - /* Set argument names as fields */ - for (i = 0; i < tu->tp.nr_args; i++) { - struct probe_arg *parg = &tu->tp.args[i]; - - ret = trace_define_field(event_call, parg->type->fmttype, - parg->name, size + parg->offset, - parg->type->size, parg->type->is_signed, - FILTER_OTHER); - if (ret) - return ret; - } - return 0; + return traceprobe_define_arg_fields(event_call, size, &tu->tp); } #ifdef CONFIG_PERF_EVENTS @@ -1233,7 +1260,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) ret |= uprobe_trace_func(tu, regs, ucb, dsize); @@ -1268,7 +1295,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); if (tu->tp.flags & TP_FLAG_TRACE) uretprobe_trace_func(tu, func, regs, ucb, dsize); @@ -1304,7 +1331,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) init_trace_event_call(tu, call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; ret = register_trace_event(&call->event); @@ -1340,7 +1367,8 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) #ifdef CONFIG_PERF_EVENTS struct trace_event_call * -create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return) { struct trace_uprobe *tu; struct path path; @@ -1372,10 +1400,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) tu->offset = offs; tu->path = path; + tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); - if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { ret = -ENOMEM; goto error; } |