From edf5c17d866eada03b8750368a12dc3def77d608 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Apr 2018 16:15:55 +0200 Subject: staging: irda: remove remaining remants of irda code removal There were some documentation locations that irda was mentioned, as well as an old MAINTAINERS entry and the networking sysctl entries. Clean these all out as this stuff really is finally gone. Reported-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl_binary.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8c0dab4fd65..07148b497451 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -704,24 +704,6 @@ static const struct bin_table bin_net_netfilter_table[] = { {} }; -static const struct bin_table bin_net_irda_table[] = { - { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, - { CTL_STR, NET_IRDA_DEVNAME, "devname" }, - { CTL_INT, NET_IRDA_DEBUG, "debug" }, - { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, - { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - static const struct bin_table bin_net_table[] = { { CTL_DIR, NET_CORE, "core", bin_net_core_table }, /* NET_ETHER not used */ @@ -743,7 +725,7 @@ static const struct bin_table bin_net_table[] = { { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, /* NET_DCCP "dccp" no longer used */ - { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, + /* NET_IRDA "irda" no longer used */ { CTL_INT, 2089, "nf_conntrack_max" }, {} }; -- cgit v1.2.3-59-g8ed1b From be71eda5383faa663efdba9ef54a6b8255e3c7f0 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 18 Apr 2018 09:14:36 +0200 Subject: module: Fix display of wrong module .text address Reading file /proc/modules shows the correct address: [root@s35lp76 ~]# cat /proc/modules | egrep '^qeth_l2' qeth_l2 94208 1 - Live 0x000003ff80401000 and reading file /sys/module/qeth_l2/sections/.text [root@s35lp76 ~]# cat /sys/module/qeth_l2/sections/.text 0x0000000018ea8363 displays a random address. This breaks the perf tool which uses this address on s390 to calculate start of .text section in memory. Fix this by printing the correct (unhashed) address. Thanks to Jessica Yu for helping on this. Fixes: ef0010a30935 ("vsprintf: don't use 'restricted_pointer()' when not restricting") Cc: # v4.15+ Suggested-by: Linus Torvalds Signed-off-by: Thomas Richter Cc: Jessica Yu Signed-off-by: Jessica Yu --- kernel/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a6e43a5806a1..ce8066b88178 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1472,7 +1472,8 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%pK\n", (void *)sattr->address); + return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? + (void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) -- cgit v1.2.3-59-g8ed1b From ba16293dad626d3e3827cfe0a1a743c1d93e76b7 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Fri, 20 Apr 2018 20:37:58 +0530 Subject: tracing: Fix kernel crash while using empty filter with perf Kernel is crashing when user tries to record 'ftrace:function' event with empty filter: # perf record -e ftrace:function --filter="" ls # dmesg BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 Oops: 0000 [#1] SMP PTI ... RIP: 0010:ftrace_profile_set_filter+0x14b/0x2d0 RSP: 0018:ffffa4a7c0da7d20 EFLAGS: 00010246 RAX: ffffa4a7c0da7d64 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000000092 RDI: ffff8c48ffc968f0 ... Call Trace: _perf_ioctl+0x54a/0x6b0 ? rcu_all_qs+0x5/0x30 ... After patch: # perf record -e ftrace:function --filter="" ls failed to set filter "" on event ftrace:function with 22 (Invalid argument) Also, if user tries to echo "" > filter, it used to throw an error. This behavior got changed by commit 80765597bc58 ("tracing: Rewrite filter logic to be simpler and faster"). This patch restores the behavior as a side effect: Before patch: # echo "" > filter # After patch: # echo "" > filter bash: echo: write error: Invalid argument # Link: http://lkml.kernel.org/r/20180420150758.19787-1-ravi.bangoria@linux.ibm.com Fixes: 80765597bc58 ("tracing: Rewrite filter logic to be simpler and faster") Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9b4716bb8bb0..1f951b3df60c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1499,14 +1499,14 @@ static int process_preds(struct trace_event_call *call, return ret; } - if (!nr_preds) { - prog = NULL; - } else { - prog = predicate_parse(filter_string, nr_parens, nr_preds, + if (!nr_preds) + return -EINVAL; + + prog = predicate_parse(filter_string, nr_parens, nr_preds, parse_pred, call, pe); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (IS_ERR(prog)) + return PTR_ERR(prog); + rcu_assign_pointer(filter->prog, prog); return 0; } -- cgit v1.2.3-59-g8ed1b From bcbd385b61bbdef3491d662203ac2e8186e5be59 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 19 Apr 2018 12:55:56 +0200 Subject: kprobes: Fix random address output of blacklist file File /sys/kernel/debug/kprobes/blacklist displays random addresses: [root@s8360046 linux]# cat /sys/kernel/debug/kprobes/blacklist 0x0000000047149a90-0x00000000bfcb099a print_type_x8 .... This breaks 'perf probe' which uses the blacklist file to prohibit probes on certain functions by checking the address range. Fix this by printing the correct (unhashed) address. The file mode is read all but this is not an issue as the file hierarchy points out: # ls -ld /sys/ /sys/kernel/ /sys/kernel/debug/ /sys/kernel/debug/kprobes/ /sys/kernel/debug/kprobes/blacklist dr-xr-xr-x 12 root root 0 Apr 19 07:56 /sys/ drwxr-xr-x 8 root root 0 Apr 19 07:56 /sys/kernel/ drwx------ 16 root root 0 Apr 19 06:56 /sys/kernel/debug/ drwxr-xr-x 2 root root 0 Apr 19 06:56 /sys/kernel/debug/kprobes/ -r--r--r-- 1 root root 0 Apr 19 06:56 /sys/kernel/debug/kprobes/blacklist Everything in and below /sys/kernel/debug is rwx to root only, no group or others have access. Background: Directory /sys/kernel/debug/kprobes is created by debugfs_create_dir() which sets the mode bits to rwxr-xr-x. Maybe change that to use the parent's directory mode bits instead? Link: http://lkml.kernel.org/r/20180419105556.86664-1-tmricht@linux.ibm.com Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: stable@vger.kernel.org Cc: # v4.15+ Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S Miller Cc: Masami Hiramatsu Cc: acme@kernel.org Signed-off-by: Thomas Richter Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 102160ff5c66..ea619021d901 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2428,7 +2428,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); - seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr, + seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, (void *)ent->end_addr, (void *)ent->start_addr); return 0; } -- cgit v1.2.3-59-g8ed1b From 9a0fd675304d410f3a9586e1b333e16f4658d56c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 15 Mar 2018 14:06:39 +0800 Subject: tracing: Fix missing tab for hwlat_detector print format It's been missing for a while but no one is touching that up. Fix it. Link: http://lkml.kernel.org/r/20180315060639.9578-1-peterx@redhat.com CC: Ingo Molnar Cc:stable@vger.kernel.org Fixes: 7b2c86250122d ("tracing: Add NMI tracing in hwlat detector") Signed-off-by: Peter Xu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_entries.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e954ae3d82c0..e3a658bac10f 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -356,7 +356,7 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __field( unsigned int, seqnum ) ), - F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, -- cgit v1.2.3-59-g8ed1b From 1f71addd34f4c442bec7d7c749acc1beb58126f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Apr 2018 21:22:18 +0200 Subject: tick/sched: Do not mess with an enqueued hrtimer Kaike reported that in tests rdma hrtimers occasionaly stopped working. He did great debugging, which provided enough context to decode the problem. CPU 3 CPU 2 idle start sched_timer expires = 712171000000 queue->next = sched_timer start rdmavt timer. expires = 712172915662 lock(baseof(CPU3)) tick_nohz_stop_tick() tick = 716767000000 timerqueue_add(tmr) hrtimer_set_expires(sched_timer, tick); sched_timer->expires = 716767000000 <---- FAIL if (tmr->expires < queue->next->expires) hrtimer_start(sched_timer) queue->next = tmr; lock(baseof(CPU3)) unlock(baseof(CPU3)) timerqueue_remove() timerqueue_add() ts->sched_timer is queued and queue->next is pointing to it, but then ts->sched_timer.expires is modified. This not only corrupts the ordering of the timerqueue RB tree, it also makes CPU2 see the new expiry time of timerqueue->next->expires when checking whether timerqueue->next needs to be updated. So CPU2 sees that the rdma timer is earlier than timerqueue->next and sets the rdma timer as new next. Depending on whether it had also seen the new time at RB tree enqueue, it might have queued the rdma timer at the wrong place and then after removing the sched_timer the RB tree is completely hosed. The problem was introduced with a commit which tried to solve inconsistency between the hrtimer in the tick_sched data and the underlying hardware clockevent. It split out hrtimer_set_expires() to store the new tick time in both the NOHZ and the NOHZ + HIGHRES case, but missed the fact that in the NOHZ + HIGHRES case the hrtimer might still be queued. Use hrtimer_start(timer, tick...) for the NOHZ + HIGHRES case which sets timer->expires after canceling the timer and move the hrtimer_set_expires() invocation into the NOHZ only code path which is not affected as it merily uses the hrtimer as next event storage so code pathes can be shared with the NOHZ + HIGHRES case. Fixes: d4af6d933ccf ("nohz: Fix spurious warning when hrtimer and clockevent get out of sync") Reported-by: "Wan Kaike" Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Cc: "Marciniszyn Mike" Cc: Anna-Maria Gleixner Cc: linux-rdma@vger.kernel.org Cc: "Dalessandro Dennis" Cc: "Fleck John" Cc: stable@vger.kernel.org Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: "Weiny Ira" Cc: "linux-rdma@vger.kernel.org" Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804241637390.1679@nanos.tec.linutronix.de Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804242119210.1597@nanos.tec.linutronix.de --- kernel/time/tick-sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 646645e981f9..d31bec177fa5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -804,12 +804,12 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) return; } - hrtimer_set_expires(&ts->sched_timer, tick); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - else + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + } else { + hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); + } } static void tick_nohz_retain_tick(struct tick_sched *ts) -- cgit v1.2.3-59-g8ed1b From a3ed0e4393d6885b4af7ce84b437dc696490a530 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Apr 2018 15:33:38 +0200 Subject: Revert: Unify CLOCK_MONOTONIC and CLOCK_BOOTTIME Revert commits 92af4dcb4e1c ("tracing: Unify the "boot" and "mono" tracing clocks") 127bfa5f4342 ("hrtimer: Unify MONOTONIC and BOOTTIME clock behavior") 7250a4047aa6 ("posix-timers: Unify MONOTONIC and BOOTTIME clock behavior") d6c7270e913d ("timekeeping: Remove boot time specific code") f2d6fdbfd238 ("Input: Evdev - unify MONOTONIC and BOOTTIME clock behavior") d6ed449afdb3 ("timekeeping: Make the MONOTONIC clock behave like the BOOTTIME clock") 72199320d49d ("timekeeping: Add the new CLOCK_MONOTONIC_ACTIVE clock") As stated in the pull request for the unification of CLOCK_MONOTONIC and CLOCK_BOOTTIME, it was clear that we might have to revert the change. As reported by several folks systemd and other applications rely on the documented behaviour of CLOCK_MONOTONIC on Linux and break with the above changes. After resume daemons time out and other timeout related issues are observed. Rafael compiled this list: * systemd kills daemons on resume, after >WatchdogSec seconds of suspending (Genki Sky). [Verified that that's because systemd uses CLOCK_MONOTONIC and expects it to not include the suspend time.] * systemd-journald misbehaves after resume: systemd-journald[7266]: File /var/log/journal/016627c3c4784cd4812d4b7e96a34226/system.journal corrupted or uncleanly shut down, renaming and replacing. (Mike Galbraith). * NetworkManager reports "networking disabled" and networking is broken after resume 50% of the time (Pavel). [May be because of systemd.] * MATE desktop dims the display and starts the screensaver right after system resume (Pavel). * Full system hang during resume (me). [May be due to systemd or NM or both.] That happens on debian and open suse systems. It's sad, that these problems were neither catched in -next nor by those folks who expressed interest in this change. Reported-by: Rafael J. Wysocki Reported-by: Genki Sky , Reported-by: Pavel Machek Signed-off-by: Thomas Gleixner Cc: Dmitry Torokhov Cc: John Stultz Cc: Jonathan Corbet Cc: Kevin Easton Cc: Linus Torvalds Cc: Mark Salyzyn Cc: Michael Kerrisk Cc: Peter Zijlstra Cc: Petr Mladek Cc: Prarit Bhargava Cc: Sergey Senozhatsky Cc: Steven Rostedt --- Documentation/trace/ftrace.rst | 14 +++++-- drivers/input/evdev.c | 7 +++- include/linux/hrtimer.h | 2 + include/linux/timekeeper_internal.h | 2 - include/linux/timekeeping.h | 37 ++++++++++++------ include/uapi/linux/time.h | 1 - kernel/time/hrtimer.c | 16 +++++++- kernel/time/posix-stubs.c | 2 - kernel/time/posix-timers.c | 26 ++++++++----- kernel/time/tick-common.c | 15 ------- kernel/time/tick-internal.h | 6 --- kernel/time/tick-sched.c | 9 ----- kernel/time/timekeeping.c | 78 ++++++++++++++++++------------------- kernel/time/timekeeping.h | 1 + kernel/trace/trace.c | 2 +- 15 files changed, 114 insertions(+), 104 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index e45f0786f3f9..67d9c38e95eb 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -461,9 +461,17 @@ of ftrace. Here is a list of some of the key files: and ticks at the same rate as the hardware clocksource. boot: - Same as mono. Used to be a separate clock which accounted - for the time spent in suspend while CLOCK_MONOTONIC did - not. + This is the boot clock (CLOCK_BOOTTIME) and is based on the + fast monotonic clock, but also accounts for time spent in + suspend. Since the clock access is designed for use in + tracing in the suspend path, some side effects are possible + if clock is accessed after the suspend time is accounted before + the fast mono clock is updated. In this case, the clock update + appears to happen slightly sooner than it normally would have. + Also on 32-bit systems, it's possible that the 64-bit boot offset + sees a partial update. These effects are rare and post + processing should be able to handle them. See comments in the + ktime_get_boot_fast_ns() function for more information. To set a clock, simply echo the clock name into this file:: diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 46115a392098..c81c79d01d93 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -31,6 +31,7 @@ enum evdev_clock_type { EV_CLK_REAL = 0, EV_CLK_MONO, + EV_CLK_BOOT, EV_CLK_MAX }; @@ -197,10 +198,12 @@ static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid) case CLOCK_REALTIME: clk_type = EV_CLK_REAL; break; - case CLOCK_BOOTTIME: case CLOCK_MONOTONIC: clk_type = EV_CLK_MONO; break; + case CLOCK_BOOTTIME: + clk_type = EV_CLK_BOOT; + break; default: return -EINVAL; } @@ -311,6 +314,8 @@ static void evdev_events(struct input_handle *handle, ev_time[EV_CLK_MONO] = ktime_get(); ev_time[EV_CLK_REAL] = ktime_mono_to_real(ev_time[EV_CLK_MONO]); + ev_time[EV_CLK_BOOT] = ktime_mono_to_any(ev_time[EV_CLK_MONO], + TK_OFFS_BOOT); rcu_read_lock(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a2656c3ebe81..3892e9c8b2de 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -161,9 +161,11 @@ struct hrtimer_clock_base { enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, + HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, HRTIMER_BASE_MONOTONIC_SOFT, HRTIMER_BASE_REALTIME_SOFT, + HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 4b3dca173e89..7acb953298a7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -52,7 +52,6 @@ struct tk_read_base { * @offs_real: Offset clock monotonic -> clock realtime * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai - * @time_suspended: Accumulated suspend time * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events @@ -95,7 +94,6 @@ struct timekeeper { ktime_t offs_real; ktime_t offs_boot; ktime_t offs_tai; - ktime_t time_suspended; s32 tai_offset; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 9737fbec7019..588a0e4b1ab9 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -33,25 +33,20 @@ extern void ktime_get_ts64(struct timespec64 *ts); extern time64_t ktime_get_seconds(void); extern time64_t __ktime_get_real_seconds(void); extern time64_t ktime_get_real_seconds(void); -extern void ktime_get_active_ts64(struct timespec64 *ts); extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); extern void getboottime64(struct timespec64 *ts); -#define ktime_get_real_ts64(ts) getnstimeofday64(ts) - -/* Clock BOOTTIME compatibility wrappers */ -static inline void get_monotonic_boottime64(struct timespec64 *ts) -{ - ktime_get_ts64(ts); -} +#define ktime_get_real_ts64(ts) getnstimeofday64(ts) /* * ktime_t based interfaces */ + enum tk_offsets { TK_OFFS_REAL, + TK_OFFS_BOOT, TK_OFFS_TAI, TK_OFFS_MAX, }; @@ -62,10 +57,6 @@ extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); extern u32 ktime_get_resolution_ns(void); -/* Clock BOOTTIME compatibility wrappers */ -static inline ktime_t ktime_get_boottime(void) { return ktime_get(); } -static inline u64 ktime_get_boot_ns(void) { return ktime_get(); } - /** * ktime_get_real - get the real (wall-) time in ktime_t format */ @@ -74,6 +65,17 @@ static inline ktime_t ktime_get_real(void) return ktime_get_with_offset(TK_OFFS_REAL); } +/** + * ktime_get_boottime - Returns monotonic time since boot in ktime_t format + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the + * time spent in suspend. + */ +static inline ktime_t ktime_get_boottime(void) +{ + return ktime_get_with_offset(TK_OFFS_BOOT); +} + /** * ktime_get_clocktai - Returns the TAI time of day in ktime_t format */ @@ -100,6 +102,11 @@ static inline u64 ktime_get_real_ns(void) return ktime_to_ns(ktime_get_real()); } +static inline u64 ktime_get_boot_ns(void) +{ + return ktime_to_ns(ktime_get_boottime()); +} + static inline u64 ktime_get_tai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); @@ -112,11 +119,17 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); +extern u64 ktime_get_boot_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); /* * timespec64 interfaces utilizing the ktime based ones */ +static inline void get_monotonic_boottime64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get_boottime()); +} + static inline void timekeeping_clocktai64(struct timespec64 *ts) { *ts = ktime_to_timespec64(ktime_get_clocktai()); diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 16a296612ba4..4c0338ea308a 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -73,7 +73,6 @@ struct __kernel_old_timeval { */ #define CLOCK_SGI_CYCLE 10 #define CLOCK_TAI 11 -#define CLOCK_MONOTONIC_ACTIVE 12 #define MAX_CLOCKS 16 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index eda1210ce50f..14e858753d76 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,6 +90,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, + { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, @@ -105,6 +110,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, + { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, @@ -119,7 +129,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -571,12 +581,14 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_tai); + offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index e0dbae98db9d..69a937c3cd81 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -83,8 +83,6 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) case CLOCK_BOOTTIME: get_monotonic_boottime64(tp); break; - case CLOCK_MONOTONIC_ACTIVE: - ktime_get_active_ts64(tp); default: return -EINVAL; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b6899b5060bd..10b7186d0638 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -252,16 +252,15 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai64(tp); + get_monotonic_boottime64(tp); return 0; } -static int posix_get_monotonic_active(clockid_t which_clock, - struct timespec64 *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_active_ts64(tp); + timekeeping_clocktai64(tp); return 0; } @@ -1317,9 +1316,19 @@ static const struct k_clock clock_tai = { .timer_arm = common_hrtimer_arm, }; -static const struct k_clock clock_monotonic_active = { +static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_active, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { @@ -1330,11 +1339,10 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_monotonic, + [CLOCK_BOOTTIME] = &clock_boottime, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, - [CLOCK_MONOTONIC_ACTIVE] = &clock_monotonic_active, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 099572ca4a8f..49edc1c4f3e6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -419,19 +419,6 @@ void tick_suspend_local(void) clockevents_shutdown(td->evtdev); } -static void tick_forward_next_period(void) -{ - ktime_t delta, now = ktime_get(); - u64 n; - - delta = ktime_sub(now, tick_next_period); - n = ktime_divns(delta, tick_period); - tick_next_period += n * tick_period; - if (tick_next_period < now) - tick_next_period += tick_period; - tick_sched_forward_next_period(); -} - /** * tick_resume_local - Resume the local tick device * @@ -444,8 +431,6 @@ void tick_resume_local(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); - tick_forward_next_period(); - clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 21efab7485ca..e277284c2831 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,12 +141,6 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -extern void tick_sched_forward_next_period(void); -#else -static inline void tick_sched_forward_next_period(void) { } -#endif - /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d31bec177fa5..da9455a6b42b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -51,15 +51,6 @@ struct tick_sched *tick_get_tick_sched(int cpu) */ static ktime_t last_jiffies_update; -/* - * Called after resume. Make sure that jiffies are not fast forwarded due to - * clock monotonic being forwarded by the suspended time. - */ -void tick_sched_forward_next_period(void) -{ - last_jiffies_update = tick_next_period; -} - /* * Must be called with interrupts disabled ! */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dcf7f20fcd12..49cbceef5deb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,12 +138,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - /* Update both bases so mono and raw stay coupled. */ - tk->tkr_mono.base += delta; - tk->tkr_raw.base += delta; - - /* Accumulate time spent in suspend */ - tk->time_suspended += delta; + tk->offs_boot = ktime_add(tk->offs_boot, delta); } /* @@ -473,6 +468,36 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + + /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ @@ -764,6 +789,7 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; @@ -860,39 +886,6 @@ void ktime_get_ts64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts64); -/** - * ktime_get_active_ts64 - Get the active non-suspended monotonic clock - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime clock and - * the wall_to_monotonic offset, subtracts the accumulated suspend time and - * stores the result in normalized timespec64 format in the variable - * pointed to by @ts. - */ -void ktime_get_active_ts64(struct timespec64 *ts) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 tomono, tsusp; - u64 nsec, nssusp; - unsigned int seq; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(&tk->tkr_mono); - tomono = tk->wall_to_monotonic; - nssusp = tk->time_suspended; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - ts->tv_sec += tomono.tv_sec; - ts->tv_nsec = 0; - timespec64_add_ns(ts, nsec + tomono.tv_nsec); - tsusp = ns_to_timespec64(nssusp); - *ts = timespec64_sub(*ts, tsusp); -} - /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * @@ -1593,6 +1586,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, return; } tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -2125,7 +2119,7 @@ out: void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended); + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } @@ -2188,6 +2182,7 @@ void do_timer(unsigned long ticks) * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the @@ -2197,7 +2192,7 @@ void do_timer(unsigned long ticks) * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, - ktime_t *offs_tai) + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -2214,6 +2209,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 79b67f5e0343..7a9b4eb7a1d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -6,6 +6,7 @@ */ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dfbcf9ee1447..414d7210b2ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1165,7 +1165,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, - { ktime_get_mono_fast_ns, "boot", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3-59-g8ed1b From 0c92c7a3c5d416f47b32c5f20a611dfeca5d5f2e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Apr 2018 10:21:34 -0700 Subject: tracing: Fix bad use of igrab in trace_uprobe.c As Miklos reported and suggested: This pattern repeats two times in trace_uprobe.c and in kernel/events/core.c as well: ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) goto fail_address_parse; inode = igrab(d_inode(path.dentry)); path_put(&path); And it's wrong. You can only hold a reference to the inode if you have an active ref to the superblock as well (which is normally through path.mnt) or holding s_umount. This way unmounting the containing filesystem while the tracepoint is active will give you the "VFS: Busy inodes after unmount..." message and a crash when the inode is finally put. Solution: store path instead of inode. This patch fixes two instances in trace_uprobe.c. struct path is added to struct trace_uprobe to keep the inode and containing mount point referenced. Link: http://lkml.kernel.org/r/20180423172135.4050588-1-songliubraving@fb.com Fixes: f3f096cfedf8 ("tracing: Provide trace events interface for uprobes") Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU") Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Howard McLauchlan Cc: Josef Bacik Cc: Srikar Dronamraju Acked-by: Miklos Szeredi Reported-by: Miklos Szeredi Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34fd0e0ec51d..ac892878dbe6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -55,6 +55,7 @@ struct trace_uprobe { struct list_head list; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; + struct path path; struct inode *inode; char *filename; unsigned long offset; @@ -289,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) for (i = 0; i < tu->tp.nr_args; i++) traceprobe_free_probe_arg(&tu->tp.args[i]); - iput(tu->inode); + path_put(&tu->path); kfree(tu->tp.call.class->system); kfree(tu->tp.call.name); kfree(tu->filename); @@ -363,7 +364,6 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - struct inode *inode; char *arg, *event, *group, *filename; char buf[MAX_EVENT_NAME_LEN]; struct path path; @@ -371,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv) bool is_delete, is_return; int i, ret; - inode = NULL; ret = 0; is_delete = false; is_return = false; @@ -437,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv) } /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(argv[1], ':'); - if (!arg) { - ret = -EINVAL; - goto fail_address_parse; - } + if (!arg) + return -EINVAL; *arg++ = '\0'; filename = argv[1]; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) - goto fail_address_parse; - - inode = igrab(d_real_inode(path.dentry)); - path_put(&path); + return ret; - if (!inode || !S_ISREG(inode->i_mode)) { + if (!d_is_reg(path.dentry)) { ret = -EINVAL; goto fail_address_parse; } @@ -490,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); if (!tu->filename) { @@ -558,7 +552,7 @@ error: return ret; fail_address_parse: - iput(inode); + path_put(&path); pr_info("Failed to parse address or file.\n"); @@ -922,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, goto err_flags; tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) goto err_buffer; @@ -967,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); @@ -1337,7 +1333,6 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) { struct trace_uprobe *tu; - struct inode *inode; struct path path; int ret; @@ -1345,11 +1340,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (ret) return ERR_PTR(ret); - inode = igrab(d_inode(path.dentry)); - path_put(&path); - - if (!inode || !S_ISREG(inode->i_mode)) { - iput(inode); + if (!d_is_reg(path.dentry)) { + path_put(&path); return ERR_PTR(-EINVAL); } @@ -1364,11 +1356,12 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); + path_put(&path); return ERR_CAST(tu); } tu->offset = offs; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); -- cgit v1.2.3-59-g8ed1b From 61f94203c9efcaf44a7435298697caf406476c79 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Apr 2018 10:21:35 -0700 Subject: tracing: Remove igrab() iput() call from uprobes.c Caller of uprobe_register is required to keep the inode and containing mount point referenced. There was misuse of igrab() in uprobes.c and trace_uprobe.c. This is because igrab() will not prevent umount of the containing mount point. To fix this, we added path to struct trace_uprobe, which keeps the inode and containing mount reference. For uprobes.c, it is not necessary to call igrab() in uprobe_register(), as the caller is required to keep the inode reference. The igrab() is removed and comments on this requirement is added to uprobe_register(). Link: http://lkml.kernel.org/r/CAELBmZB2XX=qEOLAdvGG4cPx4GEntcSnWQquJLUK1ongRj35cA@mail.gmail.com Link: http://lkml.kernel.org/r/20180423172135.4050588-2-songliubraving@fb.com Cc: Ingo Molnar Cc: Howard McLauchlan Cc: Josef Bacik Cc: Srikar Dronamraju Acked-by: Miklos Szeredi Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/events/uprobes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ce6848e46e94..1725b902983f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (!uprobe) return NULL; - uprobe->inode = igrab(inode); + uprobe->inode = inode; uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (cur_uprobe) { kfree(uprobe); uprobe = cur_uprobe; - iput(inode); } return uprobe; @@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe) rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - iput(uprobe->inode); put_uprobe(uprobe); } @@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. + * unregisters. Caller of uprobe_register() is required to keep @inode + * (and the containing mount) referenced. * * Return errno if it cannot successully install probes * else return 0 (success) -- cgit v1.2.3-59-g8ed1b From 608940dabe1bd2ce4c97524004ec86637cf80f2c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 26 Apr 2018 20:04:47 -0500 Subject: tracing: Restore proper field flag printing when displaying triggers The flag-printing code used when displaying hist triggers somehow got dropped during refactoring of the inter-event patchset. This restores it. Below are a couple examples - in the first case, .usecs wasn't being displayed properly for common_timestamps and the second illustrates the same for other flags such as .execname. Before: # echo 'hist:key=common_pid.execname:val=count:sort=count' > /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger hist:keys=common_pid:vals=hitcount,count:sort=count:size=2048 [active] # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger hist:keys=pid:vals=hitcount:ts0=common_timestamp:sort=hitcount:size=2048:clock=global if comm=="cyclictest" [active] After: # echo 'hist:key=common_pid.execname:val=count:sort=count' > /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger hist:keys=common_pid.execname:vals=hitcount,count:sort=count:size=2048 [active] # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global if comm=="cyclictest" [active] Link: http://lkml.kernel.org/r/492bab42ff21806600af98a8ea901af10efbee0c.1524790601.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0d7b3ffbecc2..66c87be4ebb2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4913,6 +4913,16 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, "%s", field_name); } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); + + if (hist_field->flags) { + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && + !(hist_field->flags & HIST_FIELD_FL_EXPR)) { + const char *flags = get_hist_field_flags(hist_field); + + if (flags) + seq_printf(m, ".%s", flags); + } + } } static int event_hist_trigger_print(struct seq_file *m, -- cgit v1.2.3-59-g8ed1b From 5ec432d7bf9dd3b4a2b84f8974e3adb71f45fb1d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 26 Apr 2018 20:04:48 -0500 Subject: tracing: Add field parsing hist error for hist triggers If the user specifies a nonexistent field for a hist trigger, the current code correctly flags that as an error, but doesn't tell the user what happened. Fix this by invoking hist_err() with an appropriate message when nonexistent fields are specified. Before: # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger -su: echo: write error: Invalid argument # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist After: # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger -su: echo: write error: Invalid argument # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist ERROR: Couldn't find field: pid Last command: keys=pid:ts0=common_timestamp.usecs Link: http://lkml.kernel.org/r/fdc8746969d16906120f162b99dd71c741e0b62c.1524790601.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Reported-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 66c87be4ebb2..f231fa2a3dcd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2481,6 +2481,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { + hist_err("Couldn't find field: ", field_name); field = ERR_PTR(-EINVAL); goto out; } -- cgit v1.2.3-59-g8ed1b From dcf234577cd31fa16874e828b90659166ad6b80d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 26 Apr 2018 20:04:49 -0500 Subject: tracing: Add field modifier parsing hist error for hist triggers If the user specifies an invalid field modifier for a hist trigger, the current code correctly flags that as an error, but doesn't tell the user what happened. Fix this by invoking hist_err() with an appropriate message when invalid modifiers are specified. Before: # echo 'hist:keys=pid:ts0=common_timestamp.junkusecs' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger -su: echo: write error: Invalid argument # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist After: # echo 'hist:keys=pid:ts0=common_timestamp.junkusecs' >> /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger -su: echo: write error: Invalid argument # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist ERROR: Invalid field modifier: junkusecs Last command: keys=pid:ts0=common_timestamp.junkusecs Link: http://lkml.kernel.org/r/b043c59fa79acd06a5f14a1d44dee9e5a3cd1248.1524790601.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f231fa2a3dcd..b9061ed59bbd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2466,6 +2466,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { + hist_err("Invalid field modifier: ", modifier); field = ERR_PTR(-EINVAL); goto out; } -- cgit v1.2.3-59-g8ed1b From d66a270be3310d7aa132fec0cea77d3d32a0ff75 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Mar 2018 08:44:24 -0400 Subject: tracepoint: Do not warn on ENOMEM Tracepoint should only warn when a kernel API user does not respect the required preconditions (e.g. same tracepoint enabled twice, or called to remove a tracepoint that does not exist). Silence warning in out-of-memory conditions, given that the error is returned to the caller. This ensures that out-of-memory error-injection testing does not trigger warnings in tracepoint.c, which were seen by syzbot. Link: https://lkml.kernel.org/r/001a114465e241a8720567419a72@google.com Link: https://lkml.kernel.org/r/001a1140e0de15fc910567464190@google.com Link: http://lkml.kernel.org/r/20180315124424.32319-1-mathieu.desnoyers@efficios.com CC: Peter Zijlstra CC: Jiri Olsa CC: Arnaldo Carvalho de Melo CC: Alexander Shishkin CC: Namhyung Kim CC: stable@vger.kernel.org Fixes: de7b2973903c6 ("tracepoint: Use struct pointer instead of name hash for reg/unreg tracepoints") Reported-by: syzbot+9c0d616860575a73166a@syzkaller.appspotmail.com Reported-by: syzbot+4e9ae7fa46233396f64d@syzkaller.appspotmail.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 671b13457387..1e37da2e0c25 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } @@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } -- cgit v1.2.3-59-g8ed1b From 3cc9a472d625f31f981063882b07e96229b9e71a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 2 May 2018 13:50:19 -0700 Subject: bpf: sockmap, fix scatterlist update on error path in send with apply When the call to do_tcp_sendpage() fails to send the complete block requested we either retry if only a partial send was completed or abort if we receive a error less than or equal to zero. Before returning though we must update the scatterlist length/offset to account for any partial send completed. Before this patch we did this at the end of the retry loop, but this was buggy when used while applying a verdict to fewer bytes than in the scatterlist. When the scatterlist length was being set we forgot to account for the apply logic reducing the size variable. So the result was we chopped off some bytes in the scatterlist without doing proper cleanup on them. This results in a WARNING when the sock is tore down because the bytes have previously been charged to the socket but are never uncharged. The simple fix is to simply do the accounting inside the retry loop subtracting from the absolute scatterlist values rather than trying to accumulate the totals and subtract at the end. Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 634415c7fbcd..943929a05c92 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -326,6 +326,9 @@ retry: if (ret > 0) { if (apply) apply_bytes -= ret; + + sg->offset += ret; + sg->length -= ret; size -= ret; offset += ret; if (uncharge) @@ -333,8 +336,6 @@ retry: goto retry; } - sg->length = size; - sg->offset = offset; return ret; } -- cgit v1.2.3-59-g8ed1b From fec51d40ea65dd8f51a3e27fc69b4e7dc4f17776 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 2 May 2018 13:50:24 -0700 Subject: bpf: sockmap, zero sg_size on error when buffer is released When an error occurs during a redirect we have two cases that need to be handled (i) we have a cork'ed buffer (ii) we have a normal sendmsg buffer. In the cork'ed buffer case we don't currently support recovering from errors in a redirect action. So the buffer is released and the error should _not_ be pushed back to the caller of sendmsg/sendpage. The rationale here is the user will get an error that relates to old data that may have been sent by some arbitrary thread on that sock. Instead we simple consume the data and tell the user that the data has been consumed. We may add proper error recovery in the future. However, this patch fixes a bug where the bytes outstanding counter sg_size was not zeroed. This could result in a case where if the user has both a cork'ed action and apply action in progress we may incorrectly call into the BPF program when the user expected an old verdict to be applied via the apply action. I don't have a use case where using apply and cork at the same time is valid but we never explicitly reject it because it should work fine. This patch ensures the sg_size is zeroed so we don't have this case. In the normal sendmsg buffer case (no cork data) we also do not zero sg_size. Again this can confuse the apply logic when the logic calls into the BPF program when the BPF programmer expected the old verdict to remain. So ensure we set sg_size to zero here as well. And additionally to keep the psock state in-sync with the sk_msg_buff release all the memory as well. Previously we did this before returning to the user but this left a gap where psock and sk_msg_buff states were out of sync which seems fragile. No additional overhead is taken here except for a call to check the length and realize its already been freed. This is in the error path as well so in my opinion lets have robust code over optimized error paths. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 943929a05c92..052c313b12db 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -701,15 +701,22 @@ more_data: err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); lock_sock(sk); + if (unlikely(err < 0)) { + free_start_sg(sk, m); + psock->sg_size = 0; + if (!cork) + *copied -= send; + } else { + psock->sg_size -= send; + } + if (cork) { free_start_sg(sk, m); + psock->sg_size = 0; kfree(m); m = NULL; + err = 0; } - if (unlikely(err)) - *copied -= err; - else - psock->sg_size -= send; break; case __SK_DROP: default: -- cgit v1.2.3-59-g8ed1b From abaeb096ca38cad02c8a68c49ddd7efc043c319a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 2 May 2018 13:50:29 -0700 Subject: bpf: sockmap, fix error handling in redirect failures When a redirect failure happens we release the buffers in-flight without calling a sk_mem_uncharge(), the uncharge is called before dropping the sock lock for the redirecte, however we missed updating the ring start index. When no apply actions are in progress this is OK because we uncharge the entire buffer before the redirect. But, when we have apply logic running its possible that only a portion of the buffer is being redirected. In this case we only do memory accounting for the buffer slice being redirected and expect to be able to loop over the BPF program again and/or if a sock is closed uncharge the memory at sock destruct time. With an invalid start index however the program logic looks at the start pointer index, checks the length, and when seeing the length is zero (from the initial release and failure to update the pointer) aborts without uncharging/releasing the remaining memory. The fix for this is simply to update the start index. To avoid fixing this error in two locations we do a small refactor and remove one case where it is open-coded. Then fix it in the single function. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 052c313b12db..098eca568c2b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -393,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) } while (i != md->sg_end); } -static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +static void free_bytes_sg(struct sock *sk, int bytes, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = md->sg_start, free; @@ -403,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (bytes < free) { sg[i].length -= bytes; sg[i].offset += bytes; - sk_mem_uncharge(sk, bytes); + if (charge) + sk_mem_uncharge(sk, bytes); break; } - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); put_page(sg_page(&sg[i])); bytes -= sg[i].length; sg[i].length = 0; @@ -418,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + md->sg_start = i; } static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) @@ -576,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) { + bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; struct scatterlist *sg; - int i, err, free = 0; - bool ingress = !!(md->flags & BPF_F_INGRESS); + int err = 0; sg = md->sg_data; @@ -607,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, out_rcu: rcu_read_unlock(); out: - i = md->sg_start; - while (sg[i].length) { - free += sg[i].length; - put_page(sg_page(&sg[i])); - sg[i].length = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } - return free; + free_bytes_sg(NULL, send, md, false); + return err; } static inline void bpf_md_init(struct smap_psock *psock) @@ -720,7 +716,7 @@ more_data: break; case __SK_DROP: default: - free_bytes_sg(sk, send, m); + free_bytes_sg(sk, send, m, true); apply_bytes_dec(psock, send); *copied -= send; psock->sg_size -= send; -- cgit v1.2.3-59-g8ed1b