From 245d73698ed7abdc7e520dfa38048bb80ce89571 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Oct 2019 16:41:58 -0700 Subject: audit: Report suspicious O_CREAT usage This renames the very specific audit_log_link_denied() to audit_log_path_denied() and adds the AUDIT_* type as an argument. This allows for the creation of the new AUDIT_ANOM_CREAT that can be used to report the fifo/regular file creation restrictions that were introduced in commit 30aba6656f61 ("namei: allow restricted O_CREAT of FIFOs and regular files"). Signed-off-by: Kees Cook Signed-off-by: Paul Moore --- kernel/audit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index da8dc0db5bd3..d75485aa25ff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2155,18 +2155,19 @@ void audit_log_task_info(struct audit_buffer *ab) EXPORT_SYMBOL(audit_log_task_info); /** - * audit_log_link_denied - report a link restriction denial - * @operation: specific link operation + * audit_log_path_denied - report a path restriction denial + * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) + * @operation: specific operation name */ -void audit_log_link_denied(const char *operation) +void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled || audit_dummy_context()) return; - /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ - ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK); + /* Generate log with subject, operation, outcome. 
*/ + ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); -- cgit v1.2.3-59-g8ed1b From 0f8b5b6d56b5fa4085c06945ea3e1ee5941ecfeb Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:17 -0700 Subject: kgdb: Remove unused DCPU_SSTEP definition From doing a 'git log --patch kernel/debug', it looks as if DCPU_SSTEP has never been used. Presumably it used to be used back when kgdb was out of tree and nobody thought to delete the definition when the usage went away. Delete. Signed-off-by: Douglas Anderson Acked-by: Jason Wessel Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index b4a7c326d546..804b0fe5a0ba 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,7 +33,6 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ -#define DCPU_SSTEP 0x8 /* CPU is single stepping */ struct debuggerinfo_struct { void *debuggerinfo; -- cgit v1.2.3-59-g8ed1b From 54af3e39eed7d77f0923511f3c7f446e7d477635 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:18 -0700 Subject: kdb: Remove unused "argcount" param from kdb_bt1(); make btaprompt bool The kdb_bt1() had a mysterious "argcount" parameter passed in (always the number 5, by the way) and never used. Presumably this is just old cruft. Remove it. While at it, upgrade the btaprompt parameter to a full fledged bool instead of an int. 
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7e2379aa0a1e..120fc686c919 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -78,8 +78,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) */ static int -kdb_bt1(struct task_struct *p, unsigned long mask, - int argcount, int btaprompt) +kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { char buffer[2]; if (kdb_getarea(buffer[0], (unsigned long)p) || @@ -106,7 +105,6 @@ int kdb_bt(int argc, const char **argv) { int diag; - int argcount = 5; int btaprompt = 1; int nextarg; unsigned long addr; @@ -125,7 +123,7 @@ kdb_bt(int argc, const char **argv) /* Run the active tasks first */ for_each_online_cpu(cpu) { p = kdb_curr_task(cpu); - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } /* Now the inactive tasks */ @@ -134,7 +132,7 @@ kdb_bt(int argc, const char **argv) return 0; if (task_curr(p)) continue; - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } kdb_while_each_thread(g, p); } else if (strcmp(argv[0], "btp") == 0) { @@ -148,7 +146,7 @@ kdb_bt(int argc, const char **argv) p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) { kdb_set_current_task(p); - return kdb_bt1(p, ~0UL, argcount, 0); + return kdb_bt1(p, ~0UL, false); } kdb_printf("No process with pid == %ld found\n", pid); return 0; @@ -159,7 +157,7 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; kdb_set_current_task((struct task_struct *)addr); - return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + return kdb_bt1((struct task_struct *)addr, ~0UL, false); } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = 
kdb_current_task; @@ -211,7 +209,7 @@ kdb_bt(int argc, const char **argv) kdb_show_stack(kdb_current_task, (void *)addr); return 0; } else { - return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + return kdb_bt1(kdb_current_task, ~0UL, false); } } -- cgit v1.2.3-59-g8ed1b From 55a7e23f461fc2c321d7efcdeca1750085e9323f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:19 -0700 Subject: kdb: Fix "btc <cpu>" crash if the CPU didn't round up I noticed that when I did "btc <cpu>" and the CPU I passed in hadn't rounded up that I'd crash. I was going to copy the same fix from commit 162bc7f5afd7 ("kdb: Don't back trace on a cpu that didn't round up") into the "not all the CPUs" case, but decided it'd be better to clean things up a little bit. This consolidates the two code paths. It is _slightly_ wasteful in that the checks for "cpu" being too small or being offline aren't really needed when we're iterating over all online CPUs, but that really shouldn't hurt. Better to have the same code path. While at it, eliminate at least one slightly ugly (and totally needless) recursive use of kdb_parse(). 
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 61 ++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 120fc686c919..d9af139f9a31 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -101,6 +101,27 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) return 0; } +static void +kdb_bt_cpu(unsigned long cpu) +{ + struct task_struct *kdb_tsk; + + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("WARNING: no process for cpu %ld\n", cpu); + return; + } + + /* If a CPU failed to round up we could be here */ + kdb_tsk = KDB_TSK(cpu); + if (!kdb_tsk) { + kdb_printf("WARNING: no task for cpu %ld\n", cpu); + return; + } + + kdb_set_current_task(kdb_tsk); + kdb_bt1(kdb_tsk, ~0UL, false); +} + int kdb_bt(int argc, const char **argv) { @@ -161,7 +182,6 @@ kdb_bt(int argc, const char **argv) } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = kdb_current_task; - char buf[80]; if (argc > 1) return KDB_ARGCOUNT; if (argc == 1) { @@ -169,35 +189,22 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; } - /* Recursive use of kdb_parse, do not use argv after - * this point */ - argv = NULL; if (cpu != ~0) { - if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { - kdb_printf("no process for cpu %ld\n", cpu); - return 0; - } - sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); - kdb_parse(buf); - return 0; - } - kdb_printf("btc: cpu status: "); - kdb_parse("cpu\n"); - for_each_online_cpu(cpu) { - void *kdb_tsk = KDB_TSK(cpu); - - /* If a CPU failed to round up we could be here */ - if (!kdb_tsk) { - kdb_printf("WARNING: no task for cpu %ld\n", - cpu); - continue; + kdb_bt_cpu(cpu); + } else { + /* + * Recursive use of kdb_parse, do not use argv after + * this point. 
+ */ + argv = NULL; + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + kdb_bt_cpu(cpu); + touch_nmi_watchdog(); } - - sprintf(buf, "btt 0x%px\n", kdb_tsk); - kdb_parse(buf); - touch_nmi_watchdog(); + kdb_set_current_task(save_current_task); } - kdb_set_current_task(save_current_task); return 0; } else { if (argc) { -- cgit v1.2.3-59-g8ed1b From 2277b492582d5525244519f60da6f9daea5ef41a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:20 -0700 Subject: kdb: Fix stack crawling on 'running' CPUs that aren't the master In kdb when you do 'btc' (back trace on CPU) it doesn't necessarily give you the right info. Specifically on many architectures (including arm64, where I tested) you can't dump the stack of a "running" process that isn't the process running on the current CPU. This can be seen by this: echo SOFTLOCKUP > /sys/kernel/debug/provoke-crash/DIRECT # wait 2 seconds <sysrq>g Here's what I see now on rk3399-gru-kevin. I see the stack crawl for the CPU that handled the sysrq but everything else just shows me stuck in __switch_to() which is bogus: ====== [0]kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0, 1-3(I), 4, 5(I) Stack traceback for pid 0 0xffffff801101a9c0 0 0 1 0 R 0xffffff801101b3b0 *swapper/0 Call trace: dump_backtrace+0x0/0x138 ... kgdb_compiled_brk_fn+0x34/0x44 ... 
sysrq_handle_dbg+0x34/0x5c Stack traceback for pid 0 0xffffffc0f175a040 0 0 1 1 I 0xffffffc0f175aa30 swapper/1 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65616c0 Stack traceback for pid 0 0xffffffc0f175d040 0 0 1 2 I 0xffffffc0f175da30 swapper/2 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65806c0 Stack traceback for pid 0 0xffffffc0f175b040 0 0 1 3 I 0xffffffc0f175ba30 swapper/3 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f659f6c0 Stack traceback for pid 1474 0xffffffc0dde8b040 1474 727 1 4 R 0xffffffc0dde8ba30 bash Call trace: __switch_to+0x1e4/0x240 __schedule+0x464/0x618 0xffffffc0dde8b040 Stack traceback for pid 0 0xffffffc0f17b0040 0 0 1 5 I 0xffffffc0f17b0a30 swapper/5 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65dd6c0 === The problem is that 'btc' eventually boils down to show_stack(task_struct, NULL); ...and show_stack() doesn't work for "running" CPUs because their registers haven't been stashed. On x86 things might work better (I haven't tested) because kdb has a special case for x86 in kdb_show_stack() where it passes the stack pointer to show_stack(). This wouldn't work on arm64 where the stack crawling function seems to need the "fp" and "pc", not the "sp" which is presumably why arm64's show_stack() function totally ignores the "sp" parameter. NOTE: we _can_ get a good stack dump for all the cpus if we manually switch each one to the kdb master and do a back trace. AKA: cpu 4 bt ...will give the expected trace. That's because arm64's dump_backtrace will now see that "tsk == current" and go through a different path. In this patch I fix the problems by catching a request to stack crawl a task that's running on a CPU and then I ask that CPU to do the stack crawl. NOTE: this will (presumably) change what stack crawls are printed for x86 machines. Now kdb functions will show up in the stack crawl. Presumably this is OK but if it's not we can go back and add a special case for x86 again. 
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 34 ++++++++++++++++++++++++++++++++++ kernel/debug/debug_core.h | 2 ++ kernel/debug/kdb/kdb_bt.c | 19 +++++++------------ 3 files changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index f76d6f77dd5e..70e86b4b4932 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -441,6 +441,37 @@ setundefined: return 0; } +#ifdef CONFIG_KGDB_KDB +void kdb_dump_stack_on_cpu(int cpu) +{ + if (cpu == raw_smp_processor_id()) { + dump_stack(); + return; + } + + if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { + kdb_printf("ERROR: Task on cpu %d didn't stop in the debugger\n", + cpu); + return; + } + + /* + * In general, architectures don't support dumping the stack of a + * "running" process that's not the current one. From the point of + * view of the Linux kernel, processes that are looping in the kgdb + * slave loop are still "running". There's also no API (that actually + * works across all architectures) that can do a stack crawl based + * on registers passed as a parameter. + * + * Solve this conundrum by asking slave CPUs to do the backtrace + * themselves. + */ + kgdb_info[cpu].exception_state |= DCPU_WANT_BT; + while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) + cpu_relax(); +} +#endif + /* * Return true if there is a valid kgdb I/O module. 
Also if no * debugger is attached a message can be printed to the console about @@ -580,6 +611,9 @@ cpu_loop: atomic_xchg(&kgdb_active, cpu); break; } + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) { + dump_stack(); + kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT; } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { if (!raw_spin_is_locked(&dbg_slave_lock)) goto return_normal; diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 804b0fe5a0ba..cd22b5f68831 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,6 +33,7 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_WANT_BT 0x8 /* Slave cpu should backtrace then clear flag */ struct debuggerinfo_struct { void *debuggerinfo; @@ -75,6 +76,7 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); +extern void kdb_dump_stack_on_cpu(int cpu); #else /* ! 
CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index d9af139f9a31..0e94efe07b72 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -22,20 +22,15 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - kdb_set_current_task(p); - if (addr) { - show_stack((struct task_struct *)p, addr); - } else if (kdb_current_regs) { -#ifdef CONFIG_X86 - show_stack(p, &kdb_current_regs->sp); -#else - show_stack(p, NULL); -#endif - } else { - show_stack(p, NULL); - } + + if (!addr && kdb_task_has_cpu(p)) + kdb_dump_stack_on_cpu(kdb_process_cpu(p)); + else + show_stack(p, addr); + console_loglevel = old_lvl; kdb_trap_printk--; } -- cgit v1.2.3-59-g8ed1b From fb3c5386b382d4097476ce9647260fc89b34afdb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 20 Sep 2019 10:30:05 +0200 Subject: seccomp: add SECCOMP_USER_NOTIF_FLAG_CONTINUE This allows the seccomp notifier to continue a syscall. A positive discussion about this feature was triggered by a post to the ksummit-discuss mailing list (cf. [3]) and took place during KSummit (cf. [1]) and again at the containers/checkpoint-restore micro-conference at Linux Plumbers. Recently we landed seccomp support for SECCOMP_RET_USER_NOTIF (cf. [4]) which enables a process (watchee) to retrieve an fd for its seccomp filter. This fd can then be handed to another (usually more privileged) process (watcher). The watcher will then be able to receive seccomp messages about the syscalls having been performed by the watchee. This feature is heavily used in some userspace workloads. For example, it is currently used to intercept mknod() syscalls in user namespaces aka in containers. The mknod() syscall can be easily filtered based on dev_t. This allows us to only intercept a very specific subset of mknod() syscalls. 
Furthermore, mknod() is not possible in user namespaces toto coelo and so intercepting and denying syscalls that are not in the whitelist by accident is not a big deal. The watchee won't notice a difference. In contrast to mknod(), a lot of other syscalls we intercept (e.g. setxattr()) cannot be easily filtered like mknod() because they have pointer arguments. Additionally, some of them might actually succeed in user namespaces (e.g. setxattr() for all "user.*" xattrs). Since we currently cannot tell seccomp to continue from a user notifier we are stuck with performing all of the syscalls in lieu of the container. This is a huge security liability since it is extremely difficult to correctly assume all of the necessary privileges of the calling task such that the syscall can be successfully emulated without escaping other additional security restrictions (think missing CAP_MKNOD for mknod(), or MS_NODEV on a filesystem etc.). This can be solved by telling seccomp to resume the syscall. One thing that came up in the discussion was the problem that another thread could change the memory after userspace has decided to let the syscall continue which is a well known TOCTOU with seccomp which is present in other ways already. The discussion showed that this feature is already very useful for any syscall without pointer arguments. For any accidentally intercepted non-pointer syscall it is safe to continue. For syscalls with pointer arguments there is a race but for any cautious userspace and the main use cases the race doesn't matter. The notifier is intended to be used in a scenario where a more privileged watcher supervises the syscalls of a lesser privileged watchee to allow it to get around kernel-enforced limitations by performing the syscall for it whenever deemed safe by the watcher. 
Hence, if a user tricks the watcher into allowing a syscall they will either get a deny based on kernel-enforced restrictions later or they will have changed the arguments in such a way that they manage to perform a syscall with arguments that they would've been allowed to do anyway. In general, it is good to point out again, that the notifier fd was not intended to allow userspace to implement a security policy but rather to work around kernel security mechanisms in cases where the watcher knows that a given action is safe to perform. /* References */ [1]: https://linuxplumbersconf.org/event/4/contributions/560 [2]: https://linuxplumbersconf.org/event/4/contributions/477 [3]: https://lore.kernel.org/r/20190719093538.dhyopljyr5ns33qx@brauner.io [4]: commit 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Co-developed-by: Kees Cook Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Cc: Andy Lutomirski Cc: Will Drewry CC: Tyler Hicks Link: https://lore.kernel.org/r/20190920083007.11475-2-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 29 +++++++++++++++++++++++++++++ kernel/seccomp.c | 28 ++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 90734aa5aa36..e48e2fa2d248 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -76,6 +76,35 @@ struct seccomp_notif { struct seccomp_data data; }; +/* + * Valid flags for struct seccomp_notif_resp + * + * Note, the SECCOMP_USER_NOTIF_FLAG_CONTINUE flag must be used with caution! + * If set by the process supervising the syscalls of another process the + * syscall will continue. This is problematic because of an inherent TOCTOU. 
+ * An attacker can exploit the time while the supervised process is waiting on + * a response from the supervising process to rewrite syscall arguments which + * are passed as pointers of the intercepted syscall. + * It should be absolutely clear that this means that the seccomp notifier + * _cannot_ be used to implement a security policy! It should only ever be used + * in scenarios where a more privileged process supervises the syscalls of a + * lesser privileged process to get around kernel-enforced security + * restrictions when the privileged process deems this safe. In other words, + * in order to continue a syscall the supervising process should be sure that + * another security mechanism or the kernel itself will sufficiently block + * syscalls if arguments are rewritten to something unsafe. + * + * Similar precautions should be applied when stacking SECCOMP_RET_USER_NOTIF + * or SECCOMP_RET_TRACE. For SECCOMP_RET_USER_NOTIF filters acting on the + * same syscall, the most recently added filter takes precedence. This means + * that the new SECCOMP_RET_USER_NOTIF filter can override any + * SECCOMP_IOCTL_NOTIF_SEND from earlier filters, essentially allowing all + * such filtered syscalls to be executed by sending the response + * SECCOMP_USER_NOTIF_FLAG_CONTINUE. Note that SECCOMP_RET_TRACE can equally + * be overridden by SECCOMP_USER_NOTIF_FLAG_CONTINUE. 
+ */ +#define SECCOMP_USER_NOTIF_FLAG_CONTINUE BIT(0) + struct seccomp_notif_resp { __u64 id; __s64 val; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dba52a7db5e8..12d2227e5786 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -75,6 +75,7 @@ struct seccomp_knotif { /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ int error; long val; + u32 flags; /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ struct completion ready; @@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } -static void seccomp_do_user_notification(int this_syscall, - struct seccomp_filter *match, - const struct seccomp_data *sd) +static int seccomp_do_user_notification(int this_syscall, + struct seccomp_filter *match, + const struct seccomp_data *sd) { int err; + u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; @@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall, if (err == 0) { ret = n.val; err = n.error; + flags = n.flags; } /* @@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall, list_del(&n.list); out: mutex_unlock(&match->notify_lock); + + /* Userspace requests to continue the syscall. 
*/ + if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return 0; + syscall_set_return_value(current, task_pt_regs(current), err, ret); + return -1; } static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, @@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_USER_NOTIF: - seccomp_do_user_notification(this_syscall, match, sd); - goto skip; + if (seccomp_do_user_notification(this_syscall, match, sd)) + goto skip; + + return 0; case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); @@ -1087,7 +1098,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (copy_from_user(&resp, buf, sizeof(resp))) return -EFAULT; - if (resp.flags) + if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return -EINVAL; + + if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) && + (resp.error || resp.val)) return -EINVAL; ret = mutex_lock_interruptible(&filter->notify_lock); @@ -1116,6 +1131,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, knotif->state = SECCOMP_NOTIFY_REPLIED; knotif->error = resp.error; knotif->val = resp.val; + knotif->flags = resp.flags; complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); -- cgit v1.2.3-59-g8ed1b From da6043fe85eb5ec621e34a92540735dcebbea134 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 25 Sep 2019 15:39:12 +0100 Subject: PM / hibernate: memory_bm_find_bit(): Tighten node optimisation When looking for a bit by number we make use of the cached result from the preceding lookup to speed up operation. Firstly we check if the requested pfn is within the cached zone and if not lookup the new zone. We then check if the offset for that pfn falls within the existing cached node. This happens regardless of whether the node is within the zone we are now scanning. With certain memory layouts it is possible for this to false trigger creating a temporary alias for the pfn to a different bit. 
This leads the hibernation code to free memory which it never allocated, with the expected fallout. Ensure the zone we are scanning matches the cached zone before considering the cached node. Deep thanks go to Andrea for many, many, many hours of hacking and testing that went into cornering this bug. Reported-by: Andrea Righi Tested-by: Andrea Righi Signed-off-by: Andy Whitcroft Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 83105874f255..26b9168321e7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -734,8 +734,15 @@ zone_found: * We have found the zone. Now walk the radix tree to find the leaf node * for our PFN. */ + + /* + * If the zone we wish to scan is the current zone and the + * pfn falls into the current node then we do not need to walk + * the tree. + */ node = bm->cur.node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + if (zone == bm->cur.zone && + ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; -- cgit v1.2.3-59-g8ed1b From ce4dd4429b3c7e4506870796f3b8b06d707d2928 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Oct 2019 15:13:41 +0100 Subject: Remove the nr_exclusive argument from __wake_up_sync_key() Remove the nr_exclusive argument from __wake_up_sync_key() and derived functions as everything seems to set it to 1. Note also that if it wasn't set to 1, it would clear WF_SYNC anyway. 
Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- include/linux/wait.h | 8 ++++---- kernel/exit.c | 2 +- kernel/sched/wait.c | 14 ++++---------- 3 files changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 3eb7cae8206c..bb7676d396cd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -201,9 +201,9 @@ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); -void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); -void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -214,7 +214,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) -#define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) /* * Wakeup macros to be used to report events to the targets. 
@@ -228,7 +228,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ - __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, poll_to_key(m)) + __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define ___wait_cond_timeout(condition) \ ({ \ diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..a1ff25ef050e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1435,7 +1435,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, - TASK_INTERRUPTIBLE, 1, p); + TASK_INTERRUPTIBLE, p); } static long do_wait(struct wait_opts *wo) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c1e566a114ca..b4b52361dab7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule @@ -183,26 +182,21 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * accessing the task state. 
*/ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) + void *key) { - int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!wq_head)) return; - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); + __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) { - __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ -- cgit v1.2.3-59-g8ed1b From d07ce4e32a8d68062c58a3e635619313c52d0bf7 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 21 Oct 2019 11:10:56 +0100 Subject: kdb: Avoid array subscript warnings on non-SMP builds Recent versions of gcc (reported on gcc-7.4) issue array subscript warnings for builds where SMP is not enabled. kernel/debug/debug_core.c: In function 'kdb_dump_stack_on_cpu': kernel/debug/debug_core.c:452:17: warning: array subscript is outside array +bounds [-Warray-bounds] if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { ~~~~~~~~~^~~~~ kernel/debug/debug_core.c:469:33: warning: array subscript is outside array +bounds [-Warray-bounds] kgdb_info[cpu].exception_state |= DCPU_WANT_BT; kernel/debug/debug_core.c:470:18: warning: array subscript is outside array +bounds [-Warray-bounds] while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) There is no bug here but there is scope to improve the code generation for non-SMP systems (whilst also silencing the warning). 
Reported-by: kbuild test robot Fixes: 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master") Signed-off-by: Daniel Thompson Link: https://lore.kernel.org/r/20191021101057.23861-1-daniel.thompson@linaro.org Reviewed-by: Douglas Anderson --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 70e86b4b4932..2b7c9b67931d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -444,7 +444,7 @@ setundefined: #ifdef CONFIG_KGDB_KDB void kdb_dump_stack_on_cpu(int cpu) { - if (cpu == raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id() || !IS_ENABLED(CONFIG_SMP)) { dump_stack(); return; } -- cgit v1.2.3-59-g8ed1b From c34c78dfc1fc68a1f5403f996de8ca62f298d7b2 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Wed, 23 Oct 2019 21:27:34 +0800 Subject: audit: remove redundant condition check in kauditd_thread() Warning is found by the code analysis tool: "the condition 'if(ac && rc < 0)' is redundant: ac" The @ac variable has been checked before. It can't be a null pointer here, so remove the redundant condition check. 
Signed-off-by: Yunfeng Ye Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d75485aa25ff..8e09f0f55b4b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -830,7 +830,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; @@ -840,7 +840,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; -- cgit v1.2.3-59-g8ed1b From 53b63136e81220cb2f8b541c03a1df9199896821 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:24 +0100 Subject: kdb: Tidy up code to handle escape sequences kdb_read_get_key() has extremely complex break/continue control flow managed by state variables and is very hard to review or modify. In particular the way the escape sequence handling interacts with the general control flow is hard to follow. Separate out the escape key handling, without changing the control flow. This makes the main body of the code easier to review. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-2-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 128 ++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 3a5184eb6977..cfc054fd8097 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -49,6 +49,65 @@ static int kgdb_transition_check(char *buffer) return 0; } +/** + * kdb_handle_escape() - validity check on an accumulated escape sequence. 
+ * @buf: Accumulated escape characters to be examined. Note that buf + * is not a string, it is an array of characters and need not be + * nil terminated. + * @sz: Number of accumulated escape characters. + * + * Return: -1 if the escape sequence is unwanted, 0 if it is incomplete, + * otherwise it returns a mapped key value to pass to the upper layers. + */ +static int kdb_handle_escape(char *buf, size_t sz) +{ + char *lastkey = buf + sz - 1; + + switch (sz) { + case 1: + if (*lastkey == '\e') + return 0; + break; + + case 2: /* \e */ + if (*lastkey == '[') + return 0; + break; + + case 3: + switch (*lastkey) { + case 'A': /* \e[A, up arrow */ + return 16; + case 'B': /* \e[B, down arrow */ + return 14; + case 'C': /* \e[C, right arrow */ + return 6; + case 'D': /* \e[D, left arrow */ + return 2; + case '1': /* \e[<1,3,4>], may be home, del, end */ + case '3': + case '4': + return 0; + } + break; + + case 4: + if (*lastkey == '~') { + switch (buf[2]) { + case '1': /* \e[1~, home */ + return 1; + case '3': /* \e[3~, del */ + return 4; + case '4': /* \e[4~, end */ + return 5; + } + } + break; + } + + return -1; +} + static int kdb_read_get_key(char *buffer, size_t bufsize) { #define ESCAPE_UDELAY 1000 @@ -102,68 +161,15 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) escape_delay = 2; continue; } - if (ped - escape_data == 1) { - /* \e */ - continue; - } else if (ped - escape_data == 2) { - /* \e */ - if (key != '[') - escape_delay = 2; - continue; - } else if (ped - escape_data == 3) { - /* \e[ */ - int mapkey = 0; - switch (key) { - case 'A': /* \e[A, up arrow */ - mapkey = 16; - break; - case 'B': /* \e[B, down arrow */ - mapkey = 14; - break; - case 'C': /* \e[C, right arrow */ - mapkey = 6; - break; - case 'D': /* \e[D, left arrow */ - mapkey = 2; - break; - case '1': /* dropthrough */ - case '3': /* dropthrough */ - /* \e[<1,3,4>], may be home, del, end */ - case '4': - mapkey = -1; - break; - } - if (mapkey != -1) { - if (mapkey > 0) { - 
escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - } - continue; - } else if (ped - escape_data == 4) { - /* \e[<1,3,4> */ - int mapkey = 0; - if (key == '~') { - switch (escape_data[2]) { - case '1': /* \e[1~, home */ - mapkey = 1; - break; - case '3': /* \e[3~, del */ - mapkey = 4; - break; - case '4': /* \e[4~, end */ - mapkey = 5; - break; - } - } - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - continue; + + key = kdb_handle_escape(escape_data, ped - escape_data); + if (key > 0) { + escape_data[0] = key; + escape_data[1] = '\0'; } + if (key) + escape_delay = 2; + continue; } break; /* A key to process */ } -- cgit v1.2.3-59-g8ed1b From d04213af90935d8b247c1327c9ea142fc037165f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:25 +0100 Subject: kdb: Simplify code to fetch characters from console Currently kdb_read_get_key() contains complex control flow that, on close inspection, turns out to be unnecessary. In particular: 1. It is impossible to enter the branch conditioned on (escape_delay == 1) except when the loop enters with (escape_delay == 2) allowing us to combine the branches. 2. Most of the code conditioned on (escape_delay == 2) simply modifies local data and then breaks out of the loop causing the function to return escape_data[0]. 3. Based on #2 there is not actually any need to ever explicitly set escape_delay to 2 because it is much simpler to directly return escape_data[0] instead. 4. escape_data[0] is, for all but one exit path, known to be '\e'. Simplify the code based on these observations.
There is a subtle (and harmless) change of behaviour resulting from this simplification: instead of letting the escape timeout after ~1998 milliseconds we now timeout after ~2000 milliseconds Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-3-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index cfc054fd8097..a92ceca29637 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -124,25 +124,18 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) touch_nmi_watchdog(); f = &kdb_poll_funcs[0]; } - if (escape_delay == 2) { - *ped = '\0'; - ped = escape_data; - --escape_delay; - } - if (escape_delay == 1) { - key = *ped++; - if (!*ped) - --escape_delay; - break; - } + key = (*f)(); + if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); - --escape_delay; + if (--escape_delay == 0) + return '\e'; } continue; } + if (bufsize <= 2) { if (key == '\r') key = '\n'; @@ -150,27 +143,24 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) *buffer = '\0'; return -1; } + if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; f_escape = f; } if (escape_delay) { - *ped++ = key; - if (f_escape != f) { - escape_delay = 2; - continue; - } + if (f_escape != f) + return '\e'; + *ped++ = key; key = kdb_handle_escape(escape_data, ped - escape_data); - if (key > 0) { - escape_data[0] = key; - escape_data[1] = '\0'; - } - if (key) - escape_delay = 2; - continue; + if (key < 0) + return '\e'; + if (key == 0) + continue; } + break; /* A key to process */ } return key; -- cgit v1.2.3-59-g8ed1b From 4f27e824bf83dfc2f6dc1a54fae419be7cd335af Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:26 +0100 Subject: kdb: Remove special case logic from 
kdb_read() kdb_read() contains special case logic to force it to exit after reading a single character. We can remove all the special case logic by directly calling the function to read a single character instead. This also allows us to tidy up the function prototype which, because it now matches getchar(), we can also rename in order to make its role clearer. This does involve some extra code to handle btaprompt properly but we don't mind the new lines of code here because the old code had some interesting problems (bad newline handling, treating unexpected characters like <cr>). Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-4-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_bt.c | 22 +++++++++------ kernel/debug/kdb/kdb_io.c | 61 +++++++++++++++++++----------------------- kernel/debug/kdb/kdb_private.h | 1 + 3 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 0e94efe07b72..4af48ac53625 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -75,9 +75,10 @@ static void kdb_show_stack(struct task_struct *p, void *addr) static int kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { - char buffer[2]; - if (kdb_getarea(buffer[0], (unsigned long)p) || - kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + char ch; + + if (kdb_getarea(ch, (unsigned long)p) || + kdb_getarea(ch, (unsigned long)(p+1)-1)) return KDB_BADADDR; if (!kdb_task_state(p, mask)) return 0; @@ -85,12 +86,17 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) kdb_ps1(p); kdb_show_stack(p, NULL); if (btaprompt) { - kdb_getstr(buffer, sizeof(buffer), - "Enter <q> to end, <cr> to continue:"); - if (buffer[0] == 'q') { - kdb_printf("\n"); + kdb_printf("Enter <q> to end, <cr> or <space> to continue:"); + do { + ch = kdb_getchar(); + } while (!strchr("\r\n q", ch)); + kdb_printf("\n"); + + /* reset the pager */ + kdb_nextline = 1;
+ + if (ch == 'q') return 1; - } } touch_nmi_watchdog(); return 0; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index a92ceca29637..9b6933d585b5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -108,7 +108,22 @@ static int kdb_handle_escape(char *buf, size_t sz) return -1; } -static int kdb_read_get_key(char *buffer, size_t bufsize) +/** + * kdb_getchar() - Read a single character from a kdb console (or consoles). + * + * Other than polling the various consoles that are currently enabled, + * most of the work done in this function is dealing with escape sequences. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources. Escape sequence processing + * has to be done as states in the polling loop. + * + * Return: The key pressed or a control code derived from an escape sequence. + */ +char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ @@ -126,7 +141,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) } key = (*f)(); - if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); @@ -136,14 +150,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) continue; } - if (bufsize <= 2) { - if (key == '\r') - key = '\n'; - *buffer++ = key; - *buffer = '\0'; - return -1; - } - if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; @@ -184,17 +190,7 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) * function. It is not reentrant - it relies on the fact * that while kdb is running on only one "master debug" cpu. * Remarks: - * - * The buffer size must be >= 2. 
A buffer size of 2 means that the caller only - * wants a single key. - * - * An escape key could be the start of a vt100 control sequence such as \e[D - * (left arrow) or it could be a character in its own right. The standard - * method for detecting the difference is to wait for 2 seconds to see if there - * are any other characters. kdb is complicated by the lack of a timer service - * (interrupts are off), by multiple input sources and by the need to sometimes - * return after just one key. Escape sequence processing has to be done as - * states in the polling loop. + * The buffer size must be >= 2. */ static char *kdb_read(char *buffer, size_t bufsize) @@ -229,9 +225,7 @@ static char *kdb_read(char *buffer, size_t bufsize) *cp = '\0'; kdb_printf("%s", buffer); poll_again: - key = kdb_read_get_key(buffer, bufsize); - if (key == -1) - return buffer; + key = kdb_getchar(); if (key != 9) tab = 0; switch (key) { @@ -742,7 +736,7 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline >= linecount) { - char buf1[16] = ""; + char ch; /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -777,39 +771,38 @@ kdb_printit: if (logging) printk("%s", moreprompt); - kdb_read(buf1, 2); /* '2' indicates to return - * immediately after getting one key. 
*/ + ch = kdb_getchar(); kdb_nextline = 1; /* Really set output line 1 */ /* empty and reset the buffer: */ kdb_buffer[0] = '\0'; next_avail = kdb_buffer; size_avail = sizeof(kdb_buffer); - if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + if ((ch == 'q') || (ch == 'Q')) { /* user hit q or Q */ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ KDB_STATE_CLEAR(PAGER); /* end of command output; back to normal mode */ kdb_grepping_flag = 0; kdb_printf("\n"); - } else if (buf1[0] == ' ') { + } else if (ch == ' ') { kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '\n') { + } else if (ch == '\n' || ch == '\r') { kdb_nextline = linecount - 1; kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '/' && !kdb_grepping_flag) { + } else if (ch == '/' && !kdb_grepping_flag) { kdb_printf("\r"); kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, kdbgetenv("SEARCHPROMPT") ?: "search> "); *strchrnul(kdb_grep_string, '\n') = '\0'; kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] && buf1[0] != '\n') { - /* user hit something other than enter */ + } else if (ch) { + /* user hit something unexpected */ suspend_grep = 1; /* for this recursion */ - if (buf1[0] != '/') + if (ch != '/') kdb_printf( "\nOnly 'q', 'Q' or '/' are processed at " "more prompt, input ignored\n"); diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2118d8258b7c..55d052061ef9 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -210,6 +210,7 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig(struct task_struct *p, int sig); extern void kdb_meminfo_proc_show(void); +extern char kdb_getchar(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); -- cgit v1.2.3-59-g8ed1b From 
cdca8d8900dd33ce6b8b526e247d2a6009d05de0 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:27 +0100 Subject: kdb: Improve handling of characters from different input sources Currently if an escape timer is interrupted by a character from a different input source then the new character is discarded and the function returns '\e' (which will be discarded by the level above). It is hard to see why this would ever be the desired behaviour. Fix this to return the new character rather than the '\e'. This is a bigger refactor than might be expected because the new character needs to go through escape sequence detection. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-5-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b6933d585b5..f794c0ca4557 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -127,10 +127,10 @@ char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ - char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ - char *ped = escape_data; + char buf[4]; /* longest vt100 escape sequence is 4 bytes */ + char *pbuf = buf; int escape_delay = 0; - get_char_func *f, *f_escape = NULL; + get_char_func *f, *f_prev = NULL; int key; for (f = &kdb_poll_funcs[0]; ; ++f) { @@ -150,26 +150,26 @@ char kdb_getchar(void) continue; } - if (escape_delay == 0 && key == '\e') { + /* + * When the first character is received (or we get a change + * input source) we set ourselves up to handle an escape + * sequences (just in case). 
+ */ + if (f_prev != f) { + f_prev = f; + pbuf = buf; escape_delay = ESCAPE_DELAY; - ped = escape_data; - f_escape = f; - } - if (escape_delay) { - if (f_escape != f) - return '\e'; - - *ped++ = key; - key = kdb_handle_escape(escape_data, ped - escape_data); - if (key < 0) - return '\e'; - if (key == 0) - continue; } - break; /* A key to process */ + *pbuf++ = key; + key = kdb_handle_escape(buf, pbuf - buf); + if (key < 0) /* no escape sequence; return first character */ + return buf[0]; + if (key > 0) + return key; } - return key; + + unreachable(); } /* -- cgit v1.2.3-59-g8ed1b From c58ff643763c78bef12874ee39995c9f7f987bc2 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:28 +0100 Subject: kdb: Tweak escape handling for vi users Currently if sequences such as "\ehelp\r" are delivered to the console then the h gets eaten by the escape handling code. Since pressing escape becomes something of a nervous twitch for vi users (and that escape doesn't have much effect at a shell prompt) it is more helpful to emit the 'h' than the '\e'. We don't simply choose to emit the final character for all escape sequences since that will do odd things for unsupported escape sequences (in other words we retain the existing behaviour once we see '\e['). Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-6-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index f794c0ca4557..8bcdded5d61f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -163,8 +163,8 @@ char kdb_getchar(void) *pbuf++ = key; key = kdb_handle_escape(buf, pbuf - buf); - if (key < 0) /* no escape sequence; return first character */ - return buf[0]; + if (key < 0) /* no escape sequence; return best character */ + return buf[pbuf - buf == 2 ? 
1 : 0]; if (key > 0) return key; } -- cgit v1.2.3-59-g8ed1b From c3a6cf19e695c8b0a9bf8b5933f863e12d878b7c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 18 Oct 2019 10:31:43 +0100 Subject: export: avoid code duplication in include/linux/export.h include/linux/export.h has lots of code duplication between EXPORT_SYMBOL and EXPORT_SYMBOL_NS. To improve the maintainability and readability, unify the implementation. When the symbol has no namespace, pass the empty string "" to the 'ns' parameter. The drawback of this change is, it grows the code size. When the symbol has no namespace, sym->namespace was previously NULL, but it is now an empty string "". So, it increases 1 byte for every no namespace EXPORT_SYMBOL. A typical kernel configuration has 10K exported symbols, so it increases 10KB in rough estimation. I did not come up with a good idea to refactor it without increasing the code size. I am not sure how big a deal it is, but at least include/linux/export.h looks nicer. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Masahiro Yamada [maennich: rebase on top of 3 fixes for the namespace feature] Signed-off-by: Matthias Maennich Signed-off-by: Jessica Yu --- include/linux/export.h | 91 ++++++++++++++++---------------------------------- kernel/module.c | 2 +- 2 files changed, 29 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/export.h b/include/linux/export.h index 941d075f03d6..201262793369 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -46,7 +46,7 @@ extern struct module __this_module; * absolute relocations that require runtime processing on relocatable * kernels. */ -#define __KSYMTAB_ENTRY_NS(sym, sec) \ +#define __KSYMTAB_ENTRY(sym, sec) \ __ADDRESSABLE(sym) \ asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ " .balign 4 \n" \ @@ -56,33 +56,17 @@ extern struct module __this_module; " .long __kstrtabns_" #sym "- . 
\n" \ " .previous \n") -#define __KSYMTAB_ENTRY(sym, sec) \ - __ADDRESSABLE(sym) \ - asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ - " .balign 4 \n" \ - "__ksymtab_" #sym ": \n" \ - " .long " #sym "- . \n" \ - " .long __kstrtab_" #sym "- . \n" \ - " .long 0 \n" \ - " .previous \n") - struct kernel_symbol { int value_offset; int name_offset; int namespace_offset; }; #else -#define __KSYMTAB_ENTRY_NS(sym, sec) \ - static const struct kernel_symbol __ksymtab_##sym \ - __attribute__((section("___ksymtab" sec "+" #sym), used)) \ - __aligned(sizeof(void *)) \ - = { (unsigned long)&sym, __kstrtab_##sym, __kstrtabns_##sym } - #define __KSYMTAB_ENTRY(sym, sec) \ static const struct kernel_symbol __ksymtab_##sym \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ - = { (unsigned long)&sym, __kstrtab_##sym, NULL } + = { (unsigned long)&sym, __kstrtab_##sym, __kstrtabns_##sym } struct kernel_symbol { unsigned long value; @@ -93,28 +77,20 @@ struct kernel_symbol { #ifdef __GENKSYMS__ -#define ___EXPORT_SYMBOL(sym,sec) __GENKSYMS_EXPORT_SYMBOL(sym) -#define ___EXPORT_SYMBOL_NS(sym,sec,ns) __GENKSYMS_EXPORT_SYMBOL(sym) +#define ___EXPORT_SYMBOL(sym, sec, ns) __GENKSYMS_EXPORT_SYMBOL(sym) #else -#define ___export_symbol_common(sym, sec) \ +/* For every exported symbol, place a struct in the __ksymtab section */ +#define ___EXPORT_SYMBOL(sym, sec, ns) \ extern typeof(sym) sym; \ __CRC_SYMBOL(sym, sec); \ static const char __kstrtab_##sym[] \ __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ - = #sym \ - -/* For every exported symbol, place a struct in the __ksymtab section */ -#define ___EXPORT_SYMBOL_NS(sym, sec, ns) \ - ___export_symbol_common(sym, sec); \ + = #sym; \ static const char __kstrtabns_##sym[] \ __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ - = #ns; \ - __KSYMTAB_ENTRY_NS(sym, sec) - -#define ___EXPORT_SYMBOL(sym, sec) \ - ___export_symbol_common(sym, sec); \ + = ns; \ 
__KSYMTAB_ENTRY(sym, sec) #endif @@ -126,8 +102,7 @@ struct kernel_symbol { * be reused in other execution contexts such as the UEFI stub or the * decompressor. */ -#define __EXPORT_SYMBOL_NS(sym, sec, ns) -#define __EXPORT_SYMBOL(sym, sec) +#define __EXPORT_SYMBOL(sym, sec, ns) #elif defined(CONFIG_TRIM_UNUSED_KSYMS) @@ -143,48 +118,38 @@ struct kernel_symbol { #define __ksym_marker(sym) \ static int __ksym_marker_##sym[0] __section(".discard.ksym") __used -#define __EXPORT_SYMBOL(sym, sec) \ - __ksym_marker(sym); \ - __cond_export_sym(sym, sec, __is_defined(__KSYM_##sym)) -#define __cond_export_sym(sym, sec, conf) \ - ___cond_export_sym(sym, sec, conf) -#define ___cond_export_sym(sym, sec, enabled) \ - __cond_export_sym_##enabled(sym, sec) -#define __cond_export_sym_1(sym, sec) ___EXPORT_SYMBOL(sym, sec) -#define __cond_export_sym_0(sym, sec) /* nothing */ - -#define __EXPORT_SYMBOL_NS(sym, sec, ns) \ +#define __EXPORT_SYMBOL(sym, sec, ns) \ __ksym_marker(sym); \ - __cond_export_ns_sym(sym, sec, ns, __is_defined(__KSYM_##sym)) -#define __cond_export_ns_sym(sym, sec, ns, conf) \ - ___cond_export_ns_sym(sym, sec, ns, conf) -#define ___cond_export_ns_sym(sym, sec, ns, enabled) \ - __cond_export_ns_sym_##enabled(sym, sec, ns) -#define __cond_export_ns_sym_1(sym, sec, ns) ___EXPORT_SYMBOL_NS(sym, sec, ns) -#define __cond_export_ns_sym_0(sym, sec, ns) /* nothing */ + __cond_export_sym(sym, sec, ns, __is_defined(__KSYM_##sym)) +#define __cond_export_sym(sym, sec, ns, conf) \ + ___cond_export_sym(sym, sec, ns, conf) +#define ___cond_export_sym(sym, sec, ns, enabled) \ + __cond_export_sym_##enabled(sym, sec, ns) +#define __cond_export_sym_1(sym, sec, ns) ___EXPORT_SYMBOL(sym, sec, ns) +#define __cond_export_sym_0(sym, sec, ns) /* nothing */ #else -#define __EXPORT_SYMBOL_NS(sym,sec,ns) ___EXPORT_SYMBOL_NS(sym,sec,ns) -#define __EXPORT_SYMBOL(sym,sec) ___EXPORT_SYMBOL(sym,sec) +#define __EXPORT_SYMBOL(sym, sec, ns) ___EXPORT_SYMBOL(sym, sec, ns) #endif /* CONFIG_MODULES */ 
#ifdef DEFAULT_SYMBOL_NAMESPACE -#undef __EXPORT_SYMBOL -#define __EXPORT_SYMBOL(sym, sec) \ - __EXPORT_SYMBOL_NS(sym, sec, DEFAULT_SYMBOL_NAMESPACE) +#include <linux/stringify.h> +#define _EXPORT_SYMBOL(sym, sec) __EXPORT_SYMBOL(sym, sec, __stringify(DEFAULT_SYMBOL_NAMESPACE)) +#else +#define _EXPORT_SYMBOL(sym, sec) __EXPORT_SYMBOL(sym, sec, "") #endif -#define EXPORT_SYMBOL(sym) __EXPORT_SYMBOL(sym, "") -#define EXPORT_SYMBOL_GPL(sym) __EXPORT_SYMBOL(sym, "_gpl") -#define EXPORT_SYMBOL_GPL_FUTURE(sym) __EXPORT_SYMBOL(sym, "_gpl_future") -#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL_NS(sym, "", ns) -#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL_NS(sym, "_gpl", ns) +#define EXPORT_SYMBOL(sym) _EXPORT_SYMBOL(sym, "") +#define EXPORT_SYMBOL_GPL(sym) _EXPORT_SYMBOL(sym, "_gpl") +#define EXPORT_SYMBOL_GPL_FUTURE(sym) _EXPORT_SYMBOL(sym, "_gpl_future") +#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", #ns) +#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "_gpl", #ns) #ifdef CONFIG_UNUSED_SYMBOLS -#define EXPORT_UNUSED_SYMBOL(sym) __EXPORT_SYMBOL(sym, "_unused") -#define EXPORT_UNUSED_SYMBOL_GPL(sym) __EXPORT_SYMBOL(sym, "_unused_gpl") +#define EXPORT_UNUSED_SYMBOL(sym) _EXPORT_SYMBOL(sym, "_unused") +#define EXPORT_UNUSED_SYMBOL_GPL(sym) _EXPORT_SYMBOL(sym, "_unused_gpl") #else #define EXPORT_UNUSED_SYMBOL(sym) #define EXPORT_UNUSED_SYMBOL_GPL(sym) diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..26c13173da3d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1400,7 +1400,7 @@ static int verify_namespace_is_imported(const struct load_info *info, char *imported_namespace; namespace = kernel_symbol_namespace(sym); - if (namespace) { + if (namespace && namespace[0]) { imported_namespace = get_modinfo(info, "import_ns"); while (imported_namespace) { if (strcmp(namespace, imported_namespace) == 0) -- cgit v1.2.3-59-g8ed1b From a445e940ea686fc60475564009821010eb213be3 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 30 Oct 2019 
10:13:13 +0000 Subject: dma-mapping: fix handling of dma-ranges for reserved memory (again) Daniele reported that the issue previously fixed in c41f9ea998f3 ("drivers: dma-coherent: Account dma_pfn_offset when used with device tree") reappeared shortly after 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool"), where the fix was accidentally dropped. Let's put the fix back in place and respect dma-ranges for reserved memory. Fixes: 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool") Reported-by: Daniele Alessandrelli Tested-by: Daniele Alessandrelli Tested-by: Alexandre Torgue Signed-off-by: Vladimir Murzin Signed-off-by: Christoph Hellwig --- arch/arm/mm/dma-mapping-nommu.c | 2 +- include/linux/dma-mapping.h | 4 ++-- kernel/dma/coherent.c | 16 +++++++++------- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c index db9247898300..287ef898a55e 100644 --- a/arch/arm/mm/dma-mapping-nommu.c +++ b/arch/arm/mm/dma-mapping-nommu.c @@ -35,7 +35,7 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t size, unsigned long attrs) { - void *ret = dma_alloc_from_global_coherent(size, dma_handle); + void *ret = dma_alloc_from_global_coherent(dev, size, dma_handle); /* * dma_alloc_from_global_coherent() may fail because: diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4a1c4fca475a..10918c55003f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -162,7 +162,7 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle); +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle); int dma_release_from_global_coherent(int order, void *vaddr); int 
dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); @@ -172,7 +172,7 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) -static inline void *dma_alloc_from_global_coherent(ssize_t size, +static inline void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle) { return NULL; diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 545e3869b0e3..551b0eb7028a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -123,8 +123,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, return ret; } -static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, - ssize_t size, dma_addr_t *dma_handle) +static void *__dma_alloc_from_coherent(struct device *dev, + struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) { int order = get_order(size); unsigned long flags; @@ -143,7 +144,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, /* * Memory was found in the coherent area. 
*/ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT); ret = mem->virt_base + (pageno << PAGE_SHIFT); spin_unlock_irqrestore(&mem->spinlock, flags); memset(ret, 0, size); @@ -175,17 +176,18 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, if (!mem) return 0; - *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle); return 1; } -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle) { if (!dma_coherent_default_memory) return NULL; - return __dma_alloc_from_coherent(dma_coherent_default_memory, size, - dma_handle); + return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, + dma_handle); } static int __dma_release_from_coherent(struct dma_coherent_mem *mem, -- cgit v1.2.3-59-g8ed1b From 9ff6aa027dbb98755f0265695354f2dd07c0d1ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Oct 2019 14:56:46 -0700 Subject: dma-debug: add a schedule point in debug_dma_dump_mappings() debug_dma_dump_mappings() can take a lot of cpu cycles : lpk43:/# time wc -l /sys/kernel/debug/dma-api/dump 163435 /sys/kernel/debug/dma-api/dump real 0m0.463s user 0m0.003s sys 0m0.459s Let's add a cond_resched() to avoid holding cpu for too long. 
Signed-off-by: Eric Dumazet Cc: Corentin Labbe Cc: Christoph Hellwig Cc: Marek Szyprowski Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 099002d84f46..4ad74f5987ea 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -420,6 +420,7 @@ void debug_dma_dump_mappings(struct device *dev) } spin_unlock_irqrestore(&bucket->lock, flags); + cond_resched(); } } -- cgit v1.2.3-59-g8ed1b From ca66536845cd55c6a5fccd82694dcc87ed970780 Mon Sep 17 00:00:00 2001 From: Shyam Saini Date: Sun, 20 Oct 2019 10:33:22 +0530 Subject: kernel: dma-contiguous: mark CMA parameters __initdata/__initconst These parameters are only referenced by __init routine calls during early boot so they should be marked as __initdata and __initconst accordingly. Signed-off-by: Shyam Saini Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 69cfb4345388..daa4e6eefdde 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,10 +42,11 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. 
*/ -static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; -static phys_addr_t size_cmdline = -1; -static phys_addr_t base_cmdline; -static phys_addr_t limit_cmdline; +static const phys_addr_t size_bytes __initconst = + (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +static phys_addr_t size_cmdline __initdata = -1; +static phys_addr_t base_cmdline __initdata; +static phys_addr_t limit_cmdline __initdata; static int __init early_cma(char *p) { -- cgit v1.2.3-59-g8ed1b From f94df9890e98f2090c6a8d70c795134863b70201 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Sep 2019 16:07:45 +0100 Subject: Add wake_up_interruptible_sync_poll_locked() Add a wakeup call for a case whereby the caller already has the waitqueue spinlock held. This can be used by pipes to alter the ring buffer indices and issue a wakeup under the same spinlock. Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- include/linux/wait.h | 3 +++ kernel/sched/wait.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index bb7676d396cd..3283c8d02137 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -202,6 +202,7 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); +void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); @@ -229,6 +230,8 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), 
TASK_INTERRUPTIBLE, poll_to_key(m)) +#define wake_up_interruptible_sync_poll_locked(x, m) \ + __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define ___wait_cond_timeout(condition) \ ({ \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b4b52361dab7..ba059fbfc53a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -191,6 +191,29 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL_GPL(__wake_up_sync_key); +/** + * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue. + * @wq_head: the waitqueue + * @mode: which threads + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. 
+ */ +void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, + unsigned int mode, void *key) +{ + __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); + /* * __wake_up_sync - see __wake_up_sync_key() */ -- cgit v1.2.3-59-g8ed1b From f973cce0e4022fa8c969ca5b0c71559125382eb2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 21 Oct 2019 22:38:29 +0200 Subject: kexec: Fix pointer-to-int-cast warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two pointer-to-int-cast warnings when compiling for the 32-bit parisc platform: kernel/kexec_file.c: In function ‘crash_prepare_elf64_headers’: kernel/kexec_file.c:1307:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (Elf64_Addr)_text; ^ kernel/kexec_file.c:1324:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (unsigned long long) __va(mstart); ^ Signed-off-by: Helge Deller --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 79f252af7dee..a2df93948665 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1304,7 +1304,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, if (kernel_map) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_vaddr = (unsigned long) _text; phdr->p_filesz = phdr->p_memsz = _end - _text; phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); ehdr->e_phnum++; @@ -1321,7 +1321,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_offset = mstart; phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_vaddr = (unsigned long) __va(mstart); phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; -- cgit 
v1.2.3-59-g8ed1b From 7162431dcf72032835d369c8d7b51311df407938 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 16 Oct 2019 13:33:13 +0200 Subject: ftrace: Introduce PERMANENT ftrace_ops flag Livepatch uses ftrace for redirection to new patched functions. It means that if ftrace is disabled, all live patched functions are disabled as well. Toggling global 'ftrace_enabled' sysctl thus affects it directly. It is not a problem per se, because only an administrator can set sysctl values, but it still may be surprising. Introduce PERMANENT ftrace_ops flag to amend this. If the FTRACE_OPS_FL_PERMANENT is set on any ftrace ops, the tracing cannot be disabled by disabling ftrace_enabled. Equally, a callback with the flag set cannot be registered if ftrace_enabled is disabled. Link: http://lkml.kernel.org/r/20191016113316.13415-2-mbenes@suse.cz Reviewed-by: Petr Mladek Reviewed-by: Kamalesh Babulal Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 8 ++++++++ Documentation/trace/ftrace.rst | 4 +++- include/linux/ftrace.h | 3 +++ kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 5 files changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 1fbc69894eed..740bd0224d35 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -170,6 +170,14 @@ FTRACE_OPS_FL_RCU a callback may be executed and RCU synchronization will not protect it. +FTRACE_OPS_FL_PERMANENT + If this is set on any ftrace ops, then the tracing cannot be disabled by + writing 0 to the proc sysctl ftrace_enabled. Equally, a callback with + the flag set cannot be registered if ftrace_enabled is 0. + + Livepatch uses it not to lose the function redirection, so the system + stays protected. 
+ Filtering which functions to trace ================================== diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index e3060eedb22d..d2b5657ed33e 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -2976,7 +2976,9 @@ Note, the proc sysctl ftrace_enable is a big on/off switch for the function tracer. By default it is enabled (when function tracing is enabled in the kernel). If it is disabled, all function tracing is disabled. This includes not only the function tracers for ftrace, but -also for any other uses (perf, kprobes, stack tracing, profiling, etc). +also for any other uses (perf, kprobes, stack tracing, profiling, etc). It +cannot be disabled if there is a callback with FTRACE_OPS_FL_PERMANENT set +registered. Please disable this with care. diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8a8cb3c401b2..8385cafe4f9f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -142,6 +142,8 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * PID - Is affected by set_ftrace_pid (allows filtering on those pids) * RCU - Set when the ops can only be called when RCU is watching. * TRACE_ARRAY - The ops->private points to a trace_array descriptor. + * PERMANENT - Set when the ops is permanent and should not be affected by + * ftrace_enabled. 
*/ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -160,6 +162,7 @@ enum { FTRACE_OPS_FL_PID = 1 << 13, FTRACE_OPS_FL_RCU = 1 << 14, FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, + FTRACE_OPS_FL_PERMANENT = 1 << 16, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index bd43537702bd..b552cf2d85f8 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func) ops->fops.func = klp_ftrace_handler; ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; + FTRACE_OPS_FL_IPMODIFY | + FTRACE_OPS_FL_PERMANENT; list_add(&ops->node, &klp_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..89e9128652ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,6 +326,8 @@ int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) ops->flags |= FTRACE_OPS_FL_SAVE_REGS; #endif + if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) + return -EBUSY; if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -6754,6 +6756,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static bool is_permanent_ops_registered(void) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PERMANENT) + return true; + } while_for_each_ftrace_op(op); + + return false; +} + int ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -6771,8 +6785,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = !!ftrace_enabled; - if (ftrace_enabled) { /* we are starting ftrace again */ @@ -6783,12 +6795,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, 
ftrace_startup_sysctl(); } else { + if (is_permanent_ops_registered()) { + ftrace_enabled = true; + ret = -EBUSY; + goto out; + } + /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; ftrace_shutdown_sysctl(); } + last_ftrace_enabled = !!ftrace_enabled; out: mutex_unlock(&ftrace_lock); return ret; -- cgit v1.2.3-59-g8ed1b From 56144737e67329c9aaed15f942d46a6302e2e3d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2019 09:48:04 -0800 Subject: hrtimer: Annotate lockless access to timer->state syzbot reported various data-race caused by hrtimer_is_queued() reading timer->state. A READ_ONCE() is required there to silence the warning. Also add the corresponding WRITE_ONCE() when timer->state is set. In remove_hrtimer() the hrtimer_is_queued() helper is open coded to avoid loading timer->state twice. KCSAN reported these cases: BUG: KCSAN: data-race in __remove_hrtimer / tcp_pacing_check write to 0xffff8880b2a7d388 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 run_ksoftirqd+0x46/0x60 kernel/softirq.c:603 smpboot_thread_fn+0x37d/0x4a0 kernel/smpboot.c:165 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff8880b2a7d388 of 1 bytes by task 24652 on cpu 1: tcp_pacing_check net/ipv4/tcp_output.c:2235 [inline] tcp_pacing_check+0xba/0x130 net/ipv4/tcp_output.c:2225 tcp_xmit_retransmit_queue+0x32c/0x5a0 net/ipv4/tcp_output.c:3044 tcp_xmit_recovery+0x7c/0x120 net/ipv4/tcp_input.c:3558 tcp_ack+0x17b6/0x3170 net/ipv4/tcp_input.c:3717 tcp_rcv_established+0x37e/0xf50 net/ipv4/tcp_input.c:5696 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] 
__release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 BUG: KCSAN: data-race in __remove_hrtimer / __tcp_ack_snd_check write to 0xffff8880a3a65588 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 read to 0xffff8880a3a65588 of 1 bytes by task 22891 on cpu 1: __tcp_ack_snd_check+0x415/0x4f0 net/ipv4/tcp_input.c:5265 tcp_ack_snd_check net/ipv4/tcp_input.c:5287 [inline] tcp_rcv_established+0x750/0xf50 net/ipv4/tcp_input.c:5708 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 __sys_sendto+0x21f/0x320 net/socket.c:1952 __do_sys_sendto net/socket.c:1964 [inline] __se_sys_sendto net/socket.c:1960 [inline] __x64_sys_sendto+0x89/0xb0 net/socket.c:1960 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 Reported by Kernel Concurrency 
Sanitizer on: CPU: 1 PID: 24652 Comm: syz-executor.3 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 [ tglx: Added comments ] Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191106174804.74723-1-edumazet@google.com --- include/linux/hrtimer.h | 14 ++++++++++---- kernel/time/hrtimer.c | 11 +++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1b9a51a1bccb..1f98b52118f0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -456,12 +456,18 @@ extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); -/* - * Helper function to check, whether the timer is on one of the queues +/** + * hrtimer_is_queued - check, whether the timer is on one of the queues + * @timer: Timer to check + * + * Returns: True if the timer is queued, false otherwise + * + * The function can be used lockless, but it gives only a current snapshot. 
*/ -static inline int hrtimer_is_queued(struct hrtimer *timer) +static inline bool hrtimer_is_queued(struct hrtimer *timer) { - return timer->state & HRTIMER_STATE_ENQUEUED; + /* The READ_ONCE pairs with the update functions of timer->state */ + return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); } /* diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..7f31932216a1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -966,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -988,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -1013,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { int reprogram; /* -- cgit v1.2.3-59-g8ed1b From 3f6ec871e1c2b360aaf022e90bb99dcc016b3874 Mon Sep 17 00:00:00 2001 From: Amit Kucheria Date: Mon, 21 Oct 2019 17:45:12 +0530 Subject: cpufreq: Initialize the governors in core_initcall Initialize the cpufreq governors earlier to allow for earlier performance control during the boot process. Signed-off-by: Amit Kucheria Acked-by: Viresh Kumar Reviewed-by: Rafael J. 
Wysocki Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/b98eae9b44eb2f034d7f5d12a161f5f831be1eb7.1571656015.git.amit.kucheria@linaro.org --- drivers/cpufreq/cpufreq_conservative.c | 2 +- drivers/cpufreq/cpufreq_ondemand.c | 2 +- drivers/cpufreq/cpufreq_performance.c | 2 +- drivers/cpufreq/cpufreq_powersave.c | 2 +- drivers/cpufreq/cpufreq_userspace.c | 2 +- kernel/sched/cpufreq_schedutil.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index b66e81c06a57..737ff3b9c2c0 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -346,7 +346,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) return CPU_FREQ_GOV_CONSERVATIVE; } -fs_initcall(cpufreq_gov_dbs_init); +core_initcall(cpufreq_gov_dbs_init); #else module_init(cpufreq_gov_dbs_init); #endif diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index dced033875bf..82a4d37ddecb 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -483,7 +483,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) return CPU_FREQ_GOV_ONDEMAND; } -fs_initcall(cpufreq_gov_dbs_init); +core_initcall(cpufreq_gov_dbs_init); #else module_init(cpufreq_gov_dbs_init); #endif diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index aaa04dfcacd9..def9afe0f5b8 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -50,5 +50,5 @@ MODULE_AUTHOR("Dominik Brodowski "); MODULE_DESCRIPTION("CPUfreq policy governor 'performance'"); MODULE_LICENSE("GPL"); -fs_initcall(cpufreq_gov_performance_init); +core_initcall(cpufreq_gov_performance_init); module_exit(cpufreq_gov_performance_exit); diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index c143dc237d87..1ae66019eb83 100644 
--- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -43,7 +43,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) return &cpufreq_gov_powersave; } -fs_initcall(cpufreq_gov_powersave_init); +core_initcall(cpufreq_gov_powersave_init); #else module_init(cpufreq_gov_powersave_init); #endif diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index cbd81c58cb8f..b43e7cd502c5 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -147,7 +147,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) return &cpufreq_gov_userspace; } -fs_initcall(cpufreq_gov_userspace_init); +core_initcall(cpufreq_gov_userspace_init); #else module_init(cpufreq_gov_userspace_init); #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 86800b4d5453..322ca8860f54 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -915,7 +915,7 @@ static int __init sugov_register(void) { return cpufreq_register_governor(&schedutil_gov); } -fs_initcall(sugov_register); +core_initcall(sugov_register); #ifdef CONFIG_ENERGY_MODEL extern bool sched_energy_update; -- cgit v1.2.3-59-g8ed1b From acaade1af3587132e7ea585f470a95261e14f60c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:09 +0100 Subject: dma-direct: remove __dma_direct_free_pages We can just call dma_free_contiguous directly instead of wrapping it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- include/linux/dma-direct.h | 1 - kernel/dma/direct.c | 11 +++-------- kernel/dma/remap.c | 4 ++-- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index adf993a3bd58..dec3b3bb121d 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -68,6 +68,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8402b29c280f..a7a2739fb747 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -153,7 +153,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. 
*/ dev_info(dev, "Rejecting highmem page from CMA.\n"); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return NULL; } @@ -175,11 +175,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, return ret; } -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) -{ - dma_free_contiguous(dev, page, size); -} - void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { @@ -188,7 +183,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, cpu_addr); + dma_free_contiguous(dev, cpu_addr, size); return; } @@ -198,7 +193,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && dma_alloc_need_uncached(dev, attrs)) cpu_addr = cached_kernel_address(cpu_addr); - __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); + dma_free_contiguous(dev, virt_to_page(cpu_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index c00b9258fa6a..fb1e50c2d48a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -238,7 +238,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return ret; } @@ -256,7 +256,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, struct page *page = pfn_to_page(__phys_to_pfn(phys)); vunmap(vaddr); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); } } -- cgit v1.2.3-59-g8ed1b From 4e1003aa56a7d60ddb048e43a7a51368fcfe36af Mon Sep 17 00:00:00 2001 
From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:32 +0100 Subject: dma-direct: remove the dma_handle argument to __dma_direct_alloc_pages The argument isn't used anywhere, so stop passing it. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- include/linux/dma-direct.h | 2 +- kernel/dma/direct.c | 4 ++-- kernel/dma/remap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index dec3b3bb121d..ff3d5edc44b9 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -67,6 +67,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); + gfp_t gfp, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a7a2739fb747..724c282dd943 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -83,7 +83,7 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) + gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); @@ -131,7 +131,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index fb1e50c2d48a..90d5ce77c189 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -226,7 +226,7 @@ void *arch_dma_alloc(struct device *dev, size_t 
size, dma_addr_t *dma_handle, goto done; } - page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); + page = __dma_direct_alloc_pages(dev, size, flags, attrs); if (!page) return NULL; -- cgit v1.2.3-59-g8ed1b From 714641c3670cdc75371a7ff5bdfd5e9a170c7ffd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:25:46 -0500 Subject: ftrace: Separate out the copying of a ftrace_hash from __ftrace_hash_move() Most of the functionality of __ftrace_hash_move() can be reused, but not all of it. That is, __ftrace_hash_move() is used to simply make a new hash from an existing one, using the same size as the original. Creating a dup_hash(), where we can specify a new size will be useful when we want to create a hash with a default size, or simply copy the old one. Signed-off-by: Steven Rostedt (VMWare) --- kernel/trace/ftrace.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89e9128652ef..76e5de8c7822 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1372,23 +1372,15 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static struct ftrace_hash * -__ftrace_hash_move(struct ftrace_hash *src) +static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) { struct ftrace_func_entry *entry; - struct hlist_node *tn; - struct hlist_head *hhd; struct ftrace_hash *new_hash; - int size = src->count; + struct hlist_head *hhd; + struct hlist_node *tn; int bits = 0; int i; - /* - * If the new source is empty, just return the empty_hash. 
- */ - if (ftrace_hash_empty(src)) - return EMPTY_HASH; - /* * Make the hash size about 1/2 the # found */ @@ -1413,10 +1405,23 @@ __ftrace_hash_move(struct ftrace_hash *src) __add_hash_entry(new_hash, entry); } } - return new_hash; } +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) +{ + int size = src->count; + + /* + * If the new source is empty, just return the empty_hash. + */ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return dup_hash(src, size); +} + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) -- cgit v1.2.3-59-g8ed1b From 7e16f581a81759bafea04d049134b32d1a881226 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:26:46 -0500 Subject: ftrace: Separate out functionality from ftrace_location_range() Create a new function called lookup_rec() from the functionality of ftrace_location_range(). The difference is that lookup_rec() returns the record that it finds, whereas ftrace_location_range() returns only whether it found a match or not. The lookup_rec() function is static, and can be used for new functionality where ftrace needs to find a record of a specific address.
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76e5de8c7822..b0e7f03919de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,26 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } +static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec = NULL; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + } + return rec; +} + /** * ftrace_location_range - return the first address of a traced location * if it touches the given ip range @@ -1555,23 +1575,11 @@ static int ftrace_cmp_recs(const void *a, const void *b) */ unsigned long ftrace_location_range(unsigned long start, unsigned long end) { - struct ftrace_page *pg; struct dyn_ftrace *rec; - struct dyn_ftrace key; - key.ip = start; - key.flags = end; /* overload flags, as it is unsigned long */ - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || - start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) - continue; - rec = bsearch(&key, pg->records, pg->index, - sizeof(struct dyn_ftrace), - ftrace_cmp_recs); - if (rec) - return rec->ip; - } + rec = lookup_rec(start, end); + if (rec) + return rec->ip; return 0; } -- cgit v1.2.3-59-g8ed1b From 153bedbac2ebd475e1c7c2d2fa0c042f5525927d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:55 +0100 Subject: irq_work: Convert flags to atomic_t We need to convert flags to atomic_t in 
order to later fix an ordering issue on atomic_cmpxchg() failure. This will allow us to use atomic_fetch_or(). Also clarify the nature of those flags. [ mingo: Converted two more usage sites that the original patch missed. ] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/irq_work.h | 10 +++++++--- kernel/bpf/stackmap.c | 2 +- kernel/irq_work.c | 18 +++++++++--------- kernel/printk/printk.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 5 files changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index b11fcdfd0770..02da997ad12c 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -22,7 +22,7 @@ #define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY) struct irq_work { - unsigned long flags; + atomic_t flags; struct llist_node llnode; void (*func)(struct irq_work *); }; @@ -30,11 +30,15 @@ struct irq_work { static inline void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) { - work->flags = 0; + atomic_set(&work->flags, 0); work->func = func; } -#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } +#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \ + .flags = ATOMIC_INIT(0), \ + .func = (_f) \ +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..4d31284095e2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (in_nmi()) { work = this_cpu_ptr(&up_read_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */
irq_work_busy = true; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index d42acaf81886..df0dbf4d859b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, oflags, nflags; + int flags, oflags, nflags; /* * Start with our best wish as a premise but only trust any * flag value after cmpxchg() result. */ - flags = work->flags & ~IRQ_WORK_PENDING; + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; for (;;) { nflags = flags | IRQ_WORK_CLAIMED; - oflags = cmpxchg(&work->flags, flags, nflags); + oflags = atomic_cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; if (oflags & IRQ_WORK_PENDING) @@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { + if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); @@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - unsigned long flags; + int flags; BUG_ON(!irqs_disabled()); @@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = work->flags & ~IRQ_WORK_PENDING; - xchg(&work->flags, flags); + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; + atomic_xchg(&work->flags, flags); work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ - (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } @@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (work->flags & IRQ_WORK_BUSY) + while (atomic_read(&work->flags) & IRQ_WORK_BUSY) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ca65327a6de8..865727373a3b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, + .flags = ATOMIC_INIT(IRQ_WORK_LAZY), }; void wake_up_klogd(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 44bd08f2443b..ff467a4e2639 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -660,7 +660,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) return -EBUSY; /* Add the current task, which is the target of sending signal, -- cgit v1.2.3-59-g8ed1b From 25269871db1ad0cbbaafd5098cbdb40c8db4ccb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:56 +0100 Subject: irq_work: Fix irq_work_claim() memory ordering When irq_work_claim() finds IRQ_WORK_PENDING flag already set, we just return and don't raise a new IPI. We expect the destination to see and handle our latest updates thanks to the pairing atomic_xchg() in irq_work_run_list(). But cmpxchg() doesn't guarantee a full memory barrier upon failure. So it's possible that the destination misses our latest updates. So use atomic_fetch_or() instead, which is unconditionally fully ordered, performs exactly what we want here, and simplifies the code.
Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index df0dbf4d859b..255454a48346 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - int flags, oflags, nflags; + int oflags; + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* - * Start with our best wish as a premise but only trust any - * flag value after cmpxchg() result. + * If the work is already pending, no need to raise the IPI. + * The pairing atomic_xchg() in irq_work_run() makes sure + * everything we did before is visible. */ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - for (;;) { - nflags = flags | IRQ_WORK_CLAIMED; - oflags = atomic_cmpxchg(&work->flags, flags, nflags); - if (oflags == flags) - break; - if (oflags & IRQ_WORK_PENDING) - return false; - flags = oflags; - cpu_relax(); - } - + if (oflags & IRQ_WORK_PENDING) + return false; return true; } -- cgit v1.2.3-59-g8ed1b From feb4a51323babe13315c3b783ea7f1cf25368918 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:57 +0100 Subject: irq_work: Slightly simplify IRQ_WORK_PENDING clearing Instead of fetching the value of flags and performing an xchg() to clear a bit, just use atomic_fetch_andnot(), which is better suited to do that job in one operation while keeping the full ordering. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E .
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-4-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 255454a48346..49c53f80a13a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work) oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_xchg() in irq_work_run() makes sure + * The pairing atomic_fetch_andnot() in irq_work_run() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - int flags; BUG_ON(!irqs_disabled()); @@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list) llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, llnode) { + int flags; /* * Clear the PENDING bit, after this point the @work * can be re-used. @@ -151,8 +151,7 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - atomic_xchg(&work->flags, flags); + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); work->func(work); /* -- cgit v1.2.3-59-g8ed1b From 34dc0ea6bc960f1f57b2148f01a3f4da23f87013 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:01:37 +0100 Subject: dma-direct: provide mmap and get_sgtable method overrides For dma-direct we know that the DMA address is an encoding of the physical address that we can trivially decode. Use that fact to provide implementations that do not need the arch_dma_coherent_to_pfn architecture hook. 
Note that we can still support mmap of non-coherent memory only if the architecture provides a way to set an uncached bit in the page tables. This must be true for architectures that use the generic remap helpers, but other architectures can also manually select it. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm/mm/dma-mapping.c | 6 ---- arch/arm64/Kconfig | 1 - arch/ia64/Kconfig | 2 +- arch/ia64/kernel/dma-mapping.c | 6 ---- arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 4 +-- arch/mips/mm/dma-noncoherent.c | 6 ---- arch/powerpc/platforms/Kconfig.cputype | 1 - include/linux/dma-direct.h | 7 ++++ include/linux/dma-noncoherent.h | 2 -- kernel/dma/Kconfig | 12 +++++-- kernel/dma/direct.c | 59 ++++++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 45 ++++---------------------- kernel/dma/remap.c | 6 ---- 16 files changed, 85 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8383155c8c82..4d7b671c8ff4 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,7 +6,6 @@ config ARC def_bool y select ARC_TIMERS - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8a50efb559f3..80e795aacd3a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -7,7 +7,6 @@ config ARM select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 7d042d5c43e3..f3cbeba7f9cb 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2346,12 +2346,6 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, size, dir); }
-long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return dma_to_pfn(dev, dma_addr); -} - void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3f047afb982c..57606307fe34 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,7 +12,6 @@ config ARM64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_FAST_MULTIPLIER diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 16714477eef4..bab7cd878464 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,7 @@ config IA64 select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING - select ARCH_HAS_DMA_COHERENT_TO_PFN + select DMA_NONCOHERENT_MMAP select ARCH_HAS_SYNC_DMA_FOR_CPU select VIRT_TO_BUS select GENERIC_IRQ_PROBE diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c index 4a3262795890..09ef9ce9988d 100644 --- a/arch/ia64/kernel/dma-mapping.c +++ b/arch/ia64/kernel/dma-mapping.c @@ -19,9 +19,3 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, { dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return page_to_pfn(virt_to_page(cpu_addr)); -} diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index c9c4be822456..261c26df1c9f 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -4,7 +4,6 @@ config MICROBLAZE select ARCH_32BIT_OFF_T select ARCH_NO_SWAP select ARCH_HAS_BINFMT_FLAT if !MMU - select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git 
a/arch/mips/Kconfig b/arch/mips/Kconfig index a0bd9bdb5f83..248d39b8a160 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1134,9 +1134,9 @@ config DMA_NONCOHERENT select ARCH_HAS_DMA_WRITE_COMBINE select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_UNCACHED_SEGMENT - select NEED_DMA_MAP_STATE - select ARCH_HAS_DMA_COHERENT_TO_PFN + select DMA_NONCOHERENT_MMAP select DMA_NONCOHERENT_CACHE_SYNC + select NEED_DMA_MAP_STATE config SYS_HAS_EARLY_PRINTK bool diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index 1d4d57dd9acf..fcf6d3eaac66 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -59,12 +59,6 @@ void *cached_kernel_address(void *addr) return __va(addr) - UNCAC_BASE; } -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return page_to_pfn(virt_to_page(cached_kernel_address(cpu_addr))); -} - static inline void dma_sync_virt(void *addr, size_t size, enum dma_data_direction dir) { diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 12543e53fa96..303752f97c19 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -459,7 +459,6 @@ config NOT_COHERENT_CACHE bool depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE - select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index ff3d5edc44b9..bcd953fb1f5a 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -68,5 +68,12 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void 
*cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_can_mmap(struct device *dev); +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index dd3de6d88fc0..e30fca1f1b12 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -41,8 +41,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr); #ifdef CONFIG_MMU /* diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 73c5c2b8e824..4c103a24e380 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -51,9 +51,6 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL config ARCH_HAS_DMA_PREP_COHERENT bool -config ARCH_HAS_DMA_COHERENT_TO_PFN - bool - config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool @@ -68,9 +65,18 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +# +# Should be selected if we can mmap non-coherent mappings to userspace. 
+# The only thing that is really required is a way to set an uncached bit +# in the pagetables +# +config DMA_NONCOHERENT_MMAP + bool + config DMA_REMAP depends on MMU select GENERIC_ALLOCATOR + select DMA_NONCOHERENT_MMAP bool config DMA_DIRECT_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 724c282dd943..58beaa9ddd27 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,12 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, return phys_to_dma(dev, phys); } +static inline struct page *dma_direct_to_page(struct device *dev, + dma_addr_t dma_addr) +{ + return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); +} + u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); @@ -379,6 +385,59 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, } EXPORT_SYMBOL(dma_direct_map_resource); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = dma_direct_to_page(dev, dma_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +#ifdef CONFIG_MMU +bool dma_direct_can_mmap(struct device *dev) +{ + return dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) + return -ENXIO; + 
return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +} +#else /* CONFIG_MMU */ +bool dma_direct_can_mmap(struct device *dev) +{ + return false; +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + return -ENXIO; +} +#endif /* CONFIG_MMU */ + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d9334f31a5af..12ff766ec1fa 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -112,24 +112,9 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - struct page *page; + struct page *page = virt_to_page(cpu_addr); int ret; - if (!dev_is_dma_coherent(dev)) { - unsigned long pfn; - - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - page = pfn_to_page(pfn); - } else { - page = virt_to_page(cpu_addr); - } - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); @@ -154,7 +139,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; @@ -192,7 +177,6 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - unsigned long 
pfn; int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); @@ -203,19 +187,8 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (off >= count || user_count > count - off) return -ENXIO; - if (!dev_is_dma_coherent(dev)) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - } else { - pfn = page_to_pfn(virt_to_page(cpu_addr)); - } - - return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; @@ -233,12 +206,8 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) { - return IS_ENABLED(CONFIG_MMU) && - (dev_is_dma_coherent(dev) || - IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); - } - + if (dma_is_direct(ops)) + return dma_direct_can_mmap(dev); return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); @@ -263,7 +232,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 90d5ce77c189..3c49499ee6b0 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -259,10 +259,4 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_free_contiguous(dev, page, size); } } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3-59-g8ed1b From 
3acac065508f6cc60ac9d3e4b7c6cc37fd91d531 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:06:32 +0100 Subject: dma-mapping: merge the generic remapping helpers into dma-direct Integrate the generic dma remapping implementation into the main flow. This prepares for architectures like xtensa that use an uncached segment for pages in the kernel mapping, but can also remap highmem from CMA. To simplify that implementation we now always deduct the page from the physical address via the DMA address instead of the virtual address. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 60 ++++++++++++++++++++++++++++++++++++++++++----------- kernel/dma/remap.c | 49 ------------------------------------------- 2 files changed, 48 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 58beaa9ddd27..22a2e0833862 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,15 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs) && + !gfpflags_allow_blocking(gfp)) { + ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + if (!ret) + return NULL; + goto done; + } + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; @@ -146,9 +156,28 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - return page; + ret = page; + goto done; + } + + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) || + (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + /* remove any dirty 
cache lines on the kernel alias */ + arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + dma_pgprot(dev, PAGE_KERNEL, attrs), + __builtin_return_address(0)); + if (!ret) { + dma_free_contiguous(dev, page, size); + return ret; + } + + memset(ret, 0, size); + goto done; } if (PageHighMem(page)) { @@ -164,12 +193,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { + if (force_dma_unencrypted(dev)) set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - } + memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && @@ -177,7 +203,11 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = uncached_kernel_address(ret); } - +done: + if (force_dma_unencrypted(dev)) + *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + else + *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; } @@ -193,19 +223,24 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - cpu_addr = cached_kernel_address(cpu_addr); - dma_free_contiguous(dev, virt_to_page(cpu_addr), size); + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + vunmap(cpu_addr); + + dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && 
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); @@ -215,6 +250,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 3c49499ee6b0..d47bd40fc0f5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -210,53 +210,4 @@ bool dma_free_from_pool(void *start, size_t size) gen_pool_free(atomic_pool, (unsigned long)start, size); return true; } - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags, unsigned long attrs) -{ - struct page *page = NULL; - void *ret; - - size = PAGE_ALIGN(size); - - if (!gfpflags_allow_blocking(flags)) { - ret = dma_alloc_from_pool(size, &page, flags); - if (!ret) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, flags, attrs); - if (!page) - return NULL; - - /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, size); - - /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), - __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - - memset(ret, 0, size); -done: - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { - phys_addr_t phys = dma_to_phys(dev, dma_handle); - struct page *page = pfn_to_page(__phys_to_pfn(phys)); - - vunmap(vaddr); - dma_free_contiguous(dev, page, size); - } -} 
#endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3-59-g8ed1b From d3694f30732fd2a334b93f087033c5a5836f7aba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 12:32:04 -0700 Subject: dma-debug: reorder struct dma_debug_entry fields Move all fields used during exact match lookups to the first cache line. This makes debug_dma_mapping_error() and friends about 50% faster. Since it removes two 32bit holes, force a cacheline alignment on struct dma_debug_entry. Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 4ad74f5987ea..a5b85dabfb8c 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -54,33 +54,33 @@ enum map_err_types { * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping * @list: node on pre-allocated free_entries list * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent - * @type: single, page, sg, coherent - * @pfn: page frame of the start address - * @offset: offset of mapping relative to pfn * @size: length of the mapping + * @type: single, page, sg, coherent * @direction: enum dma_data_direction * @sg_call_ents: 'nents' from dma_map_sg * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn * @map_err_type: track whether dma_mapping_error() was checked * @stacktrace: support backtraces when a violation is detected */ struct dma_debug_entry { struct list_head list; struct device *dev; - int type; - unsigned long pfn; - size_t offset; u64 dev_addr; u64 size; + int type; int direction; int sg_call_ents; int sg_mapped_ents; + unsigned long pfn; + size_t offset; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE unsigned int stack_len; unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif -}; +} 
____cacheline_aligned_in_smp; typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); -- cgit v1.2.3-59-g8ed1b From 5e76f564572b85735de4b75a5e73b514be2562be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 11:48:44 -0700 Subject: dma-debug: increase HASH_SIZE With modern NIC, it is not unusual having about ~256,000 active dma mappings and a hash size of 1024 buckets is too small. Forcing full cache line per bucket does not seem useful, especially now that we have contention on free_entries_lock for allocations and freeing of entries. Better use the space to fit more buckets. Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a5b85dabfb8c..004496654aaa 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -27,7 +27,7 @@ #include -#define HASH_SIZE 1024ULL +#define HASH_SIZE 16384ULL #define HASH_FN_SHIFT 13 #define HASH_FN_MASK (HASH_SIZE - 1) @@ -87,7 +87,7 @@ typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); struct hash_bucket { struct list_head list; spinlock_t lock; -} ____cacheline_aligned_in_smp; +}; /* Hash list to save the allocated dma addresses */ static struct hash_bucket dma_entry_hash[HASH_SIZE]; -- cgit v1.2.3-59-g8ed1b From 9a066357184485784f782719093ff804d05b85db Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:52 +0900 Subject: kheaders: remove unneeded 'cat' command piped to 'head' / 'tail' The 'head' and 'tail' commands can take a file path directly. So, you do not need to run 'cat'. cat kernel/kheaders.md5 | head -1 ... is equivalent to: head -1 kernel/kheaders.md5 and the latter saves forking one process. While I was here, I replaced 'head -1' with 'head -n 1'. I also replaced '==' with '=' since we do not have a good reason to use the bashism. 
Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 5a0fc0b0403a..b8054b0d5010 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -41,10 +41,10 @@ obj_files_md5="$(find $dir_list -name "*.h" | this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && - [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && + [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi -- cgit v1.2.3-59-g8ed1b From 0e11773e76098729552b750ccff79374d1e62002 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:53 +0900 Subject: kheaders: optimize md5sum calculation for in-tree builds This script computes md5sum of headers in srctree and in objtree. However, when we are building in-tree, we know the srctree and the objtree are the same. That is, we end up with the same computation twice. In fact, the first two lines of kernel/kheaders.md5 are always the same for in-tree builds. Unify the two md5sum calculations. For in-tree builds ($building_out_of_srctree is empty), we check only two directories, "include", and "arch/$SRCARCH/include". For out-of-tree builds ($building_out_of_srctree is 1), we check 4 directories, "$srctree/include", "$srctree/arch/$SRCARCH/include", "include", and "arch/$SRCARCH/include" since we know they are all different. 
Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index b8054b0d5010..6ff86e62787f 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -21,29 +21,30 @@ arch/$SRCARCH/include/ # Uncomment it for debugging. # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter -# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter +# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter + +all_dirs= +if [ "$building_out_of_srctree" ]; then + for d in $dir_list; do + all_dirs="$all_dirs $srctree/$d" + done +fi +all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none # of the source files changed. This causes pointless regeneration, so let us # ignore them for md5 calculation. -pushd $srctree > /dev/null -src_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | - xargs ls -l | md5sum | cut -d ' ' -f1)" -popd > /dev/null -obj_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | +headers_md5="$(find $all_dirs -name "*.h" | + grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | xargs ls -l | md5sum | cut -d ' ' -f1)" + # Any changes to this script will also cause a rebuild of the archive. 
this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && - [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && - [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi @@ -79,8 +80,7 @@ find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ --owner=0 --group=0 --numeric-owner --no-recursion \ -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null -echo "$src_files_md5" > kernel/kheaders.md5 -echo "$obj_files_md5" >> kernel/kheaders.md5 +echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -- cgit v1.2.3-59-g8ed1b From ea79e5168be644fdaf7d4e6a73eceaf07b3da76a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:54 +0900 Subject: kheaders: optimize header copy for in-tree builds This script copies headers by the cpio command twice; first from srctree, and then from objtree. However, when we are building in-tree, we know the srctree and the objtree are the same. That is, all the headers copied by the first cpio are overwritten by the second one. Skip the first cpio when we are building in-tree.
Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 6ff86e62787f..0f7752dd93a6 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -56,14 +56,16 @@ fi rm -rf $cpio_dir mkdir $cpio_dir -pushd $srctree > /dev/null -for f in $dir_list; - do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir -popd > /dev/null +if [ "$building_out_of_srctree" ]; then + pushd $srctree > /dev/null + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + popd > /dev/null +fi -# The second CPIO can complain if files already exist which can -# happen with out of tree builds. Just silence CPIO for now. +# The second CPIO can complain if files already exist which can happen with out +# of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 -- cgit v1.2.3-59-g8ed1b From 1463f74f492eea7191f0178e01f3d38371a48210 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:55 +0900 Subject: kheaders: remove the last bashism to allow sh to run it 'pushd' ... 'popd' is the last bash-specific code in this script. One way to avoid it is to run the code in a sub-shell. With that addressed, you can run this script with sh. I replaced $(BASH) with $(CONFIG_SHELL), and I changed the hashbang to #!/bin/sh. 
Signed-off-by: Masahiro Yamada --- kernel/Makefile | 2 +- kernel/gen_kheaders.sh | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..42557f251fea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -128,7 +128,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz - cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@ + cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 0f7752dd93a6..dc5744b93f8c 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # SPDX-License-Identifier: GPL-2.0 # This script generates an archive consisting of kernel headers @@ -57,11 +57,12 @@ rm -rf $cpio_dir mkdir $cpio_dir if [ "$building_out_of_srctree" ]; then - pushd $srctree > /dev/null - for f in $dir_list - do find "$f" -name "*.h"; - done | cpio --quiet -pd $cpio_dir - popd > /dev/null + ( + cd $srctree + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + ) fi # The second CPIO can complain if files already exist which can happen with out -- cgit v1.2.3-59-g8ed1b From f276031b4e2f4c961ed6d8a42f0f0124ccac2e09 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:56 +0900 Subject: kheaders: explain why include/generated/autoconf.h is excluded from md5sum This comment block explains why include/generated/compile.h is omitted, but says nothing about include/generated/autoconf.h, which might be more difficult to understand. Add more comments.
Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index dc5744b93f8c..e13ca842eb7e 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -32,8 +32,15 @@ fi all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. This causes pointless regeneration, so let us -# ignore them for md5 calculation. +# of the source files changed. +# +# When Kconfig regenerates include/generated/autoconf.h, its timestamp is +# updated, but the contents might be still the same. When any CONFIG option is +# changed, Kconfig touches the corresponding timestamp file include/config/*.h. +# Hence, the md5sum detects the configuration change anyway. We do not need to +# check include/generated/autoconf.h explicitly. +# +# Ignore them for md5 calculation to avoid pointless regeneration. headers_md5="$(find $all_dirs -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | -- cgit v1.2.3-59-g8ed1b From c1d51f684c72b5eb2aecbbd47be3a2977a2dc903 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Nov 2019 15:25:12 +0100 Subject: cpuidle: Use nanoseconds as the unit of time Currently, the cpuidle subsystem uses microseconds as the unit of time which (among other things) causes the idle loop to incur some integer division overhead for no clear benefit. In order to allow cpuidle to measure time in nanoseconds, add two new fields, exit_latency_ns and target_residency_ns, to represent the exit latency and target residency of an idle state in nanoseconds, respectively, to struct cpuidle_state and initialize them with the help of the corresponding values in microseconds provided by drivers. Additionally, change cpuidle_governor_latency_req() to return the idle state exit latency constraint in nanoseconds. 
Also measure idle state residency (last_residency_ns in struct cpuidle_device and time_ns in struct cpuidle_driver) in nanoseconds and update the cpuidle core and governors accordingly. However, the menu governor still computes typical intervals in microseconds to avoid integer overflows. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Doug Smythies Tested-by: Doug Smythies --- drivers/cpuidle/cpuidle.c | 36 +++++----- drivers/cpuidle/driver.c | 29 ++++++--- drivers/cpuidle/governor.c | 7 +- drivers/cpuidle/governors/haltpoll.c | 7 +- drivers/cpuidle/governors/ladder.c | 25 +++---- drivers/cpuidle/governors/menu.c | 123 ++++++++++++++++------------------- drivers/cpuidle/governors/teo.c | 76 ++++++++++------------ drivers/cpuidle/poll_state.c | 2 + drivers/cpuidle/sysfs.c | 20 +++++- include/linux/cpuidle.h | 8 ++- kernel/sched/idle.c | 2 +- 11 files changed, 174 insertions(+), 161 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 44ae39f2b47a..bf9b030cd7e1 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -75,24 +75,24 @@ int cpuidle_play_dead(void) static int find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, - unsigned int max_latency, + u64 max_latency_ns, unsigned int forbidden_flags, bool s2idle) { - unsigned int latency_req = 0; + u64 latency_req = 0; int i, ret = 0; for (i = 1; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; if (dev->states_usage[i].disable || - s->exit_latency <= latency_req || - s->exit_latency > max_latency || + s->exit_latency_ns <= latency_req || + s->exit_latency_ns > max_latency_ns || (s->flags & forbidden_flags) || (s2idle && !s->enter_s2idle)) continue; - latency_req = s->exit_latency; + latency_req = s->exit_latency_ns; ret = i; } return ret; @@ -124,7 +124,7 @@ void cpuidle_use_deepest_state(bool enable) int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct
cpuidle_device *dev) { - return find_deepest_state(drv, dev, UINT_MAX, 0, false); + return find_deepest_state(drv, dev, U64_MAX, 0, false); } #ifdef CONFIG_SUSPEND @@ -180,7 +180,7 @@ int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) * that interrupts won't be enabled when it exits and allows the tick to * be frozen safely. */ - index = find_deepest_state(drv, dev, UINT_MAX, 0, true); + index = find_deepest_state(drv, dev, U64_MAX, 0, true); if (index > 0) enter_s2idle_proper(drv, dev, index); @@ -209,7 +209,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * CPU as a broadcast timer, this call may fail if it is not available. */ if (broadcast && tick_broadcast_enter()) { - index = find_deepest_state(drv, dev, target_state->exit_latency, + index = find_deepest_state(drv, dev, target_state->exit_latency_ns, CPUIDLE_FLAG_TIMER_STOP, false); if (index < 0) { default_idle_call(); @@ -247,7 +247,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, local_irq_enable(); if (entered_state >= 0) { - s64 diff, delay = drv->states[entered_state].exit_latency; + s64 diff, delay = drv->states[entered_state].exit_latency_ns; int i; /* @@ -255,15 +255,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * This can be moved to within driver enter routine, * but that results in multiple copies of same code. 
*/ - diff = ktime_us_delta(time_end, time_start); - if (diff > INT_MAX) - diff = INT_MAX; + diff = ktime_sub(time_end, time_start); - dev->last_residency = (int)diff; - dev->states_usage[entered_state].time += dev->last_residency; + dev->last_residency_ns = diff; + dev->states_usage[entered_state].time_ns += diff; dev->states_usage[entered_state].usage++; - if (diff < drv->states[entered_state].target_residency) { + if (diff < drv->states[entered_state].target_residency_ns) { for (i = entered_state - 1; i >= 0; i--) { if (dev->states_usage[i].disable) continue; @@ -281,14 +279,14 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, * Update if a deeper state would have been a * better match for the observed idle duration. */ - if (diff - delay >= drv->states[i].target_residency) + if (diff - delay >= drv->states[i].target_residency_ns) dev->states_usage[entered_state].below++; break; } } } else { - dev->last_residency = 0; + dev->last_residency_ns = 0; } return entered_state; @@ -381,7 +379,7 @@ u64 cpuidle_poll_time(struct cpuidle_driver *drv, if (dev->states_usage[i].disable) continue; - limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC; + limit_ns = (u64)drv->states[i].target_residency_ns; } dev->poll_limit_ns = limit_ns; @@ -552,7 +550,7 @@ static void __cpuidle_unregister_device(struct cpuidle_device *dev) static void __cpuidle_device_init(struct cpuidle_device *dev) { memset(dev->states_usage, 0, sizeof(dev->states_usage)); - dev->last_residency = 0; + dev->last_residency_ns = 0; dev->next_hrtimer = 0; } diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 9db154224999..fcaf8b2bab96 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -165,16 +165,27 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) if (!drv->cpumask) drv->cpumask = (struct cpumask *)cpu_possible_mask; - /* - * Look for the timer stop flag in the different states, so that we know - * if the 
broadcast timer has to be set up. The loop is in the reverse - * order, because usually one of the deeper states have this flag set. - */ - for (i = drv->state_count - 1; i >= 0 ; i--) { - if (drv->states[i].flags & CPUIDLE_FLAG_TIMER_STOP) { + for (i = 0; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + + /* + * Look for the timer stop flag in the different states and if + * it is found, indicate that the broadcast timer has to be set + * up. + */ + if (s->flags & CPUIDLE_FLAG_TIMER_STOP) drv->bctimer = 1; - break; - } + + /* + * The core will use the target residency and exit latency + * values in nanoseconds, but allow drivers to provide them in + * microseconds too. + */ + if (s->target_residency > 0) + s->target_residency_ns = s->target_residency * NSEC_PER_USEC; + + if (s->exit_latency > 0) + s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; } } diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index e9801f26c732..e48271e117a3 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -107,11 +107,14 @@ int cpuidle_register_governor(struct cpuidle_governor *gov) * cpuidle_governor_latency_req - Compute a latency constraint for CPU * @cpu: Target CPU */ -int cpuidle_governor_latency_req(unsigned int cpu) +s64 cpuidle_governor_latency_req(unsigned int cpu) { int global_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); struct device *device = get_cpu_device(cpu); int device_req = dev_pm_qos_raw_resume_latency(device); - return device_req < global_req ? 
device_req : global_req; + if (device_req > global_req) + device_req = global_req; + + return (s64)device_req * NSEC_PER_USEC; } diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c index 7a703d2e0064..cb2a96eafc02 100644 --- a/drivers/cpuidle/governors/haltpoll.c +++ b/drivers/cpuidle/governors/haltpoll.c @@ -49,7 +49,7 @@ static int haltpoll_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - int latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); if (!drv->state_count || latency_req == 0) { *stop_tick = false; @@ -75,10 +75,9 @@ static int haltpoll_select(struct cpuidle_driver *drv, return 0; } -static void adjust_poll_limit(struct cpuidle_device *dev, unsigned int block_us) +static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns) { unsigned int val; - u64 block_ns = block_us*NSEC_PER_USEC; /* Grow cpu_halt_poll_us if * cpu_halt_poll_us < block_ns < guest_halt_poll_us @@ -115,7 +114,7 @@ static void haltpoll_reflect(struct cpuidle_device *dev, int index) dev->last_state_idx = index; if (index != 0) - adjust_poll_limit(dev, dev->last_residency); + adjust_poll_limit(dev, dev->last_residency_ns); } /** diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index b0126b8c32fe..8e9058c4ea63 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -27,8 +27,8 @@ struct ladder_device_state { struct { u32 promotion_count; u32 demotion_count; - u32 promotion_time; - u32 demotion_time; + u64 promotion_time_ns; + u64 demotion_time_ns; } threshold; struct { int promotion_count; @@ -68,9 +68,10 @@ static int ladder_select_state(struct cpuidle_driver *drv, { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct ladder_device_state *last_state; - int last_residency, last_idx = dev->last_state_idx; + int last_idx = dev->last_state_idx; int first_idx = 
drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0; - int latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + s64 last_residency; /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) { @@ -80,13 +81,13 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state = &ldev->states[last_idx]; - last_residency = dev->last_residency - drv->states[last_idx].exit_latency; + last_residency = dev->last_residency_ns - drv->states[last_idx].exit_latency_ns; /* consider promotion */ if (last_idx < drv->state_count - 1 && !dev->states_usage[last_idx + 1].disable && - last_residency > last_state->threshold.promotion_time && - drv->states[last_idx + 1].exit_latency <= latency_req) { + last_residency > last_state->threshold.promotion_time_ns && + drv->states[last_idx + 1].exit_latency_ns <= latency_req) { last_state->stats.promotion_count++; last_state->stats.demotion_count = 0; if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { @@ -98,11 +99,11 @@ static int ladder_select_state(struct cpuidle_driver *drv, /* consider demotion */ if (last_idx > first_idx && (dev->states_usage[last_idx].disable || - drv->states[last_idx].exit_latency > latency_req)) { + drv->states[last_idx].exit_latency_ns > latency_req)) { int i; for (i = last_idx - 1; i > first_idx; i--) { - if (drv->states[i].exit_latency <= latency_req) + if (drv->states[i].exit_latency_ns <= latency_req) break; } ladder_do_selection(dev, ldev, last_idx, i); @@ -110,7 +111,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, } if (last_idx > first_idx && - last_residency < last_state->threshold.demotion_time) { + last_residency < last_state->threshold.demotion_time_ns) { last_state->stats.demotion_count++; last_state->stats.promotion_count = 0; if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { @@ -150,9 +151,9 @@ static int 
ladder_enable_device(struct cpuidle_driver *drv, lstate->threshold.demotion_count = DEMOTION_COUNT; if (i < drv->state_count - 1) - lstate->threshold.promotion_time = state->exit_latency; + lstate->threshold.promotion_time_ns = state->exit_latency_ns; if (i > first_idx) - lstate->threshold.demotion_time = state->exit_latency; + lstate->threshold.demotion_time_ns = state->exit_latency_ns; } return 0; diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 38b2b72102a8..b0a7ad566081 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,22 +19,12 @@ #include #include -/* - * Please note when changing the tuning values: - * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the result of - * a scaling operation multiplication may overflow on 32 bit platforms. - * In that case, #define RESOLUTION as ULL to get 64 bit result: - * #define RESOLUTION 1024ULL - * - * The default values do not overflow. - */ #define BUCKETS 12 #define INTERVAL_SHIFT 3 #define INTERVALS (1UL << INTERVAL_SHIFT) #define RESOLUTION 1024 #define DECAY 8 -#define MAX_INTERESTING 50000 - +#define MAX_INTERESTING (50000 * NSEC_PER_USEC) /* * Concepts and ideas behind the menu governor @@ -120,14 +110,14 @@ struct menu_device { int needs_update; int tick_wakeup; - unsigned int next_timer_us; + u64 next_timer_ns; unsigned int bucket; unsigned int correction_factor[BUCKETS]; unsigned int intervals[INTERVALS]; int interval_ptr; }; -static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters) +static inline int which_bucket(u64 duration_ns, unsigned long nr_iowaiters) { int bucket = 0; @@ -140,15 +130,15 @@ static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters if (nr_iowaiters) bucket = BUCKETS/2; - if (duration < 10) + if (duration_ns < 10ULL * NSEC_PER_USEC) return bucket; - if (duration < 100) + if (duration_ns < 100ULL * NSEC_PER_USEC) return bucket + 1; - if (duration < 1000) + if 
(duration_ns < 1000ULL * NSEC_PER_USEC) return bucket + 2; - if (duration < 10000) + if (duration_ns < 10000ULL * NSEC_PER_USEC) return bucket + 3; - if (duration < 100000) + if (duration_ns < 100000ULL * NSEC_PER_USEC) return bucket + 4; return bucket + 5; } @@ -276,13 +266,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { struct menu_device *data = this_cpu_ptr(&menu_devices); - int latency_req = cpuidle_governor_latency_req(dev->cpu); - int i; - int idx; - unsigned int interactivity_req; + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); unsigned int predicted_us; + u64 predicted_ns; + u64 interactivity_req; unsigned long nr_iowaiters; ktime_t delta_next; + int i, idx; if (data->needs_update) { menu_update(drv, dev); @@ -290,14 +280,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* determine the expected residency time, round up */ - data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next)); + data->next_timer_ns = tick_nohz_get_sleep_length(&delta_next); nr_iowaiters = nr_iowait_cpu(dev->cpu); - data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); + data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); if (unlikely(drv->state_count <= 1 || latency_req == 0) || - ((data->next_timer_us < drv->states[1].target_residency || - latency_req < drv->states[1].exit_latency) && + ((data->next_timer_ns < drv->states[1].target_residency_ns || + latency_req < drv->states[1].exit_latency_ns) && !dev->states_usage[0].disable)) { /* * In this case state[0] will be used no matter what, so return @@ -308,18 +298,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, return 0; } - /* - * Force the result of multiplication to be 64 bits even if both - * operands are 32 bits. - * Make sure to round up for half microseconds. 
- */ - predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us * - data->correction_factor[data->bucket], - RESOLUTION * DECAY); - /* - * Use the lowest expected idle interval to pick the idle state. - */ - predicted_us = min(predicted_us, get_typical_interval(data, predicted_us)); + /* Round up the result for half microseconds. */ + predicted_us = div_u64(data->next_timer_ns * + data->correction_factor[data->bucket] + + (RESOLUTION * DECAY * NSEC_PER_USEC) / 2, + RESOLUTION * DECAY * NSEC_PER_USEC); + /* Use the lowest expected idle interval to pick the idle state. */ + predicted_ns = (u64)min(predicted_us, + get_typical_interval(data, predicted_us)) * + NSEC_PER_USEC; if (tick_nohz_tick_stopped()) { /* @@ -330,14 +317,15 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the known time till the closest timer event for the idle * state selection. */ - if (predicted_us < TICK_USEC) - predicted_us = ktime_to_us(delta_next); + if (predicted_ns < TICK_NSEC) + predicted_ns = delta_next; } else { /* * Use the performance multiplier and the user-configurable * latency_req to determine the maximum exit latency. */ - interactivity_req = predicted_us / performance_multiplier(nr_iowaiters); + interactivity_req = div64_u64(predicted_ns, + performance_multiplier(nr_iowaiters)); if (latency_req > interactivity_req) latency_req = interactivity_req; } @@ -356,19 +344,19 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx == -1) idx = i; /* first enabled state */ - if (s->target_residency > predicted_us) { + if (s->target_residency_ns > predicted_ns) { /* * Use a physical idle state, not busy polling, unless * a timer is going to trigger soon enough. 
*/ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && - s->exit_latency <= latency_req && - s->target_residency <= data->next_timer_us) { - predicted_us = s->target_residency; + s->exit_latency_ns <= latency_req && + s->target_residency_ns <= data->next_timer_ns) { + predicted_ns = s->target_residency_ns; idx = i; break; } - if (predicted_us < TICK_USEC) + if (predicted_ns < TICK_NSEC) break; if (!tick_nohz_tick_stopped()) { @@ -378,7 +366,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * tick in that case and let the governor run * again in the next iteration of the loop. */ - predicted_us = drv->states[idx].target_residency; + predicted_ns = drv->states[idx].target_residency_ns; break; } @@ -388,13 +376,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * closest timer event, select this one to avoid getting * stuck in the shallow one for too long. */ - if (drv->states[idx].target_residency < TICK_USEC && - s->target_residency <= ktime_to_us(delta_next)) + if (drv->states[idx].target_residency_ns < TICK_NSEC && + s->target_residency_ns <= delta_next) idx = i; return idx; } - if (s->exit_latency > latency_req) + if (s->exit_latency_ns > latency_req) break; idx = i; @@ -408,12 +396,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. 
*/ if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) { - unsigned int delta_next_us = ktime_to_us(delta_next); - + predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; - if (idx > 0 && drv->states[idx].target_residency > delta_next_us) { + if (idx > 0 && drv->states[idx].target_residency_ns > delta_next) { /* * The tick is not going to be stopped and the target * residency of the state to be returned is not within @@ -425,7 +411,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; idx = i; - if (drv->states[i].target_residency <= delta_next_us) + if (drv->states[i].target_residency_ns <= delta_next) break; } } @@ -461,7 +447,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) struct menu_device *data = this_cpu_ptr(&menu_devices); int last_idx = dev->last_state_idx; struct cpuidle_state *target = &drv->states[last_idx]; - unsigned int measured_us; + u64 measured_ns; unsigned int new_factor; /* @@ -479,7 +465,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * assume the state was never reached and the exit latency is 0. */ - if (data->tick_wakeup && data->next_timer_us > TICK_USEC) { + if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) { /* * The nohz code said that there wouldn't be any events within * the tick boundary (if the tick was stopped), but the idle @@ -489,7 +475,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * have been idle long (but not forever) to help the idle * duration predictor do a better job next time. 
*/ - measured_us = 9 * MAX_INTERESTING / 10; + measured_ns = 9 * MAX_INTERESTING / 10; } else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) && dev->poll_time_limit) { /* @@ -499,28 +485,29 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * the CPU might have been woken up from idle by the next timer. * Assume that to be the case. */ - measured_us = data->next_timer_us; + measured_ns = data->next_timer_ns; } else { /* measured value */ - measured_us = dev->last_residency; + measured_ns = dev->last_residency_ns; /* Deduct exit latency */ - if (measured_us > 2 * target->exit_latency) - measured_us -= target->exit_latency; + if (measured_ns > 2 * target->exit_latency_ns) + measured_ns -= target->exit_latency_ns; else - measured_us /= 2; + measured_ns /= 2; } /* Make sure our coefficients do not exceed unity */ - if (measured_us > data->next_timer_us) - measured_us = data->next_timer_us; + if (measured_ns > data->next_timer_ns) + measured_ns = data->next_timer_ns; /* Update our correction ratio */ new_factor = data->correction_factor[data->bucket]; new_factor -= new_factor / DECAY; - if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING) - new_factor += RESOLUTION * measured_us / data->next_timer_us; + if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING) + new_factor += div64_u64(RESOLUTION * measured_ns, + data->next_timer_ns); else /* * we were idle so long that we count it as a perfect @@ -540,7 +527,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->correction_factor[data->bucket] = new_factor; /* update the repeating-pattern data */ - data->intervals[data->interval_ptr++] = measured_us; + data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); if (data->interval_ptr >= INTERVALS) data->interval_ptr = 0; } diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 702d560eb347..ecbcfaefb0cd 100644 --- 
a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -104,7 +104,7 @@ struct teo_cpu { u64 sleep_length_ns; struct teo_idle_state states[CPUIDLE_STATE_MAX]; int interval_idx; - unsigned int intervals[INTERVALS]; + u64 intervals[INTERVALS]; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -117,9 +117,8 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); int i, idx_hit = -1, idx_timer = -1; - unsigned int measured_us; + u64 measured_ns; if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { /* @@ -127,23 +126,21 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * enough to the closest timer event expected at the idle state * selection time to be discarded. */ - measured_us = UINT_MAX; + measured_ns = U64_MAX; } else { - unsigned int lat; + u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - lat = drv->states[dev->last_state_idx].exit_latency; - - measured_us = ktime_to_us(cpu_data->time_span_ns); + measured_ns = cpu_data->time_span_ns; /* * The delay between the wakeup and the first instruction * executed by the CPU is not likely to be worst-case every * time, so take 1/2 of the exit latency as a very rough * approximation of the average of it. 
*/ - if (measured_us >= lat) - measured_us -= lat / 2; + if (measured_ns >= lat_ns) + measured_ns -= lat_ns / 2; else - measured_us /= 2; + measured_ns /= 2; } /* @@ -155,9 +152,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; - if (drv->states[i].target_residency <= sleep_length_us) { + if (drv->states[i].target_residency_ns <= cpu_data->sleep_length_ns) { idx_timer = i; - if (drv->states[i].target_residency <= measured_us) + if (drv->states[i].target_residency_ns <= measured_ns) idx_hit = i; } } @@ -193,7 +190,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * Save idle duration values corresponding to non-timer wakeups for * pattern detection. */ - cpu_data->intervals[cpu_data->interval_idx++] = measured_us; + cpu_data->intervals[cpu_data->interval_idx++] = measured_ns; if (cpu_data->interval_idx >= INTERVALS) cpu_data->interval_idx = 0; } @@ -203,11 +200,11 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * @drv: cpuidle driver containing state data. * @dev: Target CPU. * @state_idx: Index of the capping idle state. - * @duration_us: Idle duration value to match. + * @duration_ns: Idle duration value to match.
*/ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - unsigned int duration_us) + u64 duration_ns) { int i; @@ -216,7 +213,7 @@ static int teo_find_shallower_state(struct cpuidle_driver *drv, continue; state_idx = i; - if (drv->states[i].target_residency <= duration_us) + if (drv->states[i].target_residency_ns <= duration_ns) break; } return state_idx; @@ -232,8 +229,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - int latency_req = cpuidle_governor_latency_req(dev->cpu); - unsigned int duration_us, hits, misses, early_hits; + s64 latency_req = cpuidle_governor_latency_req(dev->cpu); + u64 duration_ns; + unsigned int hits, misses, early_hits; int max_early_idx, constraint_idx, idx, i; ktime_t delta_tick; @@ -244,8 +242,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, cpu_data->time_span_ns = local_clock(); - cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); - duration_us = ktime_to_us(cpu_data->sleep_length_ns); + duration_ns = tick_nohz_get_sleep_length(&delta_tick); + cpu_data->sleep_length_ns = duration_ns; hits = 0; misses = 0; @@ -262,7 +260,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * Ignore disabled states with target residencies beyond * the anticipated idle duration. */ - if (s->target_residency > duration_us) + if (s->target_residency_ns > duration_ns) continue; /* @@ -301,7 +299,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * shallow for that role. 
*/ if (!(tick_nohz_tick_stopped() && - drv->states[idx].target_residency < TICK_USEC)) { + drv->states[idx].target_residency_ns < TICK_NSEC)) { early_hits = cpu_data->states[i].early_hits; max_early_idx = idx; } @@ -315,10 +313,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, misses = cpu_data->states[i].misses; } - if (s->target_residency > duration_us) + if (s->target_residency_ns > duration_ns) break; - if (s->exit_latency > latency_req && constraint_idx > i) + if (s->exit_latency_ns > latency_req && constraint_idx > i) constraint_idx = i; idx = i; @@ -327,7 +325,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (early_hits < cpu_data->states[i].early_hits && !(tick_nohz_tick_stopped() && - drv->states[i].target_residency < TICK_USEC)) { + drv->states[i].target_residency_ns < TICK_NSEC)) { early_hits = cpu_data->states[i].early_hits; max_early_idx = i; } @@ -343,7 +341,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ if (hits <= misses && max_early_idx >= 0) { idx = max_early_idx; - duration_us = drv->states[idx].target_residency; + duration_ns = drv->states[idx].target_residency_ns; } /* @@ -364,9 +362,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the current expected idle duration value. */ for (i = 0; i < INTERVALS; i++) { - unsigned int val = cpu_data->intervals[i]; + u64 val = cpu_data->intervals[i]; - if (val >= duration_us) + if (val >= duration_ns) continue; count++; @@ -378,17 +376,17 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * values are in the interesting range. */ if (count > INTERVALS / 2) { - unsigned int avg_us = div64_u64(sum, count); + u64 avg_ns = div64_u64(sum, count); /* * Avoid spending too much time in an idle state that * would be too shallow. 
*/ - if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { - duration_us = avg_us; - if (drv->states[idx].target_residency > avg_us) + if (!(tick_nohz_tick_stopped() && avg_ns < TICK_NSEC)) { + duration_ns = avg_ns; + if (drv->states[idx].target_residency_ns > avg_ns) idx = teo_find_shallower_state(drv, dev, - idx, avg_us); + idx, avg_ns); } } } @@ -398,9 +396,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. */ if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { - unsigned int delta_tick_us = ktime_to_us(delta_tick); - + duration_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; /* @@ -409,8 +405,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * till the closest timer including the tick, try to correct * that. */ - if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) + idx = teo_find_shallower_state(drv, dev, idx, delta_tick); } return idx; @@ -454,7 +450,7 @@ static int teo_enable_device(struct cpuidle_driver *drv, memset(cpu_data, 0, sizeof(*cpu_data)); for (i = 0; i < INTERVALS; i++) - cpu_data->intervals[i] = UINT_MAX; + cpu_data->intervals[i] = U64_MAX; return 0; } diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index c8fa5f41dfc4..9f1ace9c53da 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -49,6 +49,8 @@ void cpuidle_poll_state_init(struct cpuidle_driver *drv) snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); state->exit_latency = 0; state->target_residency = 0; + state->exit_latency_ns = 0; + state->target_residency_ns = 0; state->power_usage = -1; state->enter = poll_idle; state->disabled = false; diff --git 
a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 9f3755ac8f87..38ef770be90d 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -273,16 +273,30 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, \ return sprintf(buf, "%s\n", state->_name);\ } -define_show_state_function(exit_latency) -define_show_state_function(target_residency) +#define define_show_state_time_function(_name) \ +static ssize_t show_state_##_name(struct cpuidle_state *state, \ + struct cpuidle_state_usage *state_usage, \ + char *buf) \ +{ \ + return sprintf(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \ +} + +define_show_state_time_function(exit_latency) +define_show_state_time_function(target_residency) define_show_state_function(power_usage) define_show_state_ull_function(usage) -define_show_state_ull_function(time) define_show_state_str_function(name) define_show_state_str_function(desc) define_show_state_ull_function(above) define_show_state_ull_function(below) +static ssize_t show_state_time(struct cpuidle_state *state, + struct cpuidle_state_usage *state_usage, + char *buf) +{ + return sprintf(buf, "%llu\n", ktime_to_us(state_usage->time_ns)); +} + static ssize_t show_state_disable(struct cpuidle_state *state, struct cpuidle_state_usage *state_usage, char *buf) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index d23a3b1ddcf6..22602747f468 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -35,7 +35,7 @@ struct cpuidle_driver; struct cpuidle_state_usage { unsigned long long disable; unsigned long long usage; - unsigned long long time; /* in US */ + u64 time_ns; unsigned long long above; /* Number of times it's been too deep */ unsigned long long below; /* Number of times it's been too shallow */ #ifdef CONFIG_SUSPEND @@ -48,6 +48,8 @@ struct cpuidle_state { char name[CPUIDLE_NAME_LEN]; char desc[CPUIDLE_DESC_LEN]; + u64 exit_latency_ns; + u64 target_residency_ns; unsigned int flags; unsigned int exit_latency; /* in 
US */ int power_usage; /* in mW */ @@ -89,7 +91,7 @@ struct cpuidle_device { ktime_t next_hrtimer; int last_state_idx; - int last_residency; + u64 last_residency_ns; u64 poll_limit_ns; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; @@ -263,7 +265,7 @@ struct cpuidle_governor { #ifdef CONFIG_CPU_IDLE extern int cpuidle_register_governor(struct cpuidle_governor *gov); -extern int cpuidle_governor_latency_req(unsigned int cpu); +extern s64 cpuidle_governor_latency_req(unsigned int cpu); #else static inline int cpuidle_register_governor(struct cpuidle_governor *gov) {return 0;} diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..1aa260702b38 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * update no idle residency and return. */ if (current_clr_polling_and_test()) { - dev->last_residency = 0; + dev->last_residency_ns = 0; local_irq_enable(); return -EBUSY; } -- cgit v1.2.3-59-g8ed1b From 20d087368d38c7350a4519a3b316ef7eb2504692 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Nov 2019 21:34:25 +0100 Subject: time: Optimize ns_to_timespec64() ns_to_timespec64() calls div_s64_rem(), which is a rather slow function on 32-bit architectures, as it cannot take advantage of the do_div() optimizations for constant arguments. Open-code the div_s64_rem() function in ns_to_timespec64(), so a constant divider can be passed into the optimized div_u64_rem() function. 
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108203435.112759-3-arnd@arndb.de --- kernel/time/time.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..45a358953f09 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -550,18 +550,21 @@ EXPORT_SYMBOL(set_normalized_timespec64); */ struct timespec64 ns_to_timespec64(const s64 nsec) { - struct timespec64 ts; + struct timespec64 ts = { 0, 0 }; s32 rem; - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; + if (likely(nsec > 0)) { + ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + ts.tv_nsec = rem; + } else if (nsec < 0) { + /* + * With negative times, tv_sec points to the earlier + * second, and tv_nsec counts the nanoseconds since + * then, so tv_nsec is always a positive number. 
+ */ + ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; + ts.tv_nsec = NSEC_PER_SEC - rem - 1; } - ts.tv_nsec = rem; return ts; } -- cgit v1.2.3-59-g8ed1b From 1d6acc18fee71a0db6e4fbbfbdb247e0bd5b0655 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 15 Oct 2019 13:03:39 +0530 Subject: time: Fix spelling mistake in comment witin => within Signed-off-by: Mukesh Ojha Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571124819-9639-1-git-send-email-mojha@codeaurora.org --- kernel/time/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 45a358953f09..ea6e7e47cc37 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -179,7 +179,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return error; if (tz) { - /* Verify we're witin the +-15 hrs range */ + /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From cf25e24db61cc9df42c47485a2ec2bff4e9a3692 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2019 11:07:58 +0100 Subject: time: Rename tsk->real_start_time to ->start_boottime Since it stores CLOCK_BOOTTIME, not, as the name suggests, CLOCK_REALTIME, let's rename ->real_start_time to ->start_boottime. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/exec.c | 2 +- fs/proc/array.c | 2 +- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 555e93c7dec8..f4d0f3acf861 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1132,7 +1132,7 @@ static int de_thread(struct task_struct *tsk) * also take its birthdate (always earlier than our own).
*/ tsk->start_time = leader->start_time; - tsk->real_start_time = leader->real_start_time; + tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); BUG_ON(has_group_leader_pid(tsk)); diff --git a/fs/proc/array.c b/fs/proc/array.c index 46dcb6f0eccf..5efaf3708ec6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -533,7 +533,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, nice = task_nice(task); /* convert nsec -> ticks */ - start_time = nsec_to_clock_t(task->real_start_time); + start_time = nsec_to_clock_t(task->start_boottime); seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); diff --git a/include/linux/sched.h b/include/linux/sched.h index 67a1d86981a9..254128952eab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -857,7 +857,7 @@ struct task_struct { u64 start_time; /* Boot based time in nsecs: */ - u64 real_start_time; + u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..1392ee8f4848 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2130,7 +2130,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boottime_ns(); + p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. -- cgit v1.2.3-59-g8ed1b From 763e34e74bb7d5c316015e2e39fcc8520bfd071c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:07:06 -0500 Subject: ftrace: Add register_ftrace_direct() Add the start of the functionality to allow other trampolines to use the ftrace mcount/fentry/nop location. 
This adds two new functions: register_ftrace_direct() and unregister_ftrace_direct() Both take two parameters: the first is the instruction address of where the mcount/fentry/nop exists, and the second is the trampoline to have that location called. This will handle cases where ftrace is already used on that same location, and will make it still work, where the registered direct called trampoline will get called after all the registered ftrace callers are handled. Currently, it will not allow for IP_MODIFY functions to be called at the same locations, which include some kprobes and live kernel patching. At this point, no architecture supports this. This is only the start of implementing the framework. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 36 ++++++- kernel/trace/Kconfig | 8 ++ kernel/trace/ftrace.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 306 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8385cafe4f9f..efe3e521aff4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -144,6 +144,8 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * TRACE_ARRAY - The ops->private points to a trace_array descriptor. * PERMANENT - Set when the ops is permanent and should not be affected by * ftrace_enabled. 
+ * DIRECT - Used by the direct ftrace_ops helper for direct functions + * (internal ftrace only, should not be used by others) */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -163,6 +165,7 @@ enum { FTRACE_OPS_FL_RCU = 1 << 14, FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, FTRACE_OPS_FL_PERMANENT = 1 << 16, + FTRACE_OPS_FL_DIRECT = 1 << 17, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -242,6 +245,32 @@ static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +int register_ftrace_direct(unsigned long ip, unsigned long addr); +int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +#else +static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + return -ENODEV; +} +static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + return -ENODEV; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * This must be implemented by the architecture. + * It is the way the ftrace direct_ops helper, when called + * via ftrace (because there's other callbacks besides the + * direct call), can inform the architecture's trampoline that this + * routine has a direct caller, and what the caller is. + */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, + unsigned long addr) { } +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + #ifdef CONFIG_STACK_TRACER extern int stack_tracer_enabled; @@ -333,6 +362,7 @@ bool is_ftrace_trampoline(unsigned long addr); * REGS_EN - the function is set up to save regs. * IPMODIFY - the record allows for the IP address to be changed. * DISABLED - the record is not ready to be touched yet + * DIRECT - there is a direct function to call * * When a new ftrace_ops is registered and wants a function to save * pt_regs, the rec->flag REGS is set. 
When the function has been @@ -348,10 +378,12 @@ enum { FTRACE_FL_TRAMP_EN = (1UL << 27), FTRACE_FL_IPMODIFY = (1UL << 26), FTRACE_FL_DISABLED = (1UL << 25), + FTRACE_FL_DIRECT = (1UL << 24), + FTRACE_FL_DIRECT_EN = (1UL << 23), }; -#define FTRACE_REF_MAX_SHIFT 25 -#define FTRACE_FL_BITS 7 +#define FTRACE_REF_MAX_SHIFT 23 +#define FTRACE_FL_BITS 9 #define FTRACE_FL_MASKED_BITS ((1UL << FTRACE_FL_BITS) - 1) #define FTRACE_FL_MASK (FTRACE_FL_MASKED_BITS << FTRACE_REF_MAX_SHIFT) #define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..624a05e99b0b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_DYNAMIC_FTRACE_WITH_REGS bool +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -557,6 +560,11 @@ config DYNAMIC_FTRACE_WITH_REGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_REGS +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0e7f03919de..329a3f3789a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1023,6 +1023,7 @@ static bool update_all_ops; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; + unsigned long direct; /* for direct lookup only */ }; struct ftrace_func_probe { @@ -1730,6 +1731,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return false; + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT; + /* * If there's only a single callback registered to a * function, and the ops has a trampoline registered @@ -1757,6 +1761,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, 
return false; rec->flags--; + /* + * Only the internal direct_ops should have the + * DIRECT flag set. Thus, if it is removing a + * function, then that function should no longer + * be direct. + */ + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags &= ~FTRACE_FL_DIRECT; + /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -2092,15 +2105,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * If enabling and the REGS flag does not match the REGS_EN, or * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore * this record. Set flags to fail the compare against ENABLED. + * Same for direct calls. */ if (flag) { - if (!(rec->flags & FTRACE_FL_REGS) != + if (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)) flag |= FTRACE_FL_REGS; - if (!(rec->flags & FTRACE_FL_TRAMP) != + if (!(rec->flags & FTRACE_FL_TRAMP) != !(rec->flags & FTRACE_FL_TRAMP_EN)) flag |= FTRACE_FL_TRAMP; + + /* + * Direct calls are special, as count matters. + * We must test the record for direct, if the + * DIRECT and DIRECT_EN do not match, but only + * if the count is 1. That's because, if the + * count is something other than one, we do not + * want the direct enabled (it will be done via the + * direct helper). But if DIRECT_EN is set, and + * the count is not one, we need to clear it. 
+ */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_DIRECT) != + !(rec->flags & FTRACE_FL_DIRECT_EN)) + flag |= FTRACE_FL_DIRECT; + } else if (rec->flags & FTRACE_FL_DIRECT_EN) { + flag |= FTRACE_FL_DIRECT; + } } /* If the state of this record hasn't changed, then do nothing */ @@ -2125,6 +2157,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { + /* + * If there's only one user (direct_ops helper) + * then we can call the direct function + * directly (no ftrace trampoline). + */ + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT_EN; + else + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } else { + /* + * Can only call directly if there's + * only one callback to the function. + */ + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } + } } /* @@ -2154,7 +2205,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * and REGS states. The _EN flags must be disabled though. */ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | - FTRACE_FL_REGS_EN); + FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN); } ftrace_bug_type = FTRACE_BUG_NOP; @@ -2309,6 +2360,51 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* Protected by rcu_tasks for reading, and direct_mutex for writing */ +static struct ftrace_hash *direct_functions = EMPTY_HASH; +static DEFINE_MUTEX(direct_mutex); + +/* + * Search the direct_functions hash to see if the given instruction pointer + * has a direct caller attached to it. 
+ */ +static unsigned long find_rec_direct(unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) + return 0; + + return entry->direct; +} + +static void call_direct_funcs(unsigned long ip, unsigned long pip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + unsigned long addr; + + addr = find_rec_direct(ip); + if (!addr) + return; + + arch_ftrace_set_direct_caller(regs, addr); +} + +struct ftrace_ops direct_ops = { + .func = call_direct_funcs, + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + | FTRACE_OPS_FL_PERMANENT, +}; +#else +static inline unsigned long find_rec_direct(unsigned long ip) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -2322,6 +2418,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + if ((rec->flags & FTRACE_FL_DIRECT) && + (ftrace_rec_count(rec) == 1)) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP) { @@ -2354,6 +2459,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + /* Direct calls take precedence over trampolines */ + if (rec->flags & FTRACE_FL_DIRECT_EN) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP_EN) { @@ -3465,10 +3579,11 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; - seq_printf(m, " (%ld)%s%s", + seq_printf(m, " (%ld)%s%s%s", 
ftrace_rec_count(rec), rec->flags & FTRACE_FL_REGS ? " R" : " ", - rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", + rec->flags & FTRACE_FL_DIRECT ? " D" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); if (ops) { @@ -3484,6 +3599,13 @@ static int t_show(struct seq_file *m, void *v) } else { add_trampoline_func(m, NULL, rec); } + if (rec->flags & FTRACE_FL_DIRECT) { + unsigned long direct; + + direct = find_rec_direct(rec->ip); + if (direct) + seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + } } seq_putc(m, '\n'); @@ -4815,6 +4937,143 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/** + * register_ftrace_direct - Call a custom trampoline directly + * @ip: The address of the nop at the beginning of a function + * @addr: The address of the trampoline to call at @ip + * + * This is used to connect a direct call from the nop location (@ip) + * at the start of ftrace traced functions. The location that it calls + * (@addr) must be able to handle a direct call, and save the parameters + * of the function being traced, and restore them (or inject new ones + * if needed), before returning. + * + * Returns: + * 0 on success + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. 
+ */ +int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *free_hash = NULL; + struct dyn_ftrace *rec; + int ret = -EBUSY; + + mutex_lock(&direct_mutex); + + /* See if there's a direct function at @ip already */ + if (find_rec_direct(ip)) + goto out_unlock; + + ret = -ENODEV; + rec = lookup_rec(ip, ip); + if (!rec) + goto out_unlock; + + /* + * Check if the rec says it has a direct call but we didn't + * find one earlier? + */ + if (WARN_ON(rec->flags & FTRACE_FL_DIRECT)) + goto out_unlock; + + /* Make sure the ip points to the exact record */ + ip = rec->ip; + + ret = -ENOMEM; + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + goto out_unlock; + + free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_unlock; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); + if (ret) + remove_hash_entry(direct_functions, entry); + + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { + ret = register_ftrace_function(&direct_ops); + if (ret) + ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + } + + if (ret) + kfree(entry); + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct); + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = __ftrace_lookup_ip(direct_functions, ip); + if 
(!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) { + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + goto out_unlock; + } + + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + if (direct_functions->count == 1) + unregister_ftrace_function(&direct_ops); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + + WARN_ON(ret); + + remove_hash_entry(direct_functions, entry); + + out_unlock: + mutex_unlock(&direct_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address * @ops - the ops to set the filter with -- cgit v1.2.3-59-g8ed1b From 013bf0da0474816f57739daa006c8564ad7396a3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:11:27 -0500 Subject: ftrace: Add ftrace_find_direct_func() As function_graph tracer modifies the return address to insert a trampoline to trace the return of a function, it must be aware of a direct caller, as when it gets called, the function's return address may not be on the stack where it expects. It may have to see if that return address points to a direct caller and adjust if it is. 
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index efe3e521aff4..8b37b8105398 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -51,6 +51,7 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +struct ftrace_direct_func; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) @@ -248,6 +249,7 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { @@ -257,6 +259,10 @@ static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; } +static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + return NULL; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 329a3f3789a1..c4446eabacbe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4938,6 +4938,46 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +struct ftrace_direct_func { + struct list_head next; + unsigned long addr; + int count; +}; + +static LIST_HEAD(ftrace_direct_funcs); + +/** + * ftrace_find_direct_func - test an address if it is a registered direct caller + * @addr: The address of a registered direct caller + * + * This searches to see if a 
ftrace direct caller has been registered + * at a specific address, and if so, it returns a descriptor for it. + * + * This can be used by architecture code to see if an address is + * a direct caller (trampoline) attached to a fentry/mcount location. + * This is useful for the function_graph tracer, as it may need to + * do adjustments if it traced a location that also has a direct + * trampoline attached to it. + */ +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *entry; + bool found = false; + + /* May be called by fgraph trampoline (protected by rcu tasks) */ + list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { + if (entry->addr == addr) { + found = true; + break; + } + } + if (found) + return entry; + + return NULL; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -4957,6 +4997,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { + struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5005,6 +5046,18 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; + direct = ftrace_find_direct_func(addr); + if (!direct) { + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) { + kfree(entry); + goto out_unlock; + } + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + } + entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5019,8 +5072,20 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } - if (ret) + if (ret) { kfree(entry); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + if (free_hash) + 
free_ftrace_hash(free_hash); + free_hash = NULL; + } + } else { + if (!direct->count) + direct->count++; + } out_unlock: mutex_unlock(&direct_mutex); @@ -5036,6 +5101,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_func_entry *entry; + struct ftrace_direct_func *direct; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5066,6 +5132,17 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); + direct = ftrace_find_direct_func(addr); + if (!WARN_ON(!direct)) { + /* This is the good path (see the ! before WARN) */ + direct->count--; + WARN_ON(direct->count < 0); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + } + } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3-59-g8ed1b From a3ad1a7e39689005cb04a4f2adb82f9d55b4724f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:12:57 -0500 Subject: ftrace/x86: Add a counter to test function_graph with direct As testing for direct calls from the function graph tracer adds a little overhead (which is a lot when tracing every function), add a counter that can be used to test if function_graph tracer needs to test for a direct caller or not. It would have been nicer if we could use a static branch, but the static branch logic fails when used within the function graph tracer trampoline. 
Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 8 +++++--- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 4 ++++ 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index fef283f6341d..060a361d9d11 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1049,9 +1049,11 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, * return address is actually off by one word, and we * need to adjust for that. */ - if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { - self_addr = *parent; - parent++; + if (ftrace_direct_func_count) { + if (ftrace_find_direct_func(self_addr + MCOUNT_INSN_SIZE)) { + self_addr = *parent; + parent++; + } } /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2bc7bd6b8387..55647e185141 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -247,10 +247,12 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else +# define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c4446eabacbe..f9456346ec66 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2364,6 +2364,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); +int ftrace_direct_func_count; /* * Search the direct_functions hash to see if 
the given instruction pointer @@ -5056,6 +5057,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct->addr = addr; direct->count = 0; list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; } entry->ip = ip; @@ -5081,6 +5083,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (free_hash) free_ftrace_hash(free_hash); free_hash = NULL; + ftrace_direct_func_count--; } } else { if (!direct->count) @@ -5141,6 +5144,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) list_del_rcu(&direct->next); synchronize_rcu_tasks(); kfree(direct); + ftrace_direct_func_count--; } } out_unlock: -- cgit v1.2.3-59-g8ed1b From da537f0aef1372c5204356a7df06be8769467b7b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Oct 2019 14:38:07 -0400 Subject: ftrace: Add information on number of page groups allocated Looking for ways to shrink the size of the dyn_ftrace structure, knowing the information about how many pages and the number of groups of those pages, is useful in working out the best ways to save on memory. This adds one info print on how many groups of pages were used to allocate the ftrace dyn_ftrace structures, and also shows the number of pages and groups in the dyn_ftrace_total_info (which is used for debugging). 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 ++++++++++++++ kernel/trace/trace.c | 21 +++++++++++++++------ kernel/trace/trace.h | 2 ++ 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9456346ec66..d2d488c43a6a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2991,6 +2991,8 @@ static void ftrace_shutdown_sysctl(void) static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; +unsigned long ftrace_number_of_pages; +unsigned long ftrace_number_of_groups; static inline int ops_traces_mod(struct ftrace_ops *ops) { @@ -3115,6 +3117,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) goto again; } + ftrace_number_of_pages += 1 << order; + ftrace_number_of_groups++; + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; pg->size = cnt; @@ -3170,6 +3175,8 @@ ftrace_allocate_pages(unsigned long num_to_init) start_pg = pg->next; kfree(pg); pg = start_pg; + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; @@ -6173,6 +6180,8 @@ void ftrace_release_mod(struct module *mod) free_pages((unsigned long)pg->records, order); tmp_page = pg->next; kfree(pg); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } } @@ -6514,6 +6523,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) @@ -6569,6 +6580,9 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + pr_info("ftrace: allocated %ld pages with %ld groups\n", + ftrace_number_of_pages, ftrace_number_of_groups); + set_ftrace_early_filters(); return; diff --git 
a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a0ee9178365..5ea8c7c0f2d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7583,14 +7583,23 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned long *p = filp->private_data; - char buf[64]; /* Not too big for a shallow stack */ + ssize_t ret; + char *buf; int r; - r = scnprintf(buf, 63, "%ld", *p); - buf[r++] = '\n'; + /* 256 should be plenty to hold the amount needed */ + buf = kmalloc(256, GFP_KERNEL); + if (!buf) + return -ENOMEM; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; } static const struct file_operations tracing_dyn_info_fops = { @@ -8782,7 +8791,7 @@ static __init int tracer_init_tracefs(void) #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); + NULL, &tracing_dyn_info_fops); #endif create_trace_instances(d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..8b590f10bc72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +extern unsigned long ftrace_number_of_pages; +extern unsigned long ftrace_number_of_groups; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } -- cgit v1.2.3-59-g8ed1b From 91edde2e6ae1dd5e33812f076f3fe4cb7ccbfdd0 Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:21 +0200 Subject: ftrace: Implement fs notification for tracing_max_latency This patch implements the 
feature that the tracing_max_latency file, e.g. /sys/kernel/debug/tracing/tracing_max_latency will receive notifications through the fsnotify framework when a new latency is available. One particularly interesting use of this facility is when enabling threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh, together with the preempt/irqsoff tracers. This makes it possible to implement a user space program that can, with equal probability, obtain traces of latencies that occur immediately after each other in spite of the fact that the preempt/irqsoff tracers operate in overwrite mode. This facility works with the hwlat, preempt/irqsoff, and wakeup tracers. The tracers may call the latency_fsnotify() from places such as __schedule() or do_idle(); this makes it impossible to call queue_work() directly without risking a deadlock. The same would happen with a softirq, kernel thread or tasklet. For this reason we use the irq_work mechanism to call queue_work(). This patch creates a new workqueue. The reason for doing this is that I wanted to use the WQ_UNBOUND and WQ_HIGHPRI flags. My thinking was that WQ_UNBOUND might help with the latency in some important cases. If we use: queue_work(system_highpri_wq, &tr->fsnotify_work); then the work will (almost) always execute on the same CPU but if we are unlucky that CPU could be too busy while there could be another CPU in the system that would be able to process the work soon enough. queue_work_on() could be used to queue the work on another CPU but it seems difficult to select the right CPU. 
Link: http://lkml.kernel.org/r/20191008220824.7911-2-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) [ Added max() to have one compare for max latency ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 18 +++++++++++ kernel/trace/trace_hwlat.c | 11 ++++--- 3 files changed, 98 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ea8c7c0f2d7..f093a433cb42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -45,6 +45,9 @@ #include #include #include +#include +#include +#include #include "trace.h" #include "trace_output.h" @@ -1497,6 +1500,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; +static const struct file_operations tracing_max_lat_fops; + +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, + tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + d_tracer, &tr->max_latency, + &tracing_max_lat_fops); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = 
alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. + */ + irq_work_queue(&tr->fsnotify_irqwork); +} + +/* + * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + * defined(CONFIG_FSNOTIFY) + */ +#else + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", 0644, d_tracer, \ + &tr->max_latency, &tracing_max_lat_fops) + +#endif #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -1536,6 +1607,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* record this tasks comm */ tracing_record_cmdline(tsk); + latency_fsnotify(tr); } /** @@ -8594,8 +8666,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tr->max_latency, &tracing_max_lat_fops); + trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b590f10bc72..718eb998c13e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -264,6 +266,11 @@ struct trace_array { #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; +#ifdef CONFIG_FSNOTIFY + struct dentry *d_max_latency; + struct work_struct fsnotify_work; + struct irq_work fsnotify_irqwork; +#endif #endif struct 
trace_pid_list __rcu *filtered_pids; /* @@ -786,6 +793,17 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +void latency_fsnotify(struct trace_array *tr); + +#else + +static void latency_fsnotify(struct trace_array *tr) { } + +#endif + #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 862f4b0139fc..63526670605a 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -237,6 +237,7 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { struct hwlat_sample s; + u64 latency; ret = 1; @@ -253,11 +254,13 @@ static int get_sample(void) s.nmi_count = nmi_count; trace_hwlat_sample(&s); + latency = max(sample, outer_sample); + /* Keep a running maximum ever recorded hardware latency */ - if (sample > tr->max_latency) - tr->max_latency = sample; - if (outer_sample > tr->max_latency) - tr->max_latency = outer_sample; + if (latency > tr->max_latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } } out: -- cgit v1.2.3-59-g8ed1b From 793937236d1ee032d2ee5ccc27bdd280a04e766e Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:22 +0200 Subject: preemptirq_delay_test: Add the burst feature and a sysfs trigger This burst feature enables the user to generate a burst of preempt/irqsoff latencies. This makes it possible to test whether we are able to detect latencies that systematically occur very close to each other. The maximum burst size is 10. We also create 10 identical test functions, so that we get 10 different backtraces; this is useful when we want to test whether we can detect all the latencies in a burst. 
Otherwise, there would be no easy way of differentiating between which latency in a burst was captured by the tracer. In addition, there is a sysfs trigger, so that it's not necessary to reload the module to repeat the test. The trigger will appear as /sys/kernel/preemptirq_delay_test/trigger in sysfs. Link: http://lkml.kernel.org/r/20191008220824.7911-3-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +- kernel/trace/preemptirq_delay_test.c | 144 ++++++++++++++++++++++++++++++----- 2 files changed, 128 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 624a05e99b0b..d25314bc7a1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -760,9 +760,9 @@ config PREEMPTIRQ_DELAY_TEST configurable delay. The module busy waits for the duration of the critical section. - For example, the following invocation forces a one-time irq-disabled - critical section for 500us: - modprobe preemptirq_delay_test test_mode=irq delay=500000 + For example, the following invocation generates a burst of three + irq-disabled critical sections for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 If unsure, say N diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index d8765c952fab..31c0fad4cb9e 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -10,18 +10,25 @@ #include #include #include +#include #include #include #include #include +#include static ulong delay = 100; -static char test_mode[10] = "irq"; +static char test_mode[12] = "irq"; +static uint burst_size = 1; -module_param_named(delay, delay, ulong, S_IRUGO); -module_param_string(test_mode, test_mode, 10, S_IRUGO); -MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); -MODULE_PARM_DESC(test_mode, "Mode of the test such 
as preempt or irq (default irq)"); +module_param_named(delay, delay, ulong, 0444); +module_param_string(test_mode, test_mode, 12, 0444); +module_param_named(burst_size, burst_size, uint, 0444); +MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); +MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) static void busy_wait(ulong time) { @@ -34,37 +41,136 @@ static void busy_wait(ulong time) } while ((end - start) < (time * 1000)); } -static int preemptirq_delay_run(void *data) +static __always_inline void irqoff_test(void) { unsigned long flags; + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); +} - if (!strcmp(test_mode, "irq")) { - local_irq_save(flags); - busy_wait(delay); - local_irq_restore(flags); - } else if (!strcmp(test_mode, "preempt")) { - preempt_disable(); - busy_wait(delay); - preempt_enable(); +static __always_inline void preemptoff_test(void) +{ + preempt_disable(); + busy_wait(delay); + preempt_enable(); +} + +static void execute_preemptirqtest(int idx) +{ + if (!strcmp(test_mode, "irq")) + irqoff_test(); + else if (!strcmp(test_mode, "preempt")) + preemptoff_test(); + else if (!strcmp(test_mode, "alternate")) { + if (idx % 2 == 0) + irqoff_test(); + else + preemptoff_test(); } +} + +#define DECLARE_TESTFN(POSTFIX) \ + static void preemptirqtest_##POSTFIX(int idx) \ + { \ + execute_preemptirqtest(idx); \ + } \ +/* + * We create 10 different functions, so that we can get 10 different + * backtraces. 
+ */ +DECLARE_TESTFN(0) +DECLARE_TESTFN(1) +DECLARE_TESTFN(2) +DECLARE_TESTFN(3) +DECLARE_TESTFN(4) +DECLARE_TESTFN(5) +DECLARE_TESTFN(6) +DECLARE_TESTFN(7) +DECLARE_TESTFN(8) +DECLARE_TESTFN(9) + +static void (*testfuncs[])(int) = { + preemptirqtest_0, + preemptirqtest_1, + preemptirqtest_2, + preemptirqtest_3, + preemptirqtest_4, + preemptirqtest_5, + preemptirqtest_6, + preemptirqtest_7, + preemptirqtest_8, + preemptirqtest_9, +}; + +#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs) + +static int preemptirq_delay_run(void *data) +{ + int i; + int s = MIN(burst_size, NR_TEST_FUNCS); + + for (i = 0; i < s; i++) + (testfuncs[i])(i); return 0; } -static int __init preemptirq_delay_init(void) +static struct task_struct *preemptirq_start_test(void) { char task_name[50]; - struct task_struct *test_task; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + return kthread_run(preemptirq_delay_run, NULL, task_name); +} + + +static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + preemptirq_start_test(); + return count; +} + +static struct kobj_attribute trigger_attribute = + __ATTR(trigger, 0200, NULL, trigger_store); + +static struct attribute *attrs[] = { + &trigger_attribute.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *preemptirq_delay_kobj; + +static int __init preemptirq_delay_init(void) +{ + struct task_struct *test_task; + int retval; + + test_task = preemptirq_start_test(); + retval = PTR_ERR_OR_ZERO(test_task); + if (retval != 0) + return retval; + + preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test", + kernel_kobj); + if (!preemptirq_delay_kobj) + return -ENOMEM; + + retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group); + if (retval) + kobject_put(preemptirq_delay_kobj); - test_task = kthread_run(preemptirq_delay_run, NULL, task_name); - return PTR_ERR_OR_ZERO(test_task); + return retval; } static 
void __exit preemptirq_delay_exit(void) { - return; + kobject_put(preemptirq_delay_kobj); } module_init(preemptirq_delay_init) -- cgit v1.2.3-59-g8ed1b From 9c34fc4b7e903117cc27712b9e6c8690debb7e95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:20 +0200 Subject: tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Add additional header output for PREEMPT_RT. Link: http://lkml.kernel.org/r/20191015191821.11479-34-bigeasy@linutronix.de Cc: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 2 +- kernel/trace/trace.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 740bd0224d35..2a05e770618a 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -146,7 +146,7 @@ FTRACE_OPS_FL_RECURSION_SAFE itself or any nested functions that those functions call. If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPT is set), + be called with preemption enabled (when CONFIG_PREEMPTION is set), but this is not guaranteed. 
FTRACE_OPS_FL_IPMODIFY diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f093a433cb42..db7d06a26861 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3726,6 +3726,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "desktop", #elif defined(CONFIG_PREEMPT) "preempt", +#elif defined(CONFIG_PREEMPT_RT) + "preempt_rt", #else "unknown", #endif -- cgit v1.2.3-59-g8ed1b From 6dff4d7dd3e0158688683a17dd792861aa9d61e2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 13:10:12 +0100 Subject: tracing: Make internal ftrace events static The event_class_ftrace_##call and event_##call do not seem to be used outside of trace_export.c so make them both static to avoid a number of sparse warnings: kernel/trace/trace_entries.h:59:1: warning: symbol 'event_class_ftrace_function' was not declared. Should it be static? kernel/trace/trace_entries.h:59:1: warning: symbol '__event_function' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol 'event_class_ftrace_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol '__event_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol 'event_class_ftrace_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol '__event_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol 'event_class_ftrace_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol '__event_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol 'event_class_ftrace_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol '__event_wakeup' was not declared. Should it be static? 
kernel/trace/trace_entries.h:171:1: warning: symbol 'event_class_ftrace_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol '__event_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol 'event_class_ftrace_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol '__event_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol 'event_class_ftrace_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol '__event_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol 'event_class_ftrace_print' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol '__event_print' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol 'event_class_ftrace_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol '__event_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol 'event_class_ftrace_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol '__event_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol 'event_class_ftrace_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol '__event_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol 'event_class_ftrace_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol '__event_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol 'event_class_ftrace_branch' was not declared. Should it be static? 
kernel/trace/trace_entries.h:322:1: warning: symbol '__event_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol 'event_class_ftrace_hwlat' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol '__event_hwlat' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191015121012.18824-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..2e6d2e9741cc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -171,7 +171,7 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct trace_event_class __refdata event_class_ftrace_##call = { \ +static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ @@ -187,7 +187,7 @@ struct trace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct trace_event_call __used \ +static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -- cgit v1.2.3-59-g8ed1b From 2d6425af61166e026e7476db64f70f1266127b1d Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:23 -0700 Subject: tracing: Declare newly exported APIs in include/linux/trace.h Declare the newly introduced and exported APIs in the header file - include/linux/trace.h. Moving previous declarations from kernel/trace/trace.h to include/linux/trace.h. 
Link: http://lkml.kernel.org/r/1565805327-579-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 7 +++++++ kernel/trace/trace.h | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace.h b/include/linux/trace.h index b95ffb2188ab..24fcf07812ae 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -24,6 +24,13 @@ struct trace_export { int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); +struct trace_array; + +void trace_printk_init_buffers(void); +int trace_array_printk(struct trace_array *tr, unsigned long ip, + const char *fmt, ...); +struct trace_array *trace_array_create(const char *name); +int trace_array_destroy(struct trace_array *tr); #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 718eb998c13e..90cba68c8b50 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -873,8 +874,6 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...); int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -1890,7 +1889,6 @@ extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); -void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); -- cgit v1.2.3-59-g8ed1b From 
e585e6469d6f476b82aa148dc44aaf7ae269a4e2 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:24 -0700 Subject: tracing: Verify if trace array exists before destroying it. A trace array can be destroyed from userspace or kernel. Verify if the trace array exists before proceeding to destroy/remove it. Link: http://lkml.kernel.org/r/1565805327-579-3-git-send-email-divya.indi@oracle.com Reviewed-by: Aruna Ramakrishna Signed-off-by: Divya Indi [ Removed unneeded braces ] Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 6 +++++- kernel/trace/trace.c | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..6e2fd40a6ed9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); - module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod) if (err) return err; + /* Make module executable after ftrace is enabled */ + mutex_lock(&module_mutex); + module_enable_x(mod); + mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index db7d06a26861..fa4f742fc449 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8556,17 +8556,26 @@ static int __remove_instance(struct trace_array *tr) return 0; } -int trace_array_destroy(struct trace_array *tr) +int trace_array_destroy(struct trace_array *this_tr) { + struct trace_array *tr; int ret; - if (!tr) + if (!this_tr) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = __remove_instance(tr); + ret = -ENODEV; + + /* Making sure trace array exists before destroying it. 
*/ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + ret = __remove_instance(tr); + break; + } + } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); -- cgit v1.2.3-59-g8ed1b From 953ae45a0c25e09428d4a03d7654f97ab8a36647 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:25 -0700 Subject: tracing: Adding NULL checks for trace_array descriptor pointer As part of commit f45d1225adb0 ("tracing: Kernel access to Ftrace instances") we exported certain functions. Here, we are adding some additional NULL checks to ensure safe usage by users of these APIs. Link: http://lkml.kernel.org/r/1565805327-579-4-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa4f742fc449..79fe4d6ecbd8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3297,6 +3297,9 @@ int trace_array_printk(struct trace_array *tr, if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; + if (!tr) + return -ENOENT; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..2a3ac2365445 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -793,6 +793,8 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) char *event = NULL, *sub = NULL, *match; int ret; + if (!tr) + return -ENOENT; /* * The buf format can be : * *: means any event by that name. 
-- cgit v1.2.3-59-g8ed1b From b83b43ffc6e4b514ca034a0fbdee01322e2f7022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 15 Oct 2019 09:00:55 -0400 Subject: fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub The C compiler is allowing more checks to make sure that function pointers are assigned to the correct prototype function. Unfortunately, the function graph tracer uses a special name with its assigned ftrace_graph_return function pointer that maps to a stub function used by the function tracer (ftrace_stub). The ftrace_graph_return variable is compared to the ftrace_stub in some archs to know if the function graph tracer is enabled or not. This means we can not just simply create a new function stub that compares it without modifying all the archs. Instead, have the linker script create a function_graph_stub that maps to ftrace_stub, and this way we can define the prototype for it to match the prototype of ftrace_graph_return, and make the compiler checks all happy! 
Link: http://lkml.kernel.org/r/20191015090055.789a0aed@gandalf.local.home Cc: linux-sh@vger.kernel.org Cc: Yoshinori Sato Cc: Rich Felker Reported-by: Sami Tolvanen Signed-off-by: Steven Rostedt (VMware) --- arch/sh/boot/compressed/misc.c | 5 +++++ include/asm-generic/vmlinux.lds.h | 17 ++++++++++++++--- kernel/trace/fgraph.c | 11 ++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index c15cac9251b9..e69ec12cbbe6 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -111,6 +111,11 @@ void __stack_chk_fail(void) error("stack-protector: Kernel stack is corrupted\n"); } +/* Needed because vmlinux.lds.h references this */ +void ftrace_stub(void) +{ +} + #ifdef CONFIG_SUPERH64 #define stackalign 8 #else diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index dae64600ccbf..0f358be551cd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -111,18 +111,29 @@ #ifdef CONFIG_FTRACE_MCOUNT_RECORD #ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY +/* + * Need to also make ftrace_graph_stub point to ftrace_stub + * so that the same stub location may have different protocols + * and not mess up with C verifiers. + */ #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__patchable_function_entries)) \ - __stop_mcount_loc = .; + __stop_mcount_loc = .; \ + ftrace_graph_stub = ftrace_stub; #else #define MCOUNT_REC() . 
= ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ - __stop_mcount_loc = .; + __stop_mcount_loc = .; \ + ftrace_graph_stub = ftrace_stub; #endif #else -#define MCOUNT_REC() +# ifdef CONFIG_FUNCTION_TRACER +# define MCOUNT_REC() ftrace_graph_stub = ftrace_stub; +# else +# define MCOUNT_REC() +# endif #endif #ifdef CONFIG_TRACE_BRANCH_PROFILING diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 7950a0356042..fa3ce10d0405 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -332,9 +332,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) return 0; } +/* + * Simply points to ftrace_stub, but with the proper protocol. + * Defined by the linker script in linux/vmlinux.lds.h + */ +extern void ftrace_graph_stub(struct ftrace_graph_ret *); + /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -614,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3-59-g8ed1b From 80042c8f06bf5a7b87a63deaa3deb56f2cd52645 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:56 +0300 Subject: tracing: Use generic type for comparator function Comparator function type, cmp_func_t, is defined in the types.h, use it in the code. 
Link: http://lkml.kernel.org/r/20191007135656.37734-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ kernel/trace/trace_branch.c | 8 ++++---- kernel/trace/trace_stat.c | 6 ++---- kernel/trace/trace_stat.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2d488c43a6a..82ef8d60a42b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,10 +465,10 @@ static void *function_stat_start(struct tracer_stat *trace) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* function graph compares on total time */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->time < b->time) return -1; @@ -479,10 +479,10 @@ static int function_stat_cmp(void *p1, void *p2) } #else /* not function graph compares against hits */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3ea65cdff30d..88e158d27965 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m) return 0; } -static inline long get_incorrect_percent(struct ftrace_branch_data *p) +static inline long get_incorrect_percent(const struct ftrace_branch_data *p) { long percent; @@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx) return p; } -static int annotated_branch_stat_cmp(void *p1, void *p2) 
+static int annotated_branch_stat_cmp(const void *p1, const void *p2) { - struct ftrace_branch_data *a = p1; - struct ftrace_branch_data *b = p2; + const struct ftrace_branch_data *a = p1; + const struct ftrace_branch_data *b = p2; long percent_a, percent_b; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 9ab0a1a7ad5e..874f1274cf99 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -72,9 +72,7 @@ static void destroy_session(struct stat_session *session) kfree(session); } -typedef int (*cmp_stat_t)(void *, void *); - -static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct stat_node *data; @@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) * This one will force an insertion as right-most node * in the rbtree. */ -static int dummy_cmp(void *p1, void *p2) +static int dummy_cmp(const void *p1, const void *p2) { return -1; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8786d17caf49..31d7dc5bf1db 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -16,7 +16,7 @@ struct tracer_stat { void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ - int (*stat_cmp)(void *p1, void *p2); + cmp_func_t stat_cmp; /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); /* Release an entry */ -- cgit v1.2.3-59-g8ed1b From 0c3c86bdc691c794a6154f8515b7fa82c82dfc4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:17 -0700 Subject: tracing/hwlat: Fix a few trivial nits Update the source file name in the comments, and fix a grammatical error. Link: http://lkml.kernel.org/r/157073346821.17189.8946944856026592247.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. 
Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 63526670605a..6638d63f0921 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * trace_hwlatdetect.c - A simple Hardware Latency detector. + * trace_hwlat.c - A simple Hardware Latency detector. * * Use this tracer to detect large system latencies induced by the behavior of * certain underlying system hardware or firmware, independent of Linux itself. @@ -279,7 +279,7 @@ static void move_to_next_cpu(void) return; /* * If for some reason the user modifies the CPU affinity - * of this thread, than stop migrating for the duration + * of this thread, then stop migrating for the duration * of the current test. */ if (!cpumask_equal(current_mask, current->cpus_ptr)) -- cgit v1.2.3-59-g8ed1b From 6ee40511cb838f9ced002dff7131bca87e3ccbdd Mon Sep 17 00:00:00 2001 From: Yuming Han Date: Thu, 24 Oct 2019 11:34:30 +0800 Subject: tracing: use kvcalloc for tgid_map array allocation Allocating memory for tgid_map can fail, because it requires an order-6 page allocation.
detail as: c3 sh: page allocation failure: order:6, mode:0x140c0c0(GFP_KERNEL), nodemask=(null) c3 sh cpuset=/ mems_allowed=0 c3 CPU: 3 PID: 5632 Comm: sh Tainted: G W O 4.14.133+ #10 c3 Hardware name: Generic DT based system c3 Backtrace: c3 [] (dump_backtrace) from [](show_stack+0x18/0x1c) c3 [] (show_stack) from [](dump_stack+0x84/0xa4) c3 [] (dump_stack) from [](warn_alloc+0xc4/0x19c) c3 [] (warn_alloc) from [](__alloc_pages_nodemask+0xd18/0xf28) c3 [] (__alloc_pages_nodemask) from [](kmalloc_order+0x20/0x38) c3 [] (kmalloc_order) from [](kmalloc_order_trace+0x24/0x108) c3 [] (kmalloc_order_trace) from [](set_tracer_flag+0xb0/0x158) c3 [] (set_tracer_flag) from [](trace_options_core_write+0x7c/0xcc) c3 [] (trace_options_core_write) from [](__vfs_write+0x40/0x14c) c3 [] (__vfs_write) from [](vfs_write+0xc4/0x198) c3 [] (vfs_write) from [](SyS_write+0x6c/0xd0) c3 [] (SyS_write) from [](ret_fast_syscall+0x0/0x54) Switch to use kvcalloc to avoid unexpected allocation failures. Link: http://lkml.kernel.org/r/1571888070-24425-1-git-send-email-chunyan.zhang@unisoc.com Signed-off-by: Yuming Han Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 79fe4d6ecbd8..42659ce6ac0c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4686,7 +4686,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { -- cgit v1.2.3-59-g8ed1b From c7411a1a126f649be71526a36d4afac9e5aefa13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Oct 2019 17:31:44 +0900 Subject: tracing/kprobe: Check whether the non-suffixed symbol is notrace Check whether the non-suffixed symbol is notrace, since 
suffixed symbols are generated by the compilers for optimization. Based on these suffixed symbols, notrace check might not work because some of them are just a partial code of the original function. (e.g. cold-cache (unlikely) code is separated from original function as FUNCTION.cold.XX) For example, without this fix, # echo p device_add.cold.67 > /sys/kernel/debug/tracing/kprobe_events sh: write error: Invalid argument # cat /sys/kernel/debug/tracing/error_log [ 135.491035] trace_kprobe: error: Failed to register probe event Command: p device_add.cold.67 ^ # dmesg | tail -n 1 [ 135.488599] trace_kprobe: Could not probe notrace function device_add.cold.67 With this, # echo p device_add.cold.66 > /sys/kernel/debug/tracing/kprobe_events # cat /sys/kernel/debug/kprobes/list ffffffff81599de9 k device_add.cold.66+0x0 [DISABLED] Actually, kprobe blacklist already did similar thing, see within_kprobe_blacklist(). Link: http://lkml.kernel.org/r/157233790394.6706.18243942030937189679.stgit@devnote2 Fixes: 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..7f890262c8a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -435,11 +435,10 @@ static int disable_trace_kprobe(struct trace_event_call *call, #if defined(CONFIG_KPROBES_ON_FTRACE) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) -static bool within_notrace_func(struct trace_kprobe *tk) +static bool __within_notrace_func(unsigned long addr) { - unsigned long offset, size, addr; + unsigned long offset, size; - addr = trace_kprobe_address(tk); if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) return false; @@ -452,6 +451,28 @@ static bool within_notrace_func(struct 
trace_kprobe *tk) */ return !ftrace_location_range(addr, addr + size - 1); } + +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long addr = addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN], *p; + + if (!__within_notrace_func(addr)) + return false; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return true; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_notrace_func(addr); + } + + return true; +} #else #define within_notrace_func(tk) (false) #endif -- cgit v1.2.3-59-g8ed1b From ef56e047b2bd4dabb801fd073dfcab5f40de5f78 Mon Sep 17 00:00:00 2001 From: Piotr Maziarz Date: Thu, 7 Nov 2019 13:45:38 +0100 Subject: tracing: Use seq_buf_hex_dump() to dump buffers Without this, buffers can be printed with the __print_array macro, which has no formatting options and can be hard to read. The other way is to mimic the formatting capability with multiple calls of a trace event, one call per row, which hurts performance and gives each row a different timestamp.
Link: http://lkml.kernel.org/r/1573130738-29390-2-git-send-email-piotrx.maziarz@linux.intel.com Signed-off-by: Piotr Maziarz Signed-off-by: Cezary Rojewski Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 5 +++++ include/linux/trace_seq.h | 4 ++++ include/trace/trace_events.h | 6 ++++++ kernel/trace/trace_output.c | 15 +++++++++++++++ kernel/trace/trace_seq.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 30a8cdcfd4a4..60a41b7069dd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -45,6 +45,11 @@ const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); + struct trace_iterator; struct trace_event; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6609b39a7232..6c30508fca19 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -92,6 +92,10 @@ extern int trace_seq_path(struct trace_seq *s, const struct path *path); extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, int nmaskbits); +extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); + #else /* CONFIG_TRACING */ static inline void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
{ diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 4ecdfe2e3580..7089760d4c7a 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -340,6 +340,12 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_array_seq(p, array, count, el_size); \ }) +#undef __print_hex_dump +#define __print_hex_dump(prefix_str, prefix_type, \ + rowsize, groupsize, buf, len, ascii) \ + trace_print_hex_dump_seq(p, prefix_str, prefix_type, \ + rowsize, groupsize, buf, len, ascii) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d54ce252b05a..d9b4b7c22db4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count, } EXPORT_SYMBOL(trace_print_array_seq); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_putc(p, '\n'); + trace_seq_hex_dump(p, prefix_str, prefix_type, + rowsize, groupsize, buf, len, ascii); + trace_seq_putc(p, 0); + return ret; +} +EXPORT_SYMBOL(trace_print_hex_dump_seq); + int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 6b1c562ffdaf..344e4c1aa09c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -376,3 +376,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); + +int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + unsigned int save_len = 
s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_hex_dump(&(s->seq), prefix_str, + prefix_type, rowsize, groupsize, + buf, len, ascii); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL(trace_seq_hex_dump); -- cgit v1.2.3-59-g8ed1b From 9b4712044d059e7842aaeeafd7c7a7ee88c589db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Nov 2019 18:42:19 +0100 Subject: tracing: Remove stray tab in TRACE_EVAL_MAP_FILE's help text There was a stray tab in the help text of the aforementioned config option which showed like this: The "print fmt" of the trace events will show the enum/sizeof names instead of their values. This can cause problems for user space tools ... in menuconfig. Remove it and end a sentence with a fullstop. No functional changes. Link: http://lkml.kernel.org/r/20191112174219.10933-1-bp@alien8.de Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d25314bc7a1c..b872716bb2a0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -771,7 +771,7 @@ config TRACE_EVAL_MAP_FILE depends on TRACING help The "print fmt" of the trace events will show the enum/sizeof names - instead of their values. This can cause problems for user space tools + instead of their values. This can cause problems for user space tools that use this string to parse the raw data as user space does not know how to convert the string to its value. @@ -792,7 +792,7 @@ config TRACE_EVAL_MAP_FILE they are needed for the "eval_map" file. Enabling this option will increase the memory footprint of the running kernel. - If unsure, say N + If unsure, say N. 
config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" -- cgit v1.2.3-59-g8ed1b From 36b3615dc3b625c8b587f34e413a600f7ac16403 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 22:43:58 -0500 Subject: tracing: Add missing "inline" in stub function of latency_fsnotify() The latency_fsnotify() stub when the function is not defined, was missing the "inline". Link: https://lore.kernel.org/r/20191115140213.74c5efe7@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90cba68c8b50..2df8aed6a8f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -801,7 +801,7 @@ void latency_fsnotify(struct trace_array *tr); #else -static void latency_fsnotify(struct trace_array *tr) { } +static inline void latency_fsnotify(struct trace_array *tr) { } #endif -- cgit v1.2.3-59-g8ed1b From 0567d6809182df53da03636fad36c507c5cf07a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 14:39:35 -0500 Subject: ftrace: Add modify_ftrace_direct() Add a new function modify_ftrace_direct() that will allow a user to update an existing direct caller to a new trampoline, without missing hits due to unregistering one and then adding another. 
Link: https://lore.kernel.org/r/20191109022907.6zzo6orhxpt5n2sv@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 55647e185141..73eb2e93593f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,7 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); +int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); #else # define ftrace_direct_func_count 0 @@ -261,6 +262,11 @@ static inline int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { return -ENODEV; } +static inline int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + return -ENODEV; +} static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) { return NULL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 82ef8d60a42b..834f3556ea1e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5160,6 +5160,84 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct); + +static struct ftrace_ops stub_ops = { + .func = ftrace_stub, +}; + +/** + * modify_ftrace_direct - Modify an existing direct call to call something else + * @ip: The instruction pointer to modify + * @old_addr: The address that the current @ip calls directly + * @new_addr: The address that the @ip should call + * + * This modifies a ftrace direct caller at an 
instruction pointer without + * having to disable it first. The direct call will switch over to the + * @new_addr without missing anything. + * + * Returns: zero on success. Non zero on error, which includes: + * -ENODEV : the @ip given has no direct caller attached + * -EINVAL : the @old_addr does not match the current direct caller + */ +int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) + goto out_unlock; + + ip = rec->ip; + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + ret = -EINVAL; + if (entry->direct != old_addr) + goto out_unlock; + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_unlock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_unlock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. 
+ */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + ret = 0; + + out_unlock: + mutex_unlock(&direct_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3-59-g8ed1b From e9838bd51169af87ae248336d4c3fc59184a0e46 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Nov 2019 18:12:01 +0100 Subject: irq_work: Fix IRQ_WORK_BUSY bit clearing While attempting to clear the busy bit at the end of a work execution, atomic_cmpxchg() expects the value of the flags with the pending bit cleared as the old value. However by mistake the value of the flags is passed without clearing the pending bit first. As a result, clearing the busy bit fails and irq_work_sync() may stall: watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [blktrace:4948] CPU: 0 PID: 4948 Comm: blktrace Not tainted 5.4.0-rc7-00003-gfeb4a51323bab #1 RIP: 0010:irq_work_sync+0x4/0x10 Call Trace: relay_close_buf+0x19/0x50 relay_close+0x64/0x100 blk_trace_free+0x1f/0x50 __blk_trace_remove+0x1e/0x30 blk_trace_ioctl+0x11b/0x140 blkdev_ioctl+0x6c1/0xa40 block_ioctl+0x39/0x40 do_vfs_ioctl+0xa5/0x700 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1d0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 So clear the appropriate bit before passing the old flags to cmpxchg(). 
Fixes: feb4a51323ba ("irq_work: Slightly simplify IRQ_WORK_PENDING clearing") Reported-by: kernel test robot Reported-by: Leonard Crestez Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Leonard Crestez Link: https://lkml.kernel.org/r/20191113171201.14032-1-frederic@kernel.org --- kernel/irq_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 49c53f80a13a..828cc30774bc 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -158,6 +158,7 @@ static void irq_work_run_list(struct llist_head *list) * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ + flags &= ~IRQ_WORK_PENDING; (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } -- cgit v1.2.3-59-g8ed1b From 20a15ee040f23bd553d4e6bbb1f8724ccd282abc Mon Sep 17 00:00:00 2001 From: luanshi Date: Wed, 13 Nov 2019 22:41:33 +0800 Subject: genirq: Fix function documentation of __irq_alloc_descs() The function got renamed at some point, but the kernel-doc was not updated. Signed-off-by: Liguang Zhang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1573656093-8643-1-git-send-email-zhangliguang@linux.alibaba.com --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9be995fc3c5a..5b8fdd659e54 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -750,7 +750,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) EXPORT_SYMBOL_GPL(irq_free_descs); /** - * irq_alloc_descs - allocate and initialize a range of irq descriptors + * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. 
-- cgit v1.2.3-59-g8ed1b From 5d603311615f612320bb77bd2a82553ef1ced5b7 Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Wed, 13 Nov 2019 12:29:50 +0300 Subject: kernel/module.c: wakeup processes in module_wq on module unload Fix the race between loading and unloading a kernel module. sys_delete_module() try_stop_module() mod->state = _GOING add_unformed_module() old = find_module_all() (old->state == _GOING => wait_event_interruptible()) During pre-condition finished_loading() returns 0 schedule() (never gets woken up later) free_module() mod->state = _UNFORMED list_del_rcu(&mod->list) (dels mod from "modules" list) return The race above leads to modprobe hanging forever on loading a module. Error paths on loading a module call wake_up_all(&module_wq) after freeing the module, so let's do the same on straight module unload. Fixes: 6e6de3dee51a ("kernel/module.c: Only return -EEXIST for modules that have finished loading") Reviewed-by: Prarit Bhargava Signed-off-by: Konstantin Khorenko Signed-off-by: Jessica Yu --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 26c13173da3d..bdbf95726cb7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1033,6 +1033,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); free_module(mod); + /* someone could wait for the module in add_unformed_module() */ + wake_up_all(&module_wq); return 0; out: mutex_unlock(&module_mutex); -- cgit v1.2.3-59-g8ed1b From 3ca47e958a64b1116a2c35e65dcf467fc53d52de Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Apr 2019 17:43:50 +0200 Subject: y2038: remove CONFIG_64BIT_TIME The CONFIG_64BIT_TIME option is defined on all architectures, and can be removed for simplicity now.
Signed-off-by: Arnd Bergmann --- arch/Kconfig | 8 -------- fs/aio.c | 2 +- ipc/syscall.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/time.c | 4 ++-- net/socket.c | 2 +- 6 files changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 5f8a5d84dbbe..0e1fded2940e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -796,14 +796,6 @@ config OLD_SIGACTION config COMPAT_OLD_SIGACTION bool -config 64BIT_TIME - def_bool y - help - This should be selected by all architectures that need to support - new system calls with a 64-bit time_t. This is relevant on all 32-bit - architectures, and 64-bit architectures as part of compat syscall - handling. - config COMPAT_32BIT_TIME def_bool !64BIT || COMPAT help diff --git a/fs/aio.c b/fs/aio.c index 01e0fb9ae45a..447e3a0c572c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2056,7 +2056,7 @@ static long do_io_getevents(aio_context_t ctx_id, * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented. 
*/ -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, diff --git a/ipc/syscall.c b/ipc/syscall.c index 581bdff4e7c5..dfb0e988d542 100644 --- a/ipc/syscall.c +++ b/ipc/syscall.c @@ -30,7 +30,7 @@ int ksys_ipc(unsigned int call, int first, unsigned long second, return ksys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); case SEMTIMEDOP: - if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + if (IS_ENABLED(CONFIG_64BIT)) return ksys_semtimedop(first, ptr, second, (const struct __kernel_timespec __user *)fifth); else if (IS_ENABLED(CONFIG_COMPAT_32BIT_TIME)) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..9e20873148c6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1940,7 +1940,7 @@ out: return ret; } -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..96b8c02657ed 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -267,7 +267,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ @@ -881,7 +881,7 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; diff --git a/net/socket.c b/net/socket.c index 6a9ab7a8b1d2..98f6544b0096 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2833,7 +2833,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned 
long __user *, args) a[2], true); break; case SYS_RECVMMSG: - if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + if (IS_ENABLED(CONFIG_64BIT)) err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct __kernel_timespec __user *)a[4], -- cgit v1.2.3-59-g8ed1b From 2a785996cc5e2fc1d1d29d196f530905f68d2dc2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Nov 2019 11:10:01 +0100 Subject: y2038: uapi: change __kernel_time_t to __kernel_old_time_t This is mainly a patch for clarification, and to let us remove the time_t definition from the kernel to prevent new users from creeping in that might not be y2038-safe. All remaining uses of 'time_t' or '__kernel_time_t' are part of the user API that cannot be changed, but they either have a replacement or do not suffer from the y2038 overflow. Acked-by: Deepa Dinamani Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 4 ++-- include/linux/time32.h | 2 +- include/linux/types.h | 2 +- include/uapi/linux/cyclades.h | 6 +++--- include/uapi/linux/msg.h | 6 +++--- include/uapi/linux/ppp_defs.h | 4 ++-- include/uapi/linux/sem.h | 4 ++-- include/uapi/linux/shm.h | 6 +++--- include/uapi/linux/time.h | 6 +++--- include/uapi/linux/time_types.h | 4 ++-- include/uapi/linux/utime.h | 4 ++-- kernel/time/time.c | 6 +++--- 12 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f7c561c4dcdd..2f27bc9d5ef0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1076,7 +1076,7 @@ asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice); asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpgrp(void); asmlinkage long sys_pause(void); -asmlinkage long sys_time(time_t __user *tloc); +asmlinkage long sys_time(__kernel_old_time_t __user *tloc); asmlinkage long sys_time32(old_time32_t __user *tloc); #ifdef __ARCH_WANT_SYS_UTIME
asmlinkage long sys_utime(char __user *filename, @@ -1116,7 +1116,7 @@ asmlinkage long sys_sysfs(int option, asmlinkage long sys_fork(void); /* obsolete: kernel/time/time.c */ -asmlinkage long sys_stime(time_t __user *tptr); +asmlinkage long sys_stime(__kernel_old_time_t __user *tptr); asmlinkage long sys_stime32(old_time32_t __user *tptr); /* obsolete: kernel/signal.c */ diff --git a/include/linux/time32.h b/include/linux/time32.h index 0a1f302a1753..cad4c3186002 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -12,7 +12,7 @@ #include #include -#define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) +#define TIME_T_MAX (__kernel_old_time_t)((1UL << ((sizeof(__kernel_old_time_t) << 3) - 1)) - 1) typedef s32 old_time32_t; diff --git a/include/linux/types.h b/include/linux/types.h index 05030f608be3..e32c1180b742 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -67,7 +67,7 @@ typedef __kernel_ptrdiff_t ptrdiff_t; #ifndef _TIME_T #define _TIME_T -typedef __kernel_time_t time_t; +typedef __kernel_old_time_t time_t; #endif #ifndef _CLOCK_T diff --git a/include/uapi/linux/cyclades.h b/include/uapi/linux/cyclades.h index 8279bc3d60ca..fc0add2194a9 100644 --- a/include/uapi/linux/cyclades.h +++ b/include/uapi/linux/cyclades.h @@ -83,9 +83,9 @@ struct cyclades_monitor { * open) */ struct cyclades_idle_stats { - __kernel_time_t in_use; /* Time device has been in use (secs) */ - __kernel_time_t recv_idle; /* Time since last char received (secs) */ - __kernel_time_t xmit_idle; /* Time since last char transmitted (secs) */ + __kernel_old_time_t in_use; /* Time device has been in use (secs) */ + __kernel_old_time_t recv_idle; /* Time since last char received (secs) */ + __kernel_old_time_t xmit_idle; /* Time since last char transmitted (secs) */ unsigned long recv_bytes; /* Bytes received */ unsigned long xmit_bytes; /* Bytes transmitted */ unsigned long overruns; /* Input overruns */ diff --git a/include/uapi/linux/msg.h 
b/include/uapi/linux/msg.h index e4a0d9a9a9e8..01ee8d54c1c8 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h @@ -19,9 +19,9 @@ struct msqid_ds { struct ipc_perm msg_perm; struct msg *msg_first; /* first message on queue,unused */ struct msg *msg_last; /* last message in queue,unused */ - __kernel_time_t msg_stime; /* last msgsnd time */ - __kernel_time_t msg_rtime; /* last msgrcv time */ - __kernel_time_t msg_ctime; /* last change time */ + __kernel_old_time_t msg_stime; /* last msgsnd time */ + __kernel_old_time_t msg_rtime; /* last msgrcv time */ + __kernel_old_time_t msg_ctime; /* last change time */ unsigned long msg_lcbytes; /* Reuse junk fields for 32 bit */ unsigned long msg_lqbytes; /* ditto */ unsigned short msg_cbytes; /* current number of bytes on queue */ diff --git a/include/uapi/linux/ppp_defs.h b/include/uapi/linux/ppp_defs.h index fff51b91b409..9277a33d08a2 100644 --- a/include/uapi/linux/ppp_defs.h +++ b/include/uapi/linux/ppp_defs.h @@ -144,8 +144,8 @@ struct ppp_comp_stats { * the last NP packet was sent or received. */ struct ppp_idle { - __kernel_time_t xmit_idle; /* time since last NP packet sent */ - __kernel_time_t recv_idle; /* time since last NP packet received */ + __kernel_old_time_t xmit_idle; /* time since last NP packet sent */ + __kernel_old_time_t recv_idle; /* time since last NP packet received */ }; #endif /* _UAPI_PPP_DEFS_H_ */ diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index 39a1876f039e..75aa3b273cd9 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h @@ -24,8 +24,8 @@ /* Obsolete, used only for backwards compatibility and libc5 compiles */ struct semid_ds { struct ipc_perm sem_perm; /* permissions .. 
see ipc.h */ - __kernel_time_t sem_otime; /* last semop time */ - __kernel_time_t sem_ctime; /* create/last semctl() time */ + __kernel_old_time_t sem_otime; /* last semop time */ + __kernel_old_time_t sem_ctime; /* create/last semctl() time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct sem_queue *sem_pending; /* pending operations to be processed */ struct sem_queue **sem_pending_last; /* last pending operation */ diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 6507ad0afc81..8d1f17a4e08e 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -28,9 +28,9 @@ struct shmid_ds { struct ipc_perm shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ - __kernel_time_t shm_atime; /* last attach time */ - __kernel_time_t shm_dtime; /* last detach time */ - __kernel_time_t shm_ctime; /* last change time */ + __kernel_old_time_t shm_atime; /* last attach time */ + __kernel_old_time_t shm_dtime; /* last detach time */ + __kernel_old_time_t shm_ctime; /* last change time */ __kernel_ipc_pid_t shm_cpid; /* pid of creator */ __kernel_ipc_pid_t shm_lpid; /* pid of last operator */ unsigned short shm_nattch; /* no. 
of current attaches */ diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 958932effc5e..a655aa28dc6e 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -8,13 +8,13 @@ #ifndef _STRUCT_TIMESPEC #define _STRUCT_TIMESPEC struct timespec { - __kernel_time_t tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ + __kernel_old_time_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ }; #endif struct timeval { - __kernel_time_t tv_sec; /* seconds */ + __kernel_old_time_t tv_sec; /* seconds */ __kernel_suseconds_t tv_usec; /* microseconds */ }; diff --git a/include/uapi/linux/time_types.h b/include/uapi/linux/time_types.h index 60b37f29842d..074e391d73a1 100644 --- a/include/uapi/linux/time_types.h +++ b/include/uapi/linux/time_types.h @@ -29,8 +29,8 @@ struct __kernel_old_timeval { #endif struct __kernel_old_timespec { - __kernel_time_t tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ + __kernel_old_time_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ }; struct __kernel_sock_timeval { diff --git a/include/uapi/linux/utime.h b/include/uapi/linux/utime.h index fd9aa26b6860..bc8f13e81d6e 100644 --- a/include/uapi/linux/utime.h +++ b/include/uapi/linux/utime.h @@ -5,8 +5,8 @@ #include struct utimbuf { - __kernel_time_t actime; - __kernel_time_t modtime; + __kernel_old_time_t actime; + __kernel_old_time_t modtime; }; #endif diff --git a/kernel/time/time.c b/kernel/time/time.c index 96b8c02657ed..833abae3364f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -59,9 +59,9 @@ EXPORT_SYMBOL(sys_tz); * why not move it into the appropriate arch directory (for those * architectures that need it). 
*/ -SYSCALL_DEFINE1(time, time_t __user *, tloc) +SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { - time_t i = (time_t)ktime_get_real_seconds(); + __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -78,7 +78,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) * architectures that need it). */ -SYSCALL_DEFINE1(stime, time_t __user *, tptr) +SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; -- cgit v1.2.3-59-g8ed1b From bdd565f817a74b9e30edec108f7cb1dbc762b8a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 22:46:48 +0200 Subject: y2038: rusage: use __kernel_old_timeval There are two 'struct timeval' fields in 'struct rusage'. Unfortunately the definition of timeval is now ambiguous when used in user space with a libc that has a 64-bit time_t, and this also changes the 'rusage' definition in user space in a way that is incompatible with the system call interface. While there is no good solution to avoid all ambiguity here, change the definition in the kernel headers to be compatible with the kernel ABI, using __kernel_old_timeval as an unambiguous base type. In previous discussions, there was also a plan to add a replacement for rusage based on 64-bit timestamps and nanosecond resolution, i.e. 'struct __kernel_timespec'. I have patches for that as well, if anyone thinks we should do that. 
Reviewed-by: Cyrill Gorcunov Signed-off-by: Arnd Bergmann --- arch/alpha/kernel/osf_sys.c | 2 +- include/uapi/linux/resource.h | 4 ++-- kernel/sys.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index bf497b8b0ec6..bbe7a0da6264 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -963,7 +963,7 @@ put_tv32(struct timeval32 __user *o, struct timespec64 *i) } static inline long -put_tv_to_tv32(struct timeval32 __user *o, struct timeval *i) +put_tv_to_tv32(struct timeval32 __user *o, struct __kernel_old_timeval *i) { return copy_to_user(o, &(struct timeval32){ .tv_sec = i->tv_sec, diff --git a/include/uapi/linux/resource.h b/include/uapi/linux/resource.h index cc00fd079631..74ef57b38f9f 100644 --- a/include/uapi/linux/resource.h +++ b/include/uapi/linux/resource.h @@ -22,8 +22,8 @@ #define RUSAGE_THREAD 1 /* only the calling thread */ struct rusage { - struct timeval ru_utime; /* user time used */ - struct timeval ru_stime; /* system time used */ + struct __kernel_old_timeval ru_utime; /* user time used */ + struct __kernel_old_timeval ru_stime; /* system time used */ __kernel_long_t ru_maxrss; /* maximum resident set size */ __kernel_long_t ru_ixrss; /* integral shared memory size */ __kernel_long_t ru_idrss; /* integral unshared data size */ diff --git a/kernel/sys.c b/kernel/sys.c index a611d1d58c7d..d3aef31e24dc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1763,8 +1763,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) unlock_task_sighand(p, &flags); out: - r->ru_utime = ns_to_timeval(utime); - r->ru_stime = ns_to_timeval(stime); + r->ru_utime = ns_to_kernel_old_timeval(utime); + r->ru_stime = ns_to_kernel_old_timeval(stime); if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); -- cgit v1.2.3-59-g8ed1b From 75d319c06e6a76f67549c0ae1007dc3167804f4e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: 
Fri, 25 Oct 2019 22:56:17 +0200 Subject: y2038: syscalls: change remaining timeval to __kernel_old_timeval All of the remaining syscalls that pass a timeval (gettimeofday, utime, futimesat) can trivially be changed to pass a __kernel_old_timeval instead, which has a compatible layout, but avoids ambiguity with the timeval type in user space. Acked-by: Christian Brauner Acked-by: Rafael J. Wysocki Signed-off-by: Arnd Bergmann --- arch/powerpc/include/asm/asm-prototypes.h | 3 ++- arch/powerpc/kernel/syscalls.c | 4 ++-- fs/select.c | 10 +++++----- fs/utimes.c | 8 ++++---- include/linux/syscalls.h | 10 +++++----- kernel/power/power.h | 2 +- kernel/time/time.c | 2 +- 7 files changed, 20 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 8561498e653c..2c25dc079cb9 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -92,7 +92,8 @@ long sys_swapcontext(struct ucontext __user *old_ctx, long sys_debug_setcontext(struct ucontext __user *ctx, int ndbg, struct sig_dbg_op __user *dbg); int -ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp); +ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, + struct __kernel_old_timeval __user *tvp); unsigned long __init early_init(unsigned long dt_ptr); void __init machine_init(u64 dt_ptr); #endif diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 3bfb3888e897..078608ec2e92 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -79,7 +79,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len, * sys_select() with the appropriate args. 
-- Cort */ int -ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp) +ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { if ( (unsigned long)n >= 4096 ) { @@ -89,7 +89,7 @@ ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s || __get_user(inp, ((fd_set __user * __user *)(buffer+1))) || __get_user(outp, ((fd_set __user * __user *)(buffer+2))) || __get_user(exp, ((fd_set __user * __user *)(buffer+3))) - || __get_user(tvp, ((struct timeval __user * __user *)(buffer+4)))) + || __get_user(tvp, ((struct __kernel_old_timeval __user * __user *)(buffer+4)))) return -EFAULT; } return sys_select(n, inp, outp, exp, tvp); diff --git a/fs/select.c b/fs/select.c index 53a0c149f528..11d0285d46b7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -321,7 +321,7 @@ static int poll_select_finish(struct timespec64 *end_time, switch (pt_type) { case PT_TIMEVAL: { - struct timeval rtv; + struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); @@ -698,10 +698,10 @@ out_nofds: } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timeval __user *tvp) + fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; - struct timeval tv; + struct __kernel_old_timeval tv; int ret; if (tvp) { @@ -720,7 +720,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, - fd_set __user *, exp, struct timeval __user *, tvp) + fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } @@ -810,7 +810,7 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; 
- struct timeval __user *tvp; + struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) diff --git a/fs/utimes.c b/fs/utimes.c index 1ba3f7883870..c952b6b3d8a0 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -161,9 +161,9 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, * utimensat() instead. */ static long do_futimesat(int dfd, const char __user *filename, - struct timeval __user *utimes) + struct __kernel_old_timeval __user *utimes) { - struct timeval times[2]; + struct __kernel_old_timeval times[2]; struct timespec64 tstimes[2]; if (utimes) { @@ -190,13 +190,13 @@ static long do_futimesat(int dfd, const char __user *filename, SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, - struct timeval __user *, utimes) + struct __kernel_old_timeval __user *, utimes) { return do_futimesat(dfd, filename, utimes); } SYSCALL_DEFINE2(utimes, char __user *, filename, - struct timeval __user *, utimes) + struct __kernel_old_timeval __user *, utimes) { return do_futimesat(AT_FDCWD, filename, utimes); } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2f27bc9d5ef0..e665920fa359 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -51,7 +51,7 @@ struct statx; struct __sysctl_args; struct sysinfo; struct timespec; -struct timeval; +struct __kernel_old_timeval; struct __kernel_timex; struct timezone; struct tms; @@ -732,7 +732,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); /* kernel/time.c */ -asmlinkage long sys_gettimeofday(struct timeval __user *tv, +asmlinkage long sys_gettimeofday(struct __kernel_old_timeval __user *tv, struct timezone __user *tz); asmlinkage long sys_settimeofday(struct timeval __user *tv, struct timezone __user *tz); @@ -1082,9 +1082,9 @@ asmlinkage long sys_time32(old_time32_t __user 
*tloc); asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times); asmlinkage long sys_utimes(char __user *filename, - struct timeval __user *utimes); + struct __kernel_old_timeval __user *utimes); asmlinkage long sys_futimesat(int dfd, const char __user *filename, - struct timeval __user *utimes); + struct __kernel_old_timeval __user *utimes); #endif asmlinkage long sys_futimesat_time32(unsigned int dfd, const char __user *filename, @@ -1098,7 +1098,7 @@ asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user *dirent, unsigned int count); asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timeval __user *tvp); + fd_set __user *exp, struct __kernel_old_timeval __user *tvp); asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, int timeout); asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, diff --git a/kernel/power/power.h b/kernel/power/power.h index 44bee462ff57..7cdc64dc2373 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,7 +179,7 @@ extern void swsusp_close(fmode_t); extern int swsusp_unmark(void); #endif -struct timeval; +struct __kernel_old_timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); diff --git a/kernel/time/time.c b/kernel/time/time.c index 833abae3364f..a0e7b9909f2d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -137,7 +137,7 @@ SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME32 */ #endif -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { -- cgit v1.2.3-59-g8ed1b From 5e0fb1b57bea8d11fe77da2bc80f4c9a67e28318 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2018 20:04:11 +0200 Subject: y2038: time: avoid timespec usage in settimeofday() The 
compat_get_timeval() and timeval_valid() interfaces are deprecated and getting removed along with the definition of struct timeval itself. Change the two implementations of the settimeofday() system call to open-code these helpers and completely avoid references to timeval. The timeval_valid() call is not needed any more here, only a check to avoid overflowing tv_nsec during the multiplication, as there is another range check in do_sys_settimeofday64(). Tested-by: syzbot+dccce9b26ba09ca49966@syzkaller.appspotmail.com Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 2 +- kernel/time/time.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e665920fa359..d0391cc2dae9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -734,7 +734,7 @@ asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct g /* kernel/time.c */ asmlinkage long sys_gettimeofday(struct __kernel_old_timeval __user *tv, struct timezone __user *tz); -asmlinkage long sys_settimeofday(struct timeval __user *tv, +asmlinkage long sys_settimeofday(struct __kernel_old_timeval __user *tv, struct timezone __user *tz); asmlinkage long sys_adjtimex(struct __kernel_timex __user *txc_p); asmlinkage long sys_adjtimex_time32(struct old_timex32 __user *txc_p); diff --git a/kernel/time/time.c b/kernel/time/time.c index a0e7b9909f2d..58e312e7380f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,22 +196,21 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return 0; } -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) + if (get_user(new_ts.tv_sec, &tv->tv_sec) 
|| + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) @@ -245,18 +244,17 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (compat_get_timeval(&user_tv, tv)) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) -- cgit v1.2.3-59-g8ed1b From c1745f84be2657f5702388133551b759b9237f59 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 10:46:22 +0200 Subject: y2038: itimer: compat handling to itimer.c The structure is only used in one place, moving it there simplifies the interface and helps with later changes to this code. Rename it to match the other time32 structures in the process. 
Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- include/linux/compat.h | 15 ++++----------- kernel/compat.c | 24 ------------------------ kernel/time/itimer.c | 42 +++++++++++++++++++++++++++++++++++------- 3 files changed, 39 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 3735a22bfbc0..906a0ea933cd 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -116,14 +116,7 @@ typedef __compat_gid32_t compat_gid_t; struct compat_sel_arg_struct; struct rusage; -struct compat_itimerval { - struct old_timeval32 it_interval; - struct old_timeval32 it_value; -}; - -struct itimerval; -int get_compat_itimerval(struct itimerval *, const struct compat_itimerval __user *); -int put_compat_itimerval(struct compat_itimerval __user *, const struct itimerval *); +struct old_itimerval32; struct compat_tms { compat_clock_t tms_utime; @@ -668,10 +661,10 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, /* kernel/itimer.c */ asmlinkage long compat_sys_getitimer(int which, - struct compat_itimerval __user *it); + struct old_itimerval32 __user *it); asmlinkage long compat_sys_setitimer(int which, - struct compat_itimerval __user *in, - struct compat_itimerval __user *out); + struct old_itimerval32 __user *in, + struct old_itimerval32 __user *out); /* kernel/kexec.c */ asmlinkage long compat_sys_kexec_load(compat_ulong_t entry, diff --git a/kernel/compat.c b/kernel/compat.c index a2bc1d6ceb57..95005f849c68 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -90,30 +90,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) -{ - struct compat_itimerval v32; - - if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) - return -EFAULT; - o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = 
v32.it_interval.tv_usec; - o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; - return 0; -} - -int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) -{ - struct compat_itimerval v32; - - v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; - v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; - return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; -} - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 77f1e5635cc1..c52ebb40b60b 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -112,19 +112,34 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) } #ifdef CONFIG_COMPAT +struct old_itimerval32 { + struct old_timeval32 it_interval; + struct old_timeval32 it_value; +}; + +static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +{ + struct old_itimerval32 v32; + + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_usec; + return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; +} + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) + struct old_itimerval32 __user *, it) { struct itimerval kit; int error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) + if (!error && put_old_itimerval32(it, &kit)) error = -EFAULT; return error; } #endif - /* * The timer is automagically restarted, when interval != 0 */ @@ -310,15 +325,28 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, } #ifdef CONFIG_COMPAT +static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +{ + struct old_itimerval32 v32; + + if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) + return -EFAULT; + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_usec = v32.it_value.tv_usec; + return 0; +} + COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) + struct old_itimerval32 __user *, in, + struct old_itimerval32 __user *, out) { struct itimerval kin, kout; int error; if (in) { - if (get_compat_itimerval(&kin, in)) + if (get_old_itimerval32(&kin, in)) return -EFAULT; } else { memset(&kin, 0, sizeof(kin)); @@ -327,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, error = do_setitimer(which, &kin, out ? &kout : NULL); if (error || !out) return error; - if (put_compat_itimerval(out, &kout)) + if (put_old_itimerval32(out, &kout)) return -EFAULT; return 0; } -- cgit v1.2.3-59-g8ed1b From 4c22ea2b91203564fdf392b3d3cae249b652a8ae Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 16:59:39 +0200 Subject: y2038: use compat_{get,set}_itimer on alpha The itimer handling for the old alpha osf_setitimer/osf_getitimer system calls is identical to the compat version of getitimer/setitimer, so just use those directly. 
Signed-off-by: Arnd Bergmann --- arch/alpha/kernel/osf_sys.c | 65 ---------------------------------- arch/alpha/kernel/syscalls/syscall.tbl | 4 +-- kernel/time/itimer.c | 4 +-- 3 files changed, 4 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index bbe7a0da6264..94e4cde8071a 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -971,30 +971,6 @@ put_tv_to_tv32(struct timeval32 __user *o, struct __kernel_old_timeval *i) sizeof(struct timeval32)); } -static inline long -get_it32(struct itimerval *o, struct itimerval32 __user *i) -{ - struct itimerval32 itv; - if (copy_from_user(&itv, i, sizeof(struct itimerval32))) - return -EFAULT; - o->it_interval.tv_sec = itv.it_interval.tv_sec; - o->it_interval.tv_usec = itv.it_interval.tv_usec; - o->it_value.tv_sec = itv.it_value.tv_sec; - o->it_value.tv_usec = itv.it_value.tv_usec; - return 0; -} - -static inline long -put_it32(struct itimerval32 __user *o, struct itimerval *i) -{ - return copy_to_user(o, &(struct itimerval32){ - .it_interval.tv_sec = o->it_interval.tv_sec, - .it_interval.tv_usec = o->it_interval.tv_usec, - .it_value.tv_sec = o->it_value.tv_sec, - .it_value.tv_usec = o->it_value.tv_usec}, - sizeof(struct itimerval32)); -} - static inline void jiffies_to_timeval32(unsigned long jiffies, struct timeval32 *value) { @@ -1039,47 +1015,6 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv, asmlinkage long sys_ni_posix_timers(void); -SYSCALL_DEFINE2(osf_getitimer, int, which, struct itimerval32 __user *, it) -{ - struct itimerval kit; - int error; - - if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); - - error = do_getitimer(which, &kit); - if (!error && put_it32(it, &kit)) - error = -EFAULT; - - return error; -} - -SYSCALL_DEFINE3(osf_setitimer, int, which, struct itimerval32 __user *, in, - struct itimerval32 __user *, out) -{ - struct itimerval kin, kout; - int error; - - if 
(!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); - - if (in) { - if (get_it32(&kin, in)) - return -EFAULT; - } else - memset(&kin, 0, sizeof(kin)); - - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) - return error; - - if (put_it32(out, &kout)) - return -EFAULT; - - return 0; - -} - SYSCALL_DEFINE2(osf_utimes, const char __user *, filename, struct timeval32 __user *, tvs) { diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 728fe028c02c..8e13b0b2928d 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -89,10 +89,10 @@ 80 common setgroups sys_setgroups 81 common osf_old_getpgrp sys_ni_syscall 82 common setpgrp sys_setpgid -83 common osf_setitimer sys_osf_setitimer +83 common osf_setitimer compat_sys_setitimer 84 common osf_old_wait sys_ni_syscall 85 common osf_table sys_ni_syscall -86 common osf_getitimer sys_osf_getitimer +86 common osf_getitimer compat_sys_getitimer 87 common gethostname sys_gethostname 88 common sethostname sys_sethostname 89 common getdtablesize sys_getdtablesize diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index c52ebb40b60b..4664c6addf69 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -111,7 +111,7 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) return error; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) struct old_itimerval32 { struct old_timeval32 it_interval; struct old_timeval32 it_value; @@ -324,7 +324,7 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, return 0; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; -- cgit v1.2.3-59-g8ed1b From ddbc7d0657e9fd38b69f16bd0310703367b52d29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 
Oct 2019 21:37:43 +0200 Subject: y2038: move itimer reset into itimer.c Preparing for a change to the itimer internals, stop using the do_setitimer() symbol and instead use a new higher-level interface. The do_getitimer()/do_setitimer functions can now be made static, allowing the compiler to potentially produce better object code. Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- include/linux/time.h | 9 +++++---- kernel/time/itimer.c | 15 +++++++++++++-- security/selinux/hooks.c | 10 +++------- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 27d83fd2ae61..0760a4f5a15c 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -35,10 +35,11 @@ extern time64_t mktime64(const unsigned int year, const unsigned int mon, extern u32 (*arch_gettimeoffset)(void); #endif -struct itimerval; -extern int do_setitimer(int which, struct itimerval *value, - struct itimerval *ovalue); -extern int do_getitimer(int which, struct itimerval *value); +#ifdef CONFIG_POSIX_TIMERS +extern void clear_itimer(void); +#else +static inline void clear_itimer(void) {} +#endif extern long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 4664c6addf69..ce9cd19ce72e 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -73,7 +73,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, value->it_interval = ns_to_timeval(interval); } -int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; @@ -197,7 +197,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int 
do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; @@ -249,6 +249,17 @@ again: return 0; } +#ifdef CONFIG_SECURITY_SELINUX +void clear_itimer(void) +{ + struct itimerval v = {}; + int i; + + for (i = 0; i < 3; i++) + do_setitimer(i, &v, NULL); +} +#endif + #ifdef __ARCH_WANT_SYS_ALARM /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9625b99e677f..456b5b596e1d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2549,9 +2549,8 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { const struct task_security_struct *tsec = selinux_cred(current_cred()); - struct itimerval itimer; u32 osid, sid; - int rc, i; + int rc; osid = tsec->osid; sid = tsec->sid; @@ -2569,11 +2568,8 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) rc = avc_has_perm(&selinux_state, osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { - memset(&itimer, 0, sizeof itimer); - for (i = 0; i < 3; i++) - do_setitimer(i, &itimer, NULL); - } + clear_itimer(); + spin_lock_irq(&current->sighand->siglock); if (!fatal_signal_pending(current)) { flush_sigqueue(&current->pending); -- cgit v1.2.3-59-g8ed1b From bd40a175769d411b2a37e1c087082ac7ee2c15bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Nov 2019 15:27:39 +0100 Subject: y2038: itimer: change implementation to timespec64 There is no 64-bit version of getitimer/setitimer since that is not actually needed. However, the implementation is built around the deprecated 'struct timeval' type. Change the code to use timespec64 internally to reduce the dependencies on timeval and associated helper functions. Minor adjustments in the code are needed to make the native and compat version work the same way, and to keep the range check working after the conversion. 
Signed-off-by: Arnd Bergmann --- include/trace/events/timer.h | 16 ++--- kernel/time/itimer.c | 158 ++++++++++++++++++++++++++----------------- 2 files changed, 104 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index b7a904825e7d..5998789ed91f 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -303,7 +303,7 @@ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, */ TRACE_EVENT(itimer_state, - TP_PROTO(int which, const struct itimerval *const value, + TP_PROTO(int which, const struct itimerspec64 *const value, unsigned long long expires), TP_ARGS(which, value, expires), @@ -312,24 +312,24 @@ TRACE_EVENT(itimer_state, __field( int, which ) __field( unsigned long long, expires ) __field( long, value_sec ) - __field( long, value_usec ) + __field( long, value_nsec ) __field( long, interval_sec ) - __field( long, interval_usec ) + __field( long, interval_nsec ) ), TP_fast_assign( __entry->which = which; __entry->expires = expires; __entry->value_sec = value->it_value.tv_sec; - __entry->value_usec = value->it_value.tv_usec; + __entry->value_nsec = value->it_value.tv_nsec; __entry->interval_sec = value->it_interval.tv_sec; - __entry->interval_usec = value->it_interval.tv_usec; + __entry->interval_nsec = value->it_interval.tv_nsec; ), - TP_printk("which=%d expires=%llu it_value=%ld.%ld it_interval=%ld.%ld", + TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld", __entry->which, __entry->expires, - __entry->value_sec, __entry->value_usec, - __entry->interval_sec, __entry->interval_usec) + __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC, + __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC) ); /** diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ce9cd19ce72e..5872db9bd5f7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ * Returns the delta between the expiry time and now, which can be * less 
than zero or 1usec for an pending expired timer */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) +static struct timespec64 itimer_get_remtime(struct hrtimer *timer) { ktime_t rem = __hrtimer_get_remaining(timer, true); @@ -41,11 +41,11 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) } else rem = 0; - return ktime_to_timeval(rem); + return ktime_to_timespec64(rem); } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *const value) + struct itimerspec64 *const value) { u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -69,11 +69,11 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); - value->it_value = ns_to_timeval(val); - value->it_interval = ns_to_timeval(interval); + value->it_value = ns_to_timespec64(val); + value->it_interval = ns_to_timespec64(interval); } -static int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerspec64 *value) { struct task_struct *tsk = current; @@ -82,7 +82,7 @@ static int do_getitimer(int which, struct itimerval *value) spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); + ktime_to_timespec64(tsk->signal->it_real_incr); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: @@ -97,17 +97,26 @@ static int do_getitimer(int which, struct itimerval *value) return 0; } +static int put_itimerval(struct itimerval __user *o, + const struct itimerspec64 *i) +{ + struct itimerval v; + + v.it_interval.tv_sec = i->it_interval.tv_sec; + v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v.it_value.tv_sec = i->it_value.tv_sec; + v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v, sizeof(struct itimerval)) ? 
-EFAULT : 0; +} + + SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) { - int error = -EFAULT; - struct itimerval get_buffer; + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } + if (!error && put_itimerval(value, &get_buffer)) + error = -EFAULT; return error; } @@ -117,24 +126,25 @@ struct old_itimerval32 { struct old_timeval32 it_value; }; -static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +static int put_old_itimerval32(struct old_itimerval32 __user *o, + const struct itimerspec64 *i) { struct old_itimerval32 v32; v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; + v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct old_itimerval32 __user *, it) + struct old_itimerval32 __user *, value) { - struct itimerval kit; - int error = do_getitimer(which, &kit); + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (!error && put_old_itimerval32(it, &kit)) + if (!error && put_old_itimerval32(value, &get_buffer)) error = -EFAULT; return error; } @@ -156,8 +166,8 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) } static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - const struct itimerval *const value, - struct itimerval *const ovalue) + const struct itimerspec64 *const value, + struct itimerspec64 *const ovalue) { u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -166,8 +176,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, * Use the to_ktime conversion because that clamps the maximum * value to KTIME_MAX and avoid multiplication overflows. 
*/ - nval = ktime_to_ns(timeval_to_ktime(value->it_value)); - ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); + nval = timespec64_to_ns(&value->it_value); + ninterval = timespec64_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); @@ -186,8 +196,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { - ovalue->it_value = ns_to_timeval(oval); - ovalue->it_interval = ns_to_timeval(ointerval); + ovalue->it_value = ns_to_timespec64(oval); + ovalue->it_interval = ns_to_timespec64(ointerval); } } @@ -197,19 +207,13 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -static int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerspec64 *value, + struct itimerspec64 *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - /* - * Validate the timevals in value. 
- */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - switch (which) { case ITIMER_REAL: again: @@ -218,7 +222,7 @@ again: if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); + = ktime_to_timespec64(tsk->signal->it_real_incr); } /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { @@ -226,10 +230,10 @@ again: hrtimer_cancel_wait_running(timer); goto again; } - expires = timeval_to_ktime(value->it_value); + expires = timespec64_to_ktime(value->it_value); if (expires != 0) { tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); + timespec64_to_ktime(value->it_interval); hrtimer_start(timer, expires, HRTIMER_MODE_REL); } else tsk->signal->it_real_incr = 0; @@ -252,7 +256,7 @@ again: #ifdef CONFIG_SECURITY_SELINUX void clear_itimer(void) { - struct itimerval v = {}; + struct itimerspec64 v = {}; int i; for (i = 0; i < 3; i++) @@ -276,15 +280,15 @@ void clear_itimer(void) */ static unsigned int alarm_setitimer(unsigned int seconds) { - struct itimerval it_new, it_old; + struct itimerspec64 it_new, it_old; #if BITS_PER_LONG < 64 if (seconds > INT_MAX) seconds = INT_MAX; #endif it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_nsec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0; do_setitimer(ITIMER_REAL, &it_new, &it_old); @@ -292,8 +296,8 @@ static unsigned int alarm_setitimer(unsigned int seconds) * We can't return 0 if we have an alarm pending ... 
And we'd * better return too much than too little anyway */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || + it_old.it_value.tv_nsec >= 500000) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; @@ -310,15 +314,35 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif +static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *i) +{ + struct itimerval v; + + if (copy_from_user(&v, i, sizeof(struct itimerval))) + return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v.it_value) || + !timeval_valid(&v.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v.it_interval.tv_sec; + o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v.it_value.tv_sec; + o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC; + return 0; +} + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { - struct itimerval set_buffer, get_buffer; + struct itimerspec64 set_buffer, get_buffer; int error; if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; + error = get_itimerval(&set_buffer, value); + if (error) + return error; } else { memset(&set_buffer, 0, sizeof(set_buffer)); printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." 
@@ -330,43 +354,53 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (error || !ovalue) return error; - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + if (put_itimerval(ovalue, &get_buffer)) return -EFAULT; return 0; } #if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v32.it_value) || + !timeval_valid(&v32.it_interval)) + return -EINVAL; + o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC; o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; + o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC; return 0; } COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct old_itimerval32 __user *, in, - struct old_itimerval32 __user *, out) + struct old_itimerval32 __user *, value, + struct old_itimerval32 __user *, ovalue) { - struct itimerval kin, kout; + struct itimerspec64 set_buffer, get_buffer; int error; - if (in) { - if (get_old_itimerval32(&kin, in)) - return -EFAULT; + if (value) { + error = get_old_itimerval32(&set_buffer, value); + if (error) + return error; } else { - memset(&kin, 0, sizeof(kin)); + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); } - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) + error = do_setitimer(which, &set_buffer, ovalue ? 
&get_buffer : NULL); + if (error || !ovalue) return error; - if (put_old_itimerval32(out, &kout)) + if (put_old_itimerval32(ovalue, &get_buffer)) return -EFAULT; return 0; } -- cgit v1.2.3-59-g8ed1b From 942437c97fd9ff23a17c13118f50bd0490f6868c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Jul 2019 11:46:10 +0200 Subject: y2038: allow disabling time32 system calls At the moment, the compilation of the old time32 system calls depends purely on the architecture. As systems with new libc based on 64-bit time_t are getting deployed, even architectures that previously supported these (notably x86-32 and arm32 but also many others) no longer depend on them, and removing them from a kernel image results in a smaller kernel binary, the same way we can leave out many other optional system calls. More importantly, on an embedded system that needs to keep working beyond year 2038, any user space program calling these system calls is likely a bug, so removing them from the kernel image does provide an extra debugging help for finding broken applications. I've gone back and forth on hiding this option unless CONFIG_EXPERT is set. This version leaves it visible based on the logic that eventually it will be turned off indefinitely. Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- arch/Kconfig | 3 ++- kernel/sys_ni.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 0e1fded2940e..1203955ed4d0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -797,7 +797,8 @@ config COMPAT_OLD_SIGACTION bool config COMPAT_32BIT_TIME - def_bool !64BIT || COMPAT + bool "Provide system calls for 32-bit time_t" + default !64BIT || COMPAT help This enables 32 bit time_t support in addition to 64 bit time_t support. 
This is relevant on all 32-bit architectures, and 64-bit architectures diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 34b76895b81e..3b69a560a7ac 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -410,6 +410,29 @@ COND_SYSCALL(send); COND_SYSCALL(bdflush); COND_SYSCALL(uselib); +/* optional: time32 */ +COND_SYSCALL(time32); +COND_SYSCALL(stime32); +COND_SYSCALL(utime32); +COND_SYSCALL(adjtimex_time32); +COND_SYSCALL(sched_rr_get_interval_time32); +COND_SYSCALL(nanosleep_time32); +COND_SYSCALL(rt_sigtimedwait_time32); +COND_SYSCALL_COMPAT(rt_sigtimedwait_time32); +COND_SYSCALL(timer_settime32); +COND_SYSCALL(timer_gettime32); +COND_SYSCALL(clock_settime32); +COND_SYSCALL(clock_gettime32); +COND_SYSCALL(clock_getres_time32); +COND_SYSCALL(clock_nanosleep_time32); +COND_SYSCALL(utimes_time32); +COND_SYSCALL(futimesat_time32); +COND_SYSCALL(pselect6_time32); +COND_SYSCALL_COMPAT(pselect6_time32); +COND_SYSCALL(ppoll_time32); +COND_SYSCALL_COMPAT(ppoll_time32); +COND_SYSCALL(utimensat_time32); +COND_SYSCALL(clock_adjtime32); /* * The syscalls below are not found in include/uapi/asm-generic/unistd.h -- cgit v1.2.3-59-g8ed1b From 58a74a2925a5b4125dd4f4e728490b9642534c81 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 15 Nov 2019 11:17:30 +0200 Subject: tracing: Increase SYNTH_FIELDS_MAX for synthetic_events Increase the maximum allowed count of synthetic event fields from 16 to 32 in order to allow for larger-than-usual events. 
Link: http://lkml.kernel.org/r/20191115091730.9192-1-dedekind1@gmail.com Reviewed-by: Tom Zanussi Signed-off-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..f49d1a36d3ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -23,7 +23,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 16 +#define SYNTH_FIELDS_MAX 32 #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ -- cgit v1.2.3-59-g8ed1b From 1c7f9b673dc0a15753274c4e7f5ebfd4468fc69f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:13:20 -0500 Subject: ftrace: Fix accounting bug with direct->count in register_ftrace_direct() The direct->count wasn't being updated properly, where it only was updated when the first entry was added, but should be updated every time. 
Fixes: 013bf0da04748 ("ftrace: Add ftrace_find_direct_func()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 834f3556ea1e..32e4e5ffdd97 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5093,8 +5093,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_direct_func_count--; } } else { - if (!direct->count) - direct->count++; + direct->count++; } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3-59-g8ed1b From 406acdd32d3e7d5a6dcb7f67798e89068fbe0d77 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:19:04 -0500 Subject: ftrace: Add another check for match in register_ftrace_direct() As an instruction pointer passed into register_ftrace_direct() may just exist on the ftrace call site, but may not be the start of the call site itself, register_ftrace_direct() still needs to update test if a direct call exists on the normalized site, as only one direct call is allowed at any one time. Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32e4e5ffdd97..9fe33ebaf914 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5030,7 +5030,12 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; /* Make sure the ip points to the exact record */ - ip = rec->ip; + if (ip != rec->ip) { + ip = rec->ip; + /* Need to check this ip for a direct. 
*/ + if (find_rec_direct(ip)) + goto out_unlock; + } ret = -ENOMEM; if (ftrace_hash_empty(direct_functions) || -- cgit v1.2.3-59-g8ed1b From 128161f47bc3797b0d068da13e311770685d6e4f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:14:45 -0500 Subject: ftrace: Add helper find_direct_entry() to consolidate code Both unregister_ftrace_direct() and modify_ftrace_direct() need to normalize the ip passed in to match the rec->ip, as it is acceptable to have the ip on the ftrace call site but not the start. There are also common validity checks with the record found by the ip; these should be done for both unregister_ftrace_direct() and modify_ftrace_direct(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 59 +++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9fe33ebaf914..ef79c8393f53 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5112,30 +5112,40 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) { struct ftrace_func_entry *entry; - struct ftrace_direct_func *direct; struct dyn_ftrace *rec; - int ret = -ENODEV; - mutex_lock(&direct_mutex); + rec = lookup_rec(*ip, *ip); + if (!rec) + return NULL; - entry = __ftrace_lookup_ip(direct_functions, ip); + entry = __ftrace_lookup_ip(direct_functions, rec->ip); if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + return NULL; + } - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) { - WARN_ON(rec->flags & FTRACE_FL_DIRECT); - goto out_unlock; - } + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); 
- WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + /* Passed in ip just needs to be on the call site */ + *ip = rec->ip; + + return entry; +} + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_direct_func *direct; + struct ftrace_func_entry *entry; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; if (direct_functions->count == 1) unregister_ftrace_function(&direct_ops); @@ -5187,24 +5197,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = __ftrace_lookup_ip(direct_functions, ip); - if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; - - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) - goto out_unlock; - ip = rec->ip; - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; ret = -EINVAL; if (entry->direct != old_addr) -- cgit v1.2.3-59-g8ed1b From 4a169a95d885fe5c050bac1a21d43c86ba955bcf Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Fri, 15 Nov 2019 15:11:49 -0700 Subject: genirq: Introduce irq_chip_get/set_parent_state calls On certain QTI chipsets some GPIOs are direct-connect interrupts to the GIC to be used as regular interrupt lines. When the GPIOs are not used for interrupt generation the interrupt line is disabled. But disabling the interrupt at GIC does not prevent the interrupt from being reported as pending at GIC_ISPEND. Later, when drivers call enable_irq() on the interrupt, an unwanted interrupt occurs. Introduce get and set methods for irqchip's parent to clear its pending irq state. This then can be invoked by the GPIO interrupt controller on the parents in its hierarchy to clear the interrupt before enabling the interrupt. 
Signed-off-by: Maulik Shah Signed-off-by: Lina Iyer Signed-off-by: Marc Zyngier Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/1573855915-9841-7-git-send-email-ilina@codeaurora.org [updated commit text and minor code fixes] --- include/linux/irq.h | 6 ++++++ kernel/irq/chip.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index fb301cf29148..7853eb9301f2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -610,6 +610,12 @@ extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void handle_fasteoi_ack_irq(struct irq_desc *desc); extern void handle_fasteoi_mask_irq(struct irq_desc *desc); +extern int irq_chip_set_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool val); +extern int irq_chip_get_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b76703b2c0af..b3fa2d87d2f3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1297,6 +1297,50 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ +/** + * irq_chip_set_parent_state - set the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which + * + * Conditional success, if the underlying irqchip does not implement it. 
+ */ +int irq_chip_set_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool val) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_set_irqchip_state) + return 0; + + return data->chip->irq_set_irqchip_state(data, which, val); +} +EXPORT_SYMBOL_GPL(irq_chip_set_parent_state); + +/** + * irq_chip_get_parent_state - get the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: one of IRQCHIP_STATE_* the caller wants to know + * @state: a pointer to a boolean where the state is to be stored + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_get_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_get_irqchip_state) + return 0; + + return data->chip->irq_get_irqchip_state(data, which, state); +} +EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit v1.2.3-59-g8ed1b From ea806eb3eab35528b578a061b2c4b28f0f92c465 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 17 Nov 2019 17:04:15 -0500 Subject: ftrace: Add a helper function to modify_ftrace_direct() to allow arch optimization If a direct ftrace callback is at a location that does not have any other ftrace helpers attached to it, it is possible to simply just change the text to call the new caller (if the architecture supports it). But this requires special architecture code. Currently, modify_ftrace_direct() uses a trick to add a stub ftrace callback to the location forcing it to call the ftrace iterator. Then it can change the direct helper to call the new function in C, and then remove the stub. Removing the stub will have the location now call the new location that the direct helper is using. 
The new helper function does the registering the stub trick, but is a weak function, allowing an architecture to override it to do something a bit more direct. Link: https://lore.kernel.org/r/20191115215125.mbqv7taqnx376yed@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 21 ++++++++- kernel/trace/ftrace.c | 120 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 107 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 73eb2e93593f..dfaa37e1943d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -246,12 +246,24 @@ static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; + unsigned long direct; /* for direct lookup only */ +}; + +struct dyn_ftrace; + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); +int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr); #else # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) @@ -271,6 +283,13 @@ static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long a { return NULL; } +static inline int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + return -ENODEV; +} #endif /* 
CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -343,8 +362,6 @@ static inline void stack_tracer_enable(void) { } int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); -struct dyn_ftrace; - enum ftrace_bug_type { FTRACE_BUG_UNKNOWN, FTRACE_BUG_INIT, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef79c8393f53..caae523f4ef3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1020,12 +1020,6 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -struct ftrace_func_entry { - struct hlist_node hlist; - unsigned long ip; - unsigned long direct; /* for direct lookup only */ -}; - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -5112,7 +5106,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip, + struct dyn_ftrace **recp) { struct ftrace_func_entry *entry; struct dyn_ftrace *rec; @@ -5132,6 +5127,9 @@ static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) /* Passed in ip just needs to be on the call site */ *ip = rec->ip; + if (recp) + *recp = rec; + return entry; } @@ -5143,7 +5141,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5179,6 +5177,75 @@ static struct ftrace_ops stub_ops = { .func = ftrace_stub, }; +/** + * ftrace_modify_direct_caller - modify ftrace nop directly + * @entry: The ftrace hash entry of the direct helper for @rec + * @rec: The record representing the function site to patch + * @old_addr: The location that the site at @rec->ip currently calls + * @new_addr: The location that the site at 
@rec->ip should call + * + * An architecture may overwrite this function to optimize the + * changing of the direct callback on an ftrace nop location. + * This is called with the ftrace_lock mutex held, and no other + * ftrace callbacks are on the associated record (@rec). Thus, + * it is safe to modify the ftrace record, where it should be + * currently calling @old_addr directly, to call @new_addr. + * + * Safety checks should be made to make sure that the code at + * @rec->ip is currently calling @old_addr. And this must + * also update entry->direct to @new_addr. + */ +int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + unsigned long ip = rec->ip; + int ret; + + /* + * The ftrace_lock was used to determine if the record + * had more than one registered user to it. If it did, + * we needed to prevent that from changing to do the quick + * switch. But if it did not (only a direct caller was attached) + * then this function is called. But this function can deal + * with attached callers to the rec that we care about, and + * since this function uses standard ftrace calls that take + * the ftrace_lock mutex, we need to release it. + */ + mutex_unlock(&ftrace_lock); + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_lock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_lock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. 
+ */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + out_lock: + mutex_lock(&ftrace_lock); + + return ret; +} + /** * modify_ftrace_direct - Modify an existing direct call to call something else * @ip: The instruction pointer to modify @@ -5197,11 +5264,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + mutex_lock(&ftrace_lock); + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5210,33 +5279,20 @@ int modify_ftrace_direct(unsigned long ip, goto out_unlock; /* - * By setting a stub function at the same address, we force - * the code to call the iterator and the direct_ops helper. - * This means that @ip does not call the direct call, and - * we can simply modify it. + * If there's no other ftrace callback on the rec->ip location, + * then it can be changed directly by the architecture. + * If there is another caller, then we just need to change the + * direct caller helper to point to @new_addr. */ - ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); - if (ret) - goto out_unlock; - - ret = register_ftrace_function(&stub_ops); - if (ret) { - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - goto out_unlock; + if (ftrace_rec_count(rec) == 1) { + ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr); + } else { + entry->direct = new_addr; + ret = 0; } - entry->direct = new_addr; - - /* - * By removing the stub, we put back the direct call, calling - * the @new_addr. 
- */ - unregister_ftrace_function(&stub_ops); - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - - ret = 0; - out_unlock: + mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); return ret; } -- cgit v1.2.3-59-g8ed1b From 46f9469247c6f4697cbbf37e4b3961120bf07f29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Nov 2019 10:41:29 -0500 Subject: ftrace: Rename ftrace_graph_stub to ftrace_stub_graph The ftrace_graph_stub was created and points to ftrace_stub as a way to assign the function graph tracer function pointer to a stub function with a different prototype than what ftrace_stub has and not trigger the C verifier. The ftrace_graph_stub was created via the linker script vmlinux.lds.h. Unfortunately, powerpc already uses the name ftrace_graph_stub for its internal implementation of the function graph tracer, and even though powerpc would still build, the change via the linker script broke function tracer on powerpc from working. By using the name ftrace_stub_graph, which does not exist anywhere else in the kernel, this should not be a problem. Link: https://lore.kernel.org/r/1573849732.5937.136.camel@lca.pw Fixes: b83b43ffc6e4 ("fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub") Reported-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) --- include/asm-generic/vmlinux.lds.h | 8 ++++---- kernel/trace/fgraph.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0f358be551cd..996db32c491b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -112,7 +112,7 @@ #ifdef CONFIG_FTRACE_MCOUNT_RECORD #ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY /* - * Need to also make ftrace_graph_stub point to ftrace_stub + * Need to also make ftrace_stub_graph point to ftrace_stub * so that the same stub location may have different protocols * and not mess up with C verifiers. 
*/ @@ -120,17 +120,17 @@ __start_mcount_loc = .; \ KEEP(*(__patchable_function_entries)) \ __stop_mcount_loc = .; \ - ftrace_graph_stub = ftrace_stub; + ftrace_stub_graph = ftrace_stub; #else #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ __stop_mcount_loc = .; \ - ftrace_graph_stub = ftrace_stub; + ftrace_stub_graph = ftrace_stub; #endif #else # ifdef CONFIG_FUNCTION_TRACER -# define MCOUNT_REC() ftrace_graph_stub = ftrace_stub; +# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; # else # define MCOUNT_REC() # endif diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fa3ce10d0405..67e0c462b059 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -336,10 +336,10 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -extern void ftrace_graph_stub(struct ftrace_graph_ret *); +extern void ftrace_stub_graph(struct ftrace_graph_ret *); /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -619,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = ftrace_graph_stub; + ftrace_graph_return = ftrace_stub_graph; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3-59-g8ed1b From c55b51a06b01d67a99457bb82a8c31081c7faa23 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:12 +0100 Subject: cpuidle: Allow idle injection to apply exit latency limit In some cases it may be useful to specify an exit latency limit for the idle 
state to be used during CPU idle time injection. Instead of duplicating the information in struct cpuidle_device or propagating the latency limit in the call stack, replace the use_deepest_state field with forced_latency_limit_ns to represent that limit, so that the deepest idle state with exit latency within that limit is forced (i.e. no governors) when it is set. A zero exit latency limit for forced idle means to use governors in the usual way (analogous to use_deepest_state equal to "false" before this change). Additionally, add play_idle_precise() taking two arguments, the duration of forced idle and the idle state exit latency limit, both in nanoseconds, and redefine play_idle() as a wrapper around that new function. This change is preparatory, no functional impact is expected. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Subject, changelog, cpuidle_use_deepest_state() kerneldoc, whitespace ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 13 +++++++------ include/linux/cpu.h | 7 ++++++- include/linux/cpuidle.h | 6 +++--- kernel/sched/idle.c | 14 +++++++------- 4 files changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index bf9b030cd7e1..12077db1158e 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -99,20 +99,21 @@ static int find_deepest_state(struct cpuidle_driver *drv, } /** - * cpuidle_use_deepest_state - Set/clear governor override flag. - * @enable: New value of the flag. + * cpuidle_use_deepest_state - Set/unset governor override mode. + * @latency_limit_ns: Idle state exit latency limit (or no override if 0). * - * Set/unset the current CPU to use the deepest idle state (override governors - * going forward if set). 
+ * If @latency_limit_ns is nonzero, set the current CPU to use the deepest idle + * state with exit latency within @latency_limit_ns (override governors going + * forward), or do not override governors if it is zero. */ -void cpuidle_use_deepest_state(bool enable) +void cpuidle_use_deepest_state(u64 latency_limit_ns) { struct cpuidle_device *dev; preempt_disable(); dev = cpuidle_get_device(); if (dev) - dev->use_deepest_state = enable; + dev->forced_idle_latency_limit_ns = latency_limit_ns; preempt_enable(); } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d0633ebdaa9c..cc03a7848b63 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -179,7 +179,12 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); -void play_idle(unsigned long duration_us); +void play_idle_precise(u64 duration_ns, u64 latency_ns); + +static inline void play_idle(unsigned long duration_us) +{ + play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX); +} #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index afb6a573b46d..72b26ff1de4b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -85,7 +85,6 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; - unsigned int use_deepest_state:1; unsigned int poll_time_limit:1; unsigned int cpu; ktime_t next_hrtimer; @@ -93,6 +92,7 @@ struct cpuidle_device { int last_state_idx; u64 last_residency_ns; u64 poll_limit_ns; + u64 forced_idle_latency_limit_ns; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX]; struct cpuidle_driver_kobj *kobj_driver; @@ -216,7 +216,7 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct 
cpuidle_device *dev); -extern void cpuidle_use_deepest_state(bool enable); +extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -224,7 +224,7 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } -static inline void cpuidle_use_deepest_state(bool enable) +static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { } #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1aa260702b38..cd05ffa0abfe 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -165,7 +165,7 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ - if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_us) +void play_idle_precise(u64 duration_ns, u64 latency_ns) { struct idle_timer it; @@ -323,29 +323,29 @@ void play_idle(unsigned long duration_us) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_us); + WARN_ON_ONCE(!duration_ns); rcu_sleep_check(); preempt_disable(); current->flags |= PF_IDLE; - cpuidle_use_deepest_state(true); + cpuidle_use_deepest_state(latency_ns); it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); - 
cpuidle_use_deepest_state(false); + cpuidle_use_deepest_state(0); current->flags &= ~PF_IDLE; preempt_fold_need_resched(); preempt_enable(); } -EXPORT_SYMBOL_GPL(play_idle); +EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { -- cgit v1.2.3-59-g8ed1b From 5aa9ba6312e36c18626e73506b92d1513d815435 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:13 +0100 Subject: cpuidle: Pass exit latency limit to cpuidle_use_deepest_state() Modify cpuidle_use_deepest_state() to take an additional exit latency limit argument to be passed to find_deepest_idle_state() and make cpuidle_idle_call() pass dev->forced_idle_latency_limit_ns to it for forced idle. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Rebase and rearrange code, subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 5 +++-- include/linux/cpuidle.h | 6 ++++-- kernel/sched/idle.c | 8 +++++++- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 12077db1158e..569dbac443bd 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -123,9 +123,10 @@ void cpuidle_use_deepest_state(u64 latency_limit_ns) * @dev: cpuidle device for the given CPU. 
*/ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) { - return find_deepest_state(drv, dev, U64_MAX, 0, false); + return find_deepest_state(drv, dev, latency_limit_ns, 0, false); } #ifdef CONFIG_SUSPEND diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 72b26ff1de4b..2dbe46b7c213 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -213,13 +213,15 @@ static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cd05ffa0abfe..fc9604ddd802 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -166,6 +166,8 @@ static void cpuidle_idle_call(void) */ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { + u64 max_latency_ns; + if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -176,12 +178,16 @@ static void cpuidle_idle_call(void) } rcu_idle_exit(); + + max_latency_ns = U64_MAX; + } else { + max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); rcu_idle_enter(); - next_state = cpuidle_find_deepest_state(drv, dev); + next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else { bool stop_tick = true; -- cgit v1.2.3-59-g8ed1b From 56e35f9c5b87ec1ae93e483284e189c84388de16 Mon 
Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Nov 2019 18:03:11 +0100 Subject: dma-mapping: drop the dev argument to arch_sync_dma_for_* These are pure cache maintenance routines, so drop the unused struct device argument. Signed-off-by: Christoph Hellwig Suggested-by: Daniel Vetter --- arch/arc/mm/dma.c | 8 ++++---- arch/arm/mm/dma-mapping.c | 8 ++++---- arch/arm/xen/mm.c | 12 ++++++------ arch/arm64/mm/dma-mapping.c | 8 ++++---- arch/c6x/mm/dma-coherent.c | 14 +++++++------- arch/csky/mm/dma-mapping.c | 8 ++++---- arch/hexagon/kernel/dma.c | 4 ++-- arch/ia64/mm/init.c | 4 ++-- arch/m68k/kernel/dma.c | 4 ++-- arch/microblaze/kernel/dma.c | 14 +++++++------- arch/mips/bmips/dma.c | 2 +- arch/mips/jazz/jazzdma.c | 17 ++++++++--------- arch/mips/mm/dma-noncoherent.c | 12 ++++++------ arch/nds32/kernel/dma.c | 8 ++++---- arch/nios2/mm/dma-mapping.c | 8 ++++---- arch/openrisc/kernel/dma.c | 2 +- arch/parisc/kernel/pci-dma.c | 8 ++++---- arch/powerpc/mm/dma-noncoherent.c | 8 ++++---- arch/sh/kernel/dma-coherent.c | 6 +++--- arch/sparc/kernel/ioport.c | 4 ++-- arch/xtensa/kernel/pci-dma.c | 8 ++++---- drivers/iommu/dma-iommu.c | 10 +++++----- drivers/xen/swiotlb-xen.c | 8 ++++---- include/linux/dma-noncoherent.h | 20 ++++++++++---------- include/xen/swiotlb-xen.h | 8 ++++---- kernel/dma/direct.c | 14 +++++++------- 26 files changed, 113 insertions(+), 114 deletions(-) (limited to 'kernel') diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 73a7e88a1e92..e947572a521e 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -48,8 +48,8 @@ void arch_dma_prep_coherent(struct page *page, size_t size) * upper layer functions (in include/linux/dma-mapping.h) */ -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: @@ -69,8 +69,8 @@ void arch_sync_dma_for_device(struct 
device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index f3cbeba7f9cb..da1a32b5e192 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2332,15 +2332,15 @@ void arch_teardown_dma_ops(struct device *dev) } #ifdef CONFIG_SWIOTLB -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_page_cpu_to_dev(phys_to_page(paddr), paddr & (PAGE_SIZE - 1), size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_page_dev_to_cpu(phys_to_page(paddr), paddr & (PAGE_SIZE - 1), size, dir); diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 38fa917c8585..a6a2514e5fe8 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -70,20 +70,20 @@ static void dma_cache_maint(dma_addr_t handle, size_t size, u32 op) * pfn_valid returns true the pages is local and we can use the native * dma-direct functions, otherwise we call the Xen specific version. 
*/ -void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +void xen_dma_sync_for_cpu(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (pfn_valid(PFN_DOWN(handle))) - arch_sync_dma_for_cpu(dev, paddr, size, dir); + arch_sync_dma_for_cpu(paddr, size, dir); else if (dir != DMA_TO_DEVICE) dma_cache_maint(handle, size, GNTTAB_CACHE_INVAL); } -void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +void xen_dma_sync_for_device(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (pfn_valid(PFN_DOWN(handle))) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); else if (dir == DMA_FROM_DEVICE) dma_cache_maint(handle, size, GNTTAB_CACHE_INVAL); else diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 9239416e93d4..6c45350e33aa 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -13,14 +13,14 @@ #include -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_map_area(phys_to_virt(paddr), size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_unmap_area(phys_to_virt(paddr), size, dir); } diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c index b319808e8f6b..a5909091cb14 100644 --- a/arch/c6x/mm/dma-coherent.c +++ b/arch/c6x/mm/dma-coherent.c @@ -140,7 +140,7 @@ void __init coherent_mem_init(phys_addr_t start, u32 size) sizeof(long)); } -static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size, +static 
void c6x_dma_sync(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { BUG_ON(!valid_dma_direction(dir)); @@ -160,14 +160,14 @@ static void c6x_dma_sync(struct device *dev, phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - return c6x_dma_sync(dev, paddr, size, dir); + return c6x_dma_sync(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - return c6x_dma_sync(dev, paddr, size, dir); + return c6x_dma_sync(paddr, size, dir); } diff --git a/arch/csky/mm/dma-mapping.c b/arch/csky/mm/dma-mapping.c index 06e85b565454..8f6571ae27c8 100644 --- a/arch/csky/mm/dma-mapping.c +++ b/arch/csky/mm/dma-mapping.c @@ -58,8 +58,8 @@ void arch_dma_prep_coherent(struct page *page, size_t size) cache_op(page_to_phys(page), size, dma_wbinv_set_zero_range); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: @@ -74,8 +74,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c index f561b127c4b4..25f388d9cfcc 100644 --- a/arch/hexagon/kernel/dma.c +++ b/arch/hexagon/kernel/dma.c @@ -55,8 +55,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, gen_pool_free(coherent_pool, (unsigned 
long) vaddr, size); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *addr = phys_to_virt(paddr); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index bf9df2625bc8..58fd67068bac 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -73,8 +73,8 @@ __ia64_sync_icache_dcache (pte_t pte) * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. */ -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { unsigned long pfn = PHYS_PFN(paddr); diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c index 3fab684cc0db..871a0e11da34 100644 --- a/arch/m68k/kernel/dma.c +++ b/arch/m68k/kernel/dma.c @@ -61,8 +61,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, #endif /* CONFIG_MMU && !CONFIG_COLDFIRE */ -void arch_sync_dma_for_device(struct device *dev, phys_addr_t handle, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t handle, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index a89c2d4ed5ff..d7bebd04247b 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -15,7 +15,7 @@ #include #include -static void __dma_sync(struct device *dev, phys_addr_t paddr, size_t size, +static void __dma_sync(phys_addr_t paddr, size_t size, enum dma_data_direction direction) { switch (direction) { @@ -31,14 +31,14 @@ static void __dma_sync(struct device *dev, phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum 
dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - __dma_sync(dev, paddr, size, dir); + __dma_sync(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - __dma_sync(dev, paddr, size, dir); + __dma_sync(paddr, size, dir); } diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c index 3d13c77c125f..df56bf4179e3 100644 --- a/arch/mips/bmips/dma.c +++ b/arch/mips/bmips/dma.c @@ -64,7 +64,7 @@ phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t dma_addr) return dma_addr; } -void arch_sync_dma_for_cpu_all(struct device *dev) +void arch_sync_dma_for_cpu_all(void) { void __iomem *cbr = BMIPS_GET_CBR(); u32 cfg; diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index a01e14955187..c64a297e82b3 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -592,7 +592,7 @@ static dma_addr_t jazz_dma_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return vdma_alloc(phys, size); } @@ -600,7 +600,7 @@ static void jazz_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_cpu(dev, vdma_log2phys(dma_addr), size, dir); + arch_sync_dma_for_cpu(vdma_log2phys(dma_addr), size, dir); vdma_free(dma_addr); } @@ -612,7 +612,7 @@ static int jazz_dma_map_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, sg, nents, i) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); sg->dma_address = 
vdma_alloc(sg_phys(sg), sg->length); if (sg->dma_address == DMA_MAPPING_ERROR) @@ -631,8 +631,7 @@ static void jazz_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, sg, nents, i) { if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, - dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); vdma_free(sg->dma_address); } } @@ -640,13 +639,13 @@ static void jazz_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, static void jazz_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - arch_sync_dma_for_device(dev, vdma_log2phys(addr), size, dir); + arch_sync_dma_for_device(vdma_log2phys(addr), size, dir); } static void jazz_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { - arch_sync_dma_for_cpu(dev, vdma_log2phys(addr), size, dir); + arch_sync_dma_for_cpu(vdma_log2phys(addr), size, dir); } static void jazz_dma_sync_sg_for_device(struct device *dev, @@ -656,7 +655,7 @@ static void jazz_dma_sync_sg_for_device(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } static void jazz_dma_sync_sg_for_cpu(struct device *dev, @@ -666,7 +665,7 @@ static void jazz_dma_sync_sg_for_cpu(struct device *dev, int i; for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); } const struct dma_map_ops jazz_dma_ops = { diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c index fcf6d3eaac66..dc42ffc83825 100644 --- a/arch/mips/mm/dma-noncoherent.c +++ b/arch/mips/mm/dma-noncoherent.c @@ -27,7 +27,7 @@ * R10000 and R12000 are used in such systems, the SGI IP28 Indigo² rsp. * SGI IP32 aka O2. 
*/ -static inline bool cpu_needs_post_dma_flush(struct device *dev) +static inline bool cpu_needs_post_dma_flush(void) { switch (boot_cpu_type()) { case CPU_R10000: @@ -112,17 +112,17 @@ static inline void dma_sync_phys(phys_addr_t paddr, size_t size, } while (left); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { dma_sync_phys(paddr, size, dir); } #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { - if (cpu_needs_post_dma_flush(dev)) + if (cpu_needs_post_dma_flush()) dma_sync_phys(paddr, size, dir); } #endif diff --git a/arch/nds32/kernel/dma.c b/arch/nds32/kernel/dma.c index 4206d4b6c8ce..69d762182d49 100644 --- a/arch/nds32/kernel/dma.c +++ b/arch/nds32/kernel/dma.c @@ -46,8 +46,8 @@ static inline void cache_op(phys_addr_t paddr, size_t size, } while (left); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_FROM_DEVICE: @@ -61,8 +61,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_TO_DEVICE: diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c index 9cb238664584..0ed711e37902 100644 --- a/arch/nios2/mm/dma-mapping.c +++ b/arch/nios2/mm/dma-mapping.c @@ -18,8 +18,8 @@ #include #include -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - 
size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *vaddr = phys_to_virt(paddr); @@ -42,8 +42,8 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *vaddr = phys_to_virt(paddr); diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 4d5b8bd1d795..adec711ad39d 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -125,7 +125,7 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, free_pages_exact(vaddr, size); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t addr, size_t size, +void arch_sync_dma_for_device(phys_addr_t addr, size_t size, enum dma_data_direction dir) { unsigned long cl; diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ca35d9a76e50..a60d47fd4d55 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -439,14 +439,14 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)__va(dma_handle), order); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { flush_kernel_dcache_range((unsigned long)phys_to_virt(paddr), size); } diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 2a82984356f8..5ab4f868e919 100644 --- 
a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -104,14 +104,14 @@ static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) #endif } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { __dma_sync_page(paddr, size, dir); } diff --git a/arch/sh/kernel/dma-coherent.c b/arch/sh/kernel/dma-coherent.c index b17514619b7e..eeb25a4fa55f 100644 --- a/arch/sh/kernel/dma-coherent.c +++ b/arch/sh/kernel/dma-coherent.c @@ -25,7 +25,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * Pages from the page allocator may have data present in * cache. So flush the cache before using uncached memory. */ - arch_sync_dma_for_device(dev, virt_to_phys(ret), size, + arch_sync_dma_for_device(virt_to_phys(ret), size, DMA_BIDIRECTIONAL); ret_nocache = (void __force *)ioremap_nocache(virt_to_phys(ret), size); @@ -59,8 +59,8 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, iounmap(vaddr); } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { void *addr = sh_cacheop_vaddr(phys_to_virt(paddr)); diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index f89603855f1e..e59461d03b9a 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -366,8 +366,8 @@ void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, /* IIep is write-through, not flushing on cpu to device transfer. 
*/ -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { if (dir != PCI_DMA_TODEVICE) dma_make_coherent(paddr, PAGE_ALIGN(size)); diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 1c82e21de4f6..72b6222daa0b 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -44,8 +44,8 @@ static void do_cache_op(phys_addr_t paddr, size_t size, } } -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: @@ -62,8 +62,8 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, } } -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir) +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f321279baf9e..0fa8c1d818b7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -659,7 +659,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_cpu(dev, phys, size, dir); + arch_sync_dma_for_cpu(phys, size, dir); } static void iommu_dma_sync_single_for_device(struct device *dev, @@ -671,7 +671,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); } static void iommu_dma_sync_sg_for_cpu(struct device *dev, @@ -685,7 +685,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, 
return; for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); } static void iommu_dma_sync_sg_for_device(struct device *dev, @@ -699,7 +699,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, return; for_each_sg(sgl, sg, nelems, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, @@ -714,7 +714,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, dma_handle =__iommu_dma_map(dev, phys, size, prot); if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && dma_handle != DMA_MAPPING_ERROR) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return dma_handle; } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index bd3a10dfac15..3f8b2cdb4acb 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -405,7 +405,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, done: if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - xen_dma_sync_for_device(dev, dev_addr, phys, size, dir); + xen_dma_sync_for_device(dev_addr, phys, size, dir); return dev_addr; } @@ -425,7 +425,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - xen_dma_sync_for_cpu(hwdev, dev_addr, paddr, size, dir); + xen_dma_sync_for_cpu(dev_addr, paddr, size, dir); /* NOTE: We use dev_addr here, not paddr! 
*/ if (is_xen_swiotlb_buffer(dev_addr)) @@ -439,7 +439,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, phys_addr_t paddr = xen_bus_to_phys(dma_addr); if (!dev_is_dma_coherent(dev)) - xen_dma_sync_for_cpu(dev, dma_addr, paddr, size, dir); + xen_dma_sync_for_cpu(dma_addr, paddr, size, dir); if (is_xen_swiotlb_buffer(dma_addr)) swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); @@ -455,7 +455,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - xen_dma_sync_for_device(dev, dma_addr, paddr, size, dir); + xen_dma_sync_for_device(dma_addr, paddr, size, dir); } /* diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h index e30fca1f1b12..ca9b5770caee 100644 --- a/include/linux/dma-noncoherent.h +++ b/include/linux/dma-noncoherent.h @@ -73,29 +73,29 @@ static inline void arch_dma_cache_sync(struct device *dev, void *vaddr, #endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE -void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir); +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir); #else -static inline void arch_sync_dma_for_device(struct device *dev, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { } #endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir); +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir); #else -static inline void arch_sync_dma_for_cpu(struct device *dev, - phys_addr_t paddr, size_t size, enum dma_data_direction dir) +static inline 
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) { } #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL -void arch_sync_dma_for_cpu_all(struct device *dev); +void arch_sync_dma_for_cpu_all(void); #else -static inline void arch_sync_dma_for_cpu_all(struct device *dev) +static inline void arch_sync_dma_for_cpu_all(void) { } #endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL */ diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index d71380f6ed0b..ffc0d3902b71 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -4,10 +4,10 @@ #include -void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir); -void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, - phys_addr_t paddr, size_t size, enum dma_data_direction dir); +void xen_dma_sync_for_cpu(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir); +void xen_dma_sync_for_device(dma_addr_t handle, phys_addr_t paddr, size_t size, + enum dma_data_direction dir); extern int xen_swiotlb_init(int verbose, bool early); extern const struct dma_map_ops xen_swiotlb_dma_ops; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 22a2e0833862..077876ae5c74 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -268,7 +268,7 @@ void dma_direct_sync_single_for_device(struct device *dev, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); } EXPORT_SYMBOL(dma_direct_sync_single_for_device); @@ -286,7 +286,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, sg->length, + arch_sync_dma_for_device(paddr, sg->length, dir); } } @@ -302,8 +302,8 @@ void 
dma_direct_sync_single_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(dev, paddr, size, dir); - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); } if (unlikely(is_swiotlb_buffer(paddr))) @@ -321,7 +321,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu(dev, paddr, sg->length, dir); + arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, @@ -329,7 +329,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, } if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu_all(); } EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); @@ -380,7 +380,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return dma_addr; } EXPORT_SYMBOL(dma_direct_map_page); -- cgit v1.2.3-59-g8ed1b From 50f579a2399dee0ad1c86ea8159ab8657b74f95b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Nov 2019 09:18:19 +0300 Subject: dma-debug: clean up put_hash_bucket() The put_hash_bucket() is a bit cleaner if it takes an unsigned long directly instead of a pointer to unsigned long. 
Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 004496654aaa..64972aa9bfc3 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -255,12 +255,10 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, * Give up exclusive access to the hash bucket */ static void put_hash_bucket(struct hash_bucket *bucket, - unsigned long *flags) + unsigned long flags) __releases(&bucket->lock) { - unsigned long __flags = *flags; - - spin_unlock_irqrestore(&bucket->lock, __flags); + spin_unlock_irqrestore(&bucket->lock, flags); } static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) @@ -359,7 +357,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, /* * Nothing found, go back a hash bucket */ - put_hash_bucket(*bucket, flags); + put_hash_bucket(*bucket, *flags); range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); @@ -609,7 +607,7 @@ static void add_dma_entry(struct dma_debug_entry *entry) bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { @@ -1002,7 +1000,7 @@ static void check_unmap(struct dma_debug_entry *ref) if (!entry) { /* must drop lock before calling dma_mapping_error */ - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); if (dma_mapping_error(ref->dev, ref->dev_addr)) { err_printk(ref->dev, NULL, @@ -1084,7 +1082,7 @@ static void check_unmap(struct dma_debug_entry *ref) hash_bucket_del(entry); dma_entry_free(entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_for_stack(struct device *dev, @@ -1204,7 +1202,7 @@ static void 
check_sync(struct device *dev, } out: - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_sg_segment(struct device *dev, struct scatterlist *sg) @@ -1319,7 +1317,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } } - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } EXPORT_SYMBOL(debug_dma_mapping_error); @@ -1392,7 +1390,7 @@ static int get_nr_mapped_entries(struct device *dev, if (entry) mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); return mapped_ents; } -- cgit v1.2.3-59-g8ed1b From 4268ac6ae5870af10a7417b22990d615f72f77e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:35:36 +0100 Subject: dma-direct: don't check swiotlb=force in dma_direct_map_resource When mapping resources we can't just use swiotlb ram for bounce buffering. Switch to a direct dma_capable check instead. Fixes: cfced786969c ("dma-mapping: remove the default map_resource implementation") Reported-by: Robin Murphy Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 077876ae5c74..a479bd2d1e8b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } -- cgit v1.2.3-59-g8ed1b From 68a33b1794665ba8a1d1ef1d3bfcc7c587d380a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:38:58 +0100 Subject: dma-direct: exclude dma_direct_map_resource from the min_low_pfn check The valid memory address check in dma_capable only makes 
sense when mapping normal memory, not when using dma_map_resource to map a device resource. Add a new boolean argument to dma_capable to exclude that check for the dma_map_resource case. Fixes: b12d66278dd6 ("dma-direct: check for overflows on 32 bit DMA addresses") Reported-by: Marek Szyprowski Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- arch/x86/kernel/amd_gart_64.c | 4 ++-- drivers/xen/swiotlb-xen.c | 4 ++-- include/linux/dma-direct.h | 5 +++-- kernel/dma/direct.c | 4 ++-- kernel/dma/swiotlb.c | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a6ac3712db8b..5cfab41e8509 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -185,13 +185,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || !dma_capable(dev, addr, size); + return force_iommu || !dma_capable(dev, addr, size, true); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !dma_capable(dev, addr, size); + return !dma_capable(dev, addr, size, true); } /* Map a single continuous physical area into the IOMMU. diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 3f8b2cdb4acb..b6d27762c6f8 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -375,7 +375,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. 
*/ - if (dma_capable(dev, dev_addr, size) && + if (dma_capable(dev, dev_addr, size, true) && !range_straddles_page_boundary(phys, size) && !xen_arch_need_swiotlb(dev, phys, dev_addr) && swiotlb_force != SWIOTLB_FORCE) @@ -397,7 +397,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, /* * Ensure that the address returned is DMA'ble */ - if (unlikely(!dma_capable(dev, dev_addr, size))) { + if (unlikely(!dma_capable(dev, dev_addr, size, true))) { swiotlb_tbl_unmap_single(dev, map, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return DMA_MAPPING_ERROR; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index f8959f75e496..99b77dd5f79b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -49,14 +49,15 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) return __sme_clr(__dma_to_phys(dev, daddr)); } -static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, + bool is_ram) { dma_addr_t end = addr + size - 1; if (!dev->dma_mask) return false; - if (!IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && + if (is_ram && !IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) return false; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a479bd2d1e8b..40f1f0aac4b1 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -363,7 +363,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size); + dma_capable(dev, dma_addr, size, true); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_capable(dev, dma_addr, size))) { + if 
(unlikely(!dma_capable(dev, dma_addr, size, false))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 673a2cdb2656..9280d6f8271e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -678,7 +678,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Ensure that the address returned is DMA'ble */ *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size))) { + if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return false; -- cgit v1.2.3-59-g8ed1b From 7b8474466ed97be458c825f34a85f2c2b84c3f95 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 21 Nov 2019 00:03:03 +0000 Subject: time: Zero the upper 32-bits in __kernel_timespec on 32-bit On compat interfaces, the high order bits of nanoseconds should be zeroed out. This is because the application code or the libc do not guarantee zeroing of these. If used without zeroing, kernel might be at risk of using timespec values incorrectly. Originally it was handled correctly, but lost during in_compat_syscall() cleanup. Revert the condition back to check CONFIG_64BIT.
Fixes: 98f76206b335 ("compat: Cleanup in_compat_syscall() callers") Reported-by: Ben Hutchings Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191121000303.126523-1-dima@arista.com --- kernel/time/time.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..83f403e7a15c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -881,7 +881,8 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || + in_compat_syscall())) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; -- cgit v1.2.3-59-g8ed1b From a7ba70f1787f977f970cd116076c6fce4b9e01cc Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 21 Nov 2019 10:26:44 +0100 Subject: dma-mapping: treat dev->bus_dma_mask as a DMA limit Using a mask to represent bus DMA constraints has a set of limitations. The biggest one being it can only hold a power of two (minus one). The DMA mapping code is already aware of this and treats dev->bus_dma_mask as a limit. This quirk is already used by some architectures although still rare. With the introduction of the Raspberry Pi 4 we've found a new contender for the use of bus DMA limits, as its PCIe bus can only address the lower 3GB of memory (of a total of 4GB). This is impossible to represent with a mask. To make things worse the device-tree code rounds non power of two bus DMA limits to the next power of two, which is unacceptable in this case. In the light of this, rename dev->bus_dma_mask to dev->bus_dma_limit all over the tree and treat it as such. Note that dev->bus_dma_limit should contain the highest accessible DMA address.
Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- arch/mips/pci/fixup-sb1250.c | 16 ++++++++-------- arch/powerpc/sysdev/fsl_pci.c | 6 +++--- arch/x86/kernel/pci-dma.c | 2 +- arch/x86/mm/mem_encrypt.c | 2 +- arch/x86/pci/sta2x11-fixup.c | 2 +- drivers/acpi/arm64/iort.c | 20 +++++++------------- drivers/ata/ahci.c | 2 +- drivers/iommu/dma-iommu.c | 3 +-- drivers/of/device.c | 9 +++++---- include/linux/device.h | 6 +++--- include/linux/dma-direct.h | 2 +- include/linux/dma-mapping.h | 2 +- kernel/dma/direct.c | 27 +++++++++++++-------------- 13 files changed, 46 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/arch/mips/pci/fixup-sb1250.c b/arch/mips/pci/fixup-sb1250.c index 8a41b359cf90..40efc990cdce 100644 --- a/arch/mips/pci/fixup-sb1250.c +++ b/arch/mips/pci/fixup-sb1250.c @@ -21,22 +21,22 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI, /* * The BCM1250, etc. PCI host bridge does not support DAC on its 32-bit - * bus, so we set the bus's DMA mask accordingly. However the HT link + * bus, so we set the bus's DMA limit accordingly. However the HT link * down the artificial PCI-HT bridge supports 40-bit addressing and the * SP1011 HT-PCI bridge downstream supports both DAC and a 64-bit bus * width, so we record the PCI-HT bridge's secondary and subordinate bus - * numbers and do not set the mask for devices present in the inclusive + * numbers and do not set the limit for devices present in the inclusive * range of those. 
*/ -struct sb1250_bus_dma_mask_exclude { +struct sb1250_bus_dma_limit_exclude { bool set; unsigned char start; unsigned char end; }; -static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) +static int sb1250_bus_dma_limit(struct pci_dev *dev, void *data) { - struct sb1250_bus_dma_mask_exclude *exclude = data; + struct sb1250_bus_dma_limit_exclude *exclude = data; bool exclude_this; bool ht_bridge; @@ -55,7 +55,7 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) exclude->start, exclude->end); } else { dev_dbg(&dev->dev, "disabling DAC for device"); - dev->dev.bus_dma_mask = DMA_BIT_MASK(32); + dev->dev.bus_dma_limit = DMA_BIT_MASK(32); } return 0; @@ -63,9 +63,9 @@ static int sb1250_bus_dma_mask(struct pci_dev *dev, void *data) static void quirk_sb1250_pci_dac(struct pci_dev *dev) { - struct sb1250_bus_dma_mask_exclude exclude = { .set = false }; + struct sb1250_bus_dma_limit_exclude exclude = { .set = false }; - pci_walk_bus(dev->bus, sb1250_bus_dma_mask, &exclude); + pci_walk_bus(dev->bus, sb1250_bus_dma_limit, &exclude); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SIBYTE, PCI_DEVICE_ID_BCM1250_PCI, quirk_sb1250_pci_dac); diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index ff0e2b156cb5..617a443d673d 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -115,8 +115,8 @@ static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); - pdev->dev.bus_dma_mask = - hose->dma_window_base_cur + hose->dma_window_size; + pdev->dev.bus_dma_limit = + hose->dma_window_base_cur + hose->dma_window_size - 1; } static void setup_swiotlb_ops(struct pci_controller *hose) @@ -135,7 +135,7 @@ static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) * mapping that allows addressing any RAM address from across PCI. 
*/ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { - dev->bus_dma_mask = 0; + dev->bus_dma_limit = 0; dev->archdata.dma_offset = pci64_dma_offset; } } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fa4352dce491..3a75d665d43c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -146,7 +146,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); + pdev->dev.bus_dma_limit = DMA_BIT_MASK(32); return 0; } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 9268c12458c8..a03614bd3e1a 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -367,7 +367,7 @@ bool force_dma_unencrypted(struct device *dev) if (sme_active()) { u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, - dev->bus_dma_mask); + dev->bus_dma_limit); if (dma_dev_mask <= dma_enc_mask) return true; diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 4a631264b809..c313d784efab 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -143,7 +143,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev) dev->dma_pfn_offset = PFN_DOWN(-amba_base); - dev->bus_dma_mask = max_amba_addr; + dev->bus_dma_limit = max_amba_addr; pci_set_consistent_dma_mask(pdev, max_amba_addr); pci_set_dma_mask(pdev, max_amba_addr); diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 5a7551d060f2..33f71983e001 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1057,8 +1057,8 @@ static int rc_dma_get_range(struct device *dev, u64 *size) */ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { - u64 mask, dmaaddr = 0, size = 0, offset = 0; - int ret, msb; + u64 end, mask, dmaaddr = 0, size = 0, offset = 0; + int ret; /* * If @dev is expected to be DMA-capable then the bus code that 
created @@ -1085,19 +1085,13 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) } if (!ret) { - msb = fls64(dmaaddr + size - 1); /* - * Round-up to the power-of-two mask or set - * the mask to the whole 64-bit address space - * in case the DMA region covers the full - * memory window. + * Limit coherent and dma mask based on size retrieved from + * firmware. */ - mask = msb == 64 ? U64_MAX : (1ULL << msb) - 1; - /* - * Limit coherent and dma mask based on size - * retrieved from firmware. - */ - dev->bus_dma_mask = mask; + end = dmaaddr + size - 1; + mask = DMA_BIT_MASK(ilog2(end) + 1); + dev->bus_dma_limit = end; dev->coherent_dma_mask = mask; *dev->dma_mask = mask; } diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 05c2b32dcc4d..7c6d06ffb586 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -897,7 +897,7 @@ static int ahci_configure_dma_masks(struct pci_dev *pdev, int using_dac) * value, don't extend it here. This happens on STA2X11, for example. * * XXX: manipulating the DMA mask from platform code is completely - * bogus, platform code should use dev->bus_dma_mask instead.. + * bogus, platform code should use dev->bus_dma_limit instead.. 
*/ if (pdev->dma_mask && pdev->dma_mask < DMA_BIT_MASK(32)) return 0; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 0fa8c1d818b7..646332fbf3d7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -405,8 +405,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1))) iova_len = roundup_pow_of_two(iova_len); - if (dev->bus_dma_mask) - dma_limit &= dev->bus_dma_mask; + dma_limit = min_not_zero(dma_limit, dev->bus_dma_limit); if (domain->geometry.force_aperture) dma_limit = min(dma_limit, domain->geometry.aperture_end); diff --git a/drivers/of/device.c b/drivers/of/device.c index da8158392010..e9127db7b067 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -93,7 +93,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) bool coherent; unsigned long offset; const struct iommu_ops *iommu; - u64 mask; + u64 mask, end; ret = of_dma_get_range(np, &dma_addr, &paddr, &size); if (ret < 0) { @@ -148,12 +148,13 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) * Limit coherent and dma mask based on size and default mask * set by the driver. 
*/ - mask = DMA_BIT_MASK(ilog2(dma_addr + size - 1) + 1); + end = dma_addr + size - 1; + mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; - /* ...but only set bus mask if we found valid dma-ranges earlier */ + /* ...but only set bus limit if we found valid dma-ranges earlier */ if (!ret) - dev->bus_dma_mask = mask; + dev->bus_dma_limit = end; coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", diff --git a/include/linux/device.h b/include/linux/device.h index 297239a08bb7..e396de656f20 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1186,8 +1186,8 @@ struct dev_links_info { * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. - * @bus_dma_mask: Mask of an upstream bridge or bus which imposes a smaller DMA - * limit than the device itself supports. + * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller + * DMA limit than the device itself supports. * @dma_pfn_offset: offset of DMA memory range relatively of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. @@ -1270,7 +1270,7 @@ struct device { not all hardware supports 64 bit addresses for consistent allocations such descriptors. 
*/ - u64 bus_dma_mask; /* upstream dma_mask constraint */ + u64 bus_dma_limit; /* upstream dma constraint */ unsigned long dma_pfn_offset; struct device_dma_parameters *dma_parms; diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 452f5280cde3..24b8684aa21d 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -63,7 +63,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size, min(addr, end) < phys_to_dma(dev, PFN_PHYS(min_low_pfn))) return false; - return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_mask); + return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit); } u64 dma_direct_get_required_mask(struct device *dev); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4d450672b7d6..c4d8741264bd 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -697,7 +697,7 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) */ static inline bool dma_addressing_limited(struct device *dev) { - return min_not_zero(dma_get_mask(dev), dev->bus_dma_mask) < + return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < dma_get_required_mask(dev); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 267b23a13b69..6af7ae83c4ad 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -27,10 +27,10 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { if (!dev->dma_mask) { dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { + } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); + "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); } WARN_ON_ONCE(1); } @@ -57,15 +57,14 @@ u64 
dma_direct_get_required_mask(struct device *dev) } static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_mask) + u64 *phys_limit) { - if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) - dma_mask = dev->bus_dma_mask; + u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); if (force_dma_unencrypted(dev)) - *phys_mask = __dma_to_phys(dev, dma_mask); + *phys_limit = __dma_to_phys(dev, dma_limit); else - *phys_mask = dma_to_phys(dev, dma_mask); + *phys_limit = dma_to_phys(dev, dma_limit); /* * Optimistically try the zone that the physical address mask falls @@ -75,9 +74,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ - if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits)) + if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; - if (*phys_mask <= DMA_BIT_MASK(32)) + if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } @@ -85,7 +84,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, @@ -94,7 +93,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; - u64 phys_mask; + u64 phys_limit; if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -102,7 +101,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); + &phys_limit); page = dma_alloc_contiguous(dev, 
alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -116,7 +115,7 @@ again: page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - phys_mask < DMA_BIT_MASK(64) && + phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; -- cgit v1.2.3-59-g8ed1b From 0e4a459f56c32d3e52ae69a4b447db2f48a65f44 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Wed, 20 Nov 2019 19:43:50 +0900 Subject: tracing: Remove unnecessary DEBUG_FS dependency Tracing replaced debugfs with tracefs. Signed-off-by: Kusanagi Kouichi Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20191120104350753.EWCT.12796.ppp.dion.ne.jp@dmta0009.auone-net.jp Signed-off-by: Greg Kroah-Hartman --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..382628b9b759 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -106,7 +106,6 @@ config PREEMPTIRQ_TRACEPOINTS config TRACING bool - select DEBUG_FS select RING_BUFFER select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS -- cgit v1.2.3-59-g8ed1b From a82a4804b4ee3636c8988fea14d44f70f4de45f1 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 16 Nov 2019 10:05:55 -0500 Subject: ring-buffer: Fix typos in function ring_buffer_producer Fix spelling and other typos Link: http://lkml.kernel.org/r/1573916755-32478-1-git-send-email-xianting_tian@126.com Signed-off-by: Xianting Tian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 09b0b49f346e..32149e46551c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -269,10 +269,10 @@ static void ring_buffer_producer(void) #ifndef 
CONFIG_PREEMPTION /* - * If we are a non preempt kernel, the 10 second run will + * If we are a non preempt kernel, the 10 seconds run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a - * rescedule. + * reschedule. * * Do a cond resched at the same frequency we would wake up * the reader. -- cgit v1.2.3-59-g8ed1b From fc809bc5ceaa665497384064ab2d76713c774bad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:38:07 +0800 Subject: tracing: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Link: http://lkml.kernel.org/r/20191120133807.12741-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b872716bb2a0..f67620499faa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,7 +79,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -311,7 +311,7 @@ config TRACER_SNAPSHOT cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP - bool "Allow snapshot to swap per CPU" + bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT select RING_BUFFER_ALLOW_SWAP help @@ -683,7 +683,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. config TRACEPOINT_BENCHMARK - bool "Add tracepoint that benchmarks tracepoints" + bool "Add tracepoint that benchmarks tracepoints" help This option creates the tracepoint "benchmark:benchmark_event". 
When the tracepoint is enabled, it kicks off a kernel thread that @@ -732,7 +732,7 @@ config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER help - Run a simple self test on the ring buffer on boot up. Late in the + Run a simple self test on the ring buffer on boot up. Late in the kernel boot sequence, the test will start that kicks off a thread per cpu. Each thread will write various size events into the ring buffer. Another thread is created to send IPIs -- cgit v1.2.3-59-g8ed1b From 28879787147358e8ffcae397f11748de3dd26577 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Nov 2019 11:08:38 -0800 Subject: tracing: Adding new functions for kernel access to Ftrace instances Adding 2 new functions - 1) struct trace_array *trace_array_get_by_name(const char *name); Return pointer to a trace array with given name. If it does not exist, create and return pointer to the new trace array. 2) int trace_array_set_clr_event(struct trace_array *tr, const char *system ,const char *event, bool enable); Enable/Disable events to this trace array. Additionally, - To handle reference counters, export trace_array_put() - Due to introduction of the above 2 new functions, we no longer need to export - ftrace_set_clr_event & trace_array_create APIs. 
Link: http://lkml.kernel.org/r/1574276919-11119-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Aruna Ramakrishna Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 3 +- include/linux/trace_events.h | 3 +- kernel/trace/trace.c | 96 +++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 1 - kernel/trace/trace_events.c | 27 ++++++++++++- 5 files changed, 106 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace.h b/include/linux/trace.h index 24fcf07812ae..7fd86d3c691f 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -29,7 +29,8 @@ struct trace_array; void trace_printk_init_buffers(void); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); -struct trace_array *trace_array_create(const char *name); +void trace_array_put(struct trace_array *tr); +struct trace_array *trace_array_get_by_name(const char *name); int trace_array_destroy(struct trace_array *tr); #endif /* CONFIG_TRACING */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 60a41b7069dd..4c6e15605766 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -555,7 +555,8 @@ extern int trace_event_get_offsets(struct trace_event_call *call); int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); - +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable); /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42659ce6ac0c..02a23a6e5e00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -301,12 +301,24 @@ static void __trace_array_put(struct trace_array *this_tr) this_tr->ref--; } +/** + * trace_array_put - Decrement the 
reference counter for this trace array. + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. + * + */ void trace_array_put(struct trace_array *this_tr) { + if (!this_tr) + return; + mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } +EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { @@ -8437,24 +8449,15 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -struct trace_array *trace_array_create(const char *name) +static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -EEXIST; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; - } - ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) - goto out_unlock; + return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) @@ -8499,8 +8502,8 @@ struct trace_array *trace_array_create(const char *name) list_add(&tr->list, &ftrace_trace_arrays); - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + tr->ref++; + return tr; @@ -8510,24 +8513,77 @@ struct trace_array *trace_array_create(const char *name) kfree(tr->name); kfree(tr); - out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(trace_array_create); static int instance_mkdir(const char *name) { - return PTR_ERR_OR_ZERO(trace_array_create(name)); + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); + 
+out_unlock: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return ret; +} + +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * + */ +struct trace_array *trace_array_get_by_name(const char *name) +{ + struct trace_array *tr; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + if (IS_ERR(tr)) + tr = NULL; +out_unlock: + if (tr) + tr->ref++; + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return tr; } +EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; - if (tr->ref || (tr->current_trace && tr->current_trace->ref)) + /* Reference counter for a newly created trace array = 1. 
*/ + if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2df8aed6a8f0..ca7fccafbcbb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,7 +345,6 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); -extern void trace_array_put(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2a3ac2365445..6b3a69e9aa6a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -827,7 +827,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } -EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -852,6 +851,32 @@ int trace_set_clr_event(const char *system, const char *event, int set) } EXPORT_SYMBOL_GPL(trace_set_clr_event); +/** + * trace_array_set_clr_event - enable or disable an event for a trace array. + * @tr: concerned trace array. + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @enable: true to enable, false to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable) +{ + int set; + + if (!tr) + return -ENOENT; + + set = (enable == true) ? 
1 : 0; + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_array_set_clr_event); + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3-59-g8ed1b From 0e24220821b0e0e330a18bfef29ac6396545d62e Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Fri, 15 Nov 2019 23:44:42 +0000 Subject: tracing: Use xarray for syscall trace events Currently, a lot of memory is wasted for architectures like MIPS when init_ftrace_syscalls() allocates the array for syscalls using kcalloc. This is because syscall numbers start from 4000, 5000 or 6000 and array elements up to that point are unused. Fix this by using a data structure more suited to storing sparsely populated arrays. The XARRAY data structure, implemented using radix trees, is much more memory efficient for storing the syscalls in question. Link: http://lkml.kernel.org/r/20191115234314.21599-1-hnaveed@wavecomp.com Signed-off-by: Hassan Naveed Reviewed-by: Paul Burton Signed-off-by: Steven Rostedt (VMware) --- arch/Kconfig | 8 ++++++++ kernel/trace/trace_syscalls.c | 32 +++++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 5f8a5d84dbbe..69c87e8608d8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -960,6 +960,14 @@ config RELR config ARCH_HAS_MEM_ENCRYPT bool +config HAVE_SPARSE_SYSCALL_NR + bool + help + An architecture should select this if its syscall numbering is sparse + to save space. For example, MIPS architecture has a syscall array with + entries at 4000, 5000 and 6000 locations. This option turns on syscall + related optimizations for a given architecture. 
+ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index fa8fbff736d6..16fa218556fa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -7,6 +7,7 @@ #include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include #include +#include #include #include "trace_output.h" @@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call) extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; +static DEFINE_XARRAY(syscalls_metadata_sparse); static struct syscall_metadata **syscalls_metadata; #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME @@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall) static struct syscall_metadata *syscall_nr_to_meta(int nr) { + if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) + return xa_load(&syscalls_metadata_sparse, (unsigned long)nr); + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; @@ -536,12 +541,16 @@ void __init init_ftrace_syscalls(void) struct syscall_metadata *meta; unsigned long addr; int i; - - syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), - GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return; + void *ret; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata = kcalloc(NR_syscalls, + sizeof(*syscalls_metadata), + GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } } for (i = 0; i < NR_syscalls; i++) { @@ -551,7 +560,16 @@ void __init init_ftrace_syscalls(void) continue; meta->syscall_nr = i; - syscalls_metadata[i] = meta; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata[i] = meta; + } else { + ret = xa_store(&syscalls_metadata_sparse, i, meta, + GFP_KERNEL); + WARN(xa_is_err(ret), + "Syscall memory allocation failed\n"); + } + } } -- cgit v1.2.3-59-g8ed1b From 107e899874e95dcddc779142942bf285eba38bc5 Mon Sep 17 00:00:00 2001 
From: Jason Gunthorpe Date: Tue, 12 Nov 2019 16:22:21 -0400 Subject: mm/hmm: define the pre-processor related parts of hmm.h even if disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the function calls are stubbed out with static inlines that always fail. This is the standard way to write a header for an optional component and makes it easier for drivers that only optionally need HMM_MIRROR. Link: https://lore.kernel.org/r/20191112202231.3856-5-jgg@ziepe.ca Reviewed-by: Jérôme Glisse Tested-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- include/linux/hmm.h | 59 ++++++++++++++++++++++++++++++++++++++++++----------- kernel/fork.c | 1 - 2 files changed, 47 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index fbb35c78637e..cb69bf10dc78 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -62,8 +62,6 @@ #include #include -#ifdef CONFIG_HMM_MIRROR - #include #include #include @@ -374,6 +372,15 @@ struct hmm_mirror { struct list_head list; }; +/* + * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. + */ +#define HMM_FAULT_ALLOW_RETRY (1 << 0) + +/* Don't fault in missing PTEs, just snapshot the current state. */ +#define HMM_FAULT_SNAPSHOT (1 << 1) + +#ifdef CONFIG_HMM_MIRROR int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); @@ -383,14 +390,6 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror); void hmm_range_unregister(struct hmm_range *range); -/* - * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. - */ -#define HMM_FAULT_ALLOW_RETRY (1 << 0) - -/* Don't fault in missing PTEs, just snapshot the current state. 
*/ -#define HMM_FAULT_SNAPSHOT (1 << 1) - long hmm_range_fault(struct hmm_range *range, unsigned int flags); long hmm_range_dma_map(struct hmm_range *range, @@ -401,6 +400,44 @@ long hmm_range_dma_unmap(struct hmm_range *range, struct device *device, dma_addr_t *daddrs, bool dirty); +#else +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) +{ + return -EOPNOTSUPP; +} + +void hmm_mirror_unregister(struct hmm_mirror *mirror) +{ +} + +int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) +{ + return -EOPNOTSUPP; +} + +void hmm_range_unregister(struct hmm_range *range) +{ +} + +static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags) +{ + return -EOPNOTSUPP; +} + +static inline long hmm_range_dma_map(struct hmm_range *range, + struct device *device, dma_addr_t *daddrs, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + +static inline long hmm_range_dma_unmap(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, bool dirty) +{ + return -EOPNOTSUPP; +} +#endif /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range @@ -411,6 +448,4 @@ long hmm_range_dma_unmap(struct hmm_range *range, */ #define HMM_RANGE_DEFAULT_TIMEOUT 1000 -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - #endif /* LINUX_HMM_H */ diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..ca39cfc404e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From b111df8447acdeb4b9220f99d5d4b28f83eb56ad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Nov 2019 21:25:46 +0100 Subject: y2038: alarm: fix half-second cut-off Changing alarm_itimer accidentally broke the logic for arithmetic rounding of half seconds in the return code. Change it to a constant based on NSEC_PER_SEC, as suggested by Ben Hutchings. 
Fixes: bd40a175769d ("y2038: itimer: change implementation to timespec64") Reported-by: Ben Hutchings Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 5872db9bd5f7..9e59c9ea92aa 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -297,7 +297,7 @@ static unsigned int alarm_setitimer(unsigned int seconds) * better return too much than too little anyway */ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || - it_old.it_value.tv_nsec >= 500000) + it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2)) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; -- cgit v1.2.3-59-g8ed1b From 61a47c1ad3a4dc6882f01ebdc88138ac62d0df03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Oct 2019 13:01:19 -0500 Subject: sysctl: Remove the sysctl system call This system call has been deprecated almost since it was introduced, and in a survey of the linux distributions I can no longer find any of them that enable CONFIG_SYSCTL_SYSCALL. The only indication that I can find that anyone might care is that a few of the defconfigs in the kernel enable CONFIG_SYSCTL_SYSCALL. However this appears in only 31 of 414 defconfigs in the kernel, so I suspect this symbol's presence is simply because it is harmless to include rather than because it is necessary. As there appear to be no users of the sysctl system call, remove the code. As this removes one of the few uses of the internal kernel mount of proc I hope this allows for even more simplifications of the proc filesystem. 
Cc: Alex Smith Cc: Anders Berg Cc: Apelete Seketeli Cc: Arnd Bergmann Cc: Chee Nouk Phoon Cc: Chris Zankel Cc: Christian Ruppert Cc: Greg Ungerer Cc: Harvey Hunt Cc: Helge Deller Cc: Hongliang Tao Cc: Hua Yan Cc: Huacai Chen Cc: John Crispin Cc: Jonas Jensen Cc: Josh Boyer Cc: Jun Nie Cc: Kevin Hilman Cc: Kevin Wells Cc: Kumar Gala Cc: Lars-Peter Clausen Cc: Ley Foon Tan Cc: Linus Walleij Cc: Markos Chandras Cc: Max Filippov Cc: Noam Camus Cc: Olof Johansson Cc: Paul Burton Cc: Paul Mundt Cc: Phil Edworthy Cc: Pierrick Hascoet Cc: Ralf Baechle Cc: Roland Stigge Cc: Santosh Shilimkar Cc: Scott Telford Cc: Stephen Boyd Cc: Steven J. Hill Cc: Tanmay Inamdar Cc: Vineet Gupta Cc: Wolfram Sang Acked-by: Andi Kleen Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- arch/arc/configs/nps_defconfig | 1 - arch/arc/configs/tb10x_defconfig | 1 - arch/arm/configs/axm55xx_defconfig | 1 - arch/arm/configs/keystone_defconfig | 1 - arch/arm/configs/lpc32xx_defconfig | 1 - arch/arm/configs/moxart_defconfig | 1 - arch/arm/configs/qcom_defconfig | 1 - arch/arm/configs/zx_defconfig | 1 - arch/m68k/configs/m5475evb_defconfig | 1 - arch/mips/configs/ci20_defconfig | 1 - arch/mips/configs/loongson3_defconfig | 1 - arch/mips/configs/malta_qemu_32r6_defconfig | 1 - arch/mips/configs/maltaaprp_defconfig | 1 - arch/mips/configs/maltasmvp_defconfig | 1 - arch/mips/configs/maltasmvp_eva_defconfig | 1 - arch/mips/configs/maltaup_defconfig | 1 - arch/mips/configs/omega2p_defconfig | 1 - arch/mips/configs/qi_lb60_defconfig | 1 - arch/mips/configs/vocore2_defconfig | 1 - arch/nios2/configs/10m50_defconfig | 1 - arch/nios2/configs/3c120_defconfig | 1 - arch/parisc/configs/c8000_defconfig | 1 - arch/parisc/configs/generic-32bit_defconfig | 1 - arch/powerpc/configs/40x/klondike_defconfig | 1 - arch/sh/configs/rsk7264_defconfig | 1 - arch/xtensa/configs/audio_kc705_defconfig | 1 - arch/xtensa/configs/cadence_csp_defconfig | 1 - arch/xtensa/configs/generic_kc705_defconfig | 1 - 
arch/xtensa/configs/iss_defconfig | 1 - arch/xtensa/configs/nommu_kc705_defconfig | 1 - arch/xtensa/configs/smp_lx200_defconfig | 1 - arch/xtensa/configs/virt_defconfig | 1 - init/Kconfig | 17 - kernel/sysctl_binary.c | 1305 --------------------------- 34 files changed, 1354 deletions(-) (limited to 'kernel') diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig index 5978d4d7d5b0..07f26ed39f02 100644 --- a/arch/arc/configs/nps_defconfig +++ b/arch/arc/configs/nps_defconfig @@ -7,7 +7,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y -CONFIG_SYSCTL_SYSCALL=y # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set # CONFIG_TIMERFD is not set diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 3a138f8c7299..a12656ec0072 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -15,7 +15,6 @@ CONFIG_INITRAMFS_ROOT_UID=2100 CONFIG_INITRAMFS_ROOT_GID=501 # CONFIG_RD_GZIP is not set CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arm/configs/axm55xx_defconfig b/arch/arm/configs/axm55xx_defconfig index 31bfe1647d28..f53634af014b 100644 --- a/arch/arm/configs/axm55xx_defconfig +++ b/arch/arm/configs/axm55xx_defconfig @@ -20,7 +20,6 @@ CONFIG_NAMESPACES=y CONFIG_SCHED_AUTOGROUP=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig index 3d5f5b501330..f33f5d76365f 100644 --- a/arch/arm/configs/keystone_defconfig +++ b/arch/arm/configs/keystone_defconfig @@ -11,7 +11,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_SCHED=y CONFIG_BLK_CGROUP=y CONFIG_BLK_DEV_INITRD=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y # CONFIG_ELF_CORE is not set # CONFIG_BASE_FULL is not set diff 
--git a/arch/arm/configs/lpc32xx_defconfig b/arch/arm/configs/lpc32xx_defconfig index 09deb57db942..989bcc84e7fb 100644 --- a/arch/arm/configs/lpc32xx_defconfig +++ b/arch/arm/configs/lpc32xx_defconfig @@ -9,7 +9,6 @@ CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y # CONFIG_ARCH_MULTI_V7 is not set diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig index 9b98761e51c9..45d27190c9c9 100644 --- a/arch/arm/configs/moxart_defconfig +++ b/arch/arm/configs/moxart_defconfig @@ -4,7 +4,6 @@ CONFIG_SYSVIPC=y CONFIG_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y -CONFIG_SYSCTL_SYSCALL=y # CONFIG_ELF_CORE is not set # CONFIG_BASE_FULL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig index 02f1e7b7c8f6..67c306fff376 100644 --- a/arch/arm/configs/qcom_defconfig +++ b/arch/arm/configs/qcom_defconfig @@ -5,7 +5,6 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_CGROUPS=y CONFIG_BLK_DEV_INITRD=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_SLUB_DEBUG is not set diff --git a/arch/arm/configs/zx_defconfig b/arch/arm/configs/zx_defconfig index c4070c19ea6c..4d2ef785ed34 100644 --- a/arch/arm/configs/zx_defconfig +++ b/arch/arm/configs/zx_defconfig @@ -11,7 +11,6 @@ CONFIG_RT_GROUP_SCHED=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/m68k/configs/m5475evb_defconfig b/arch/m68k/configs/m5475evb_defconfig index 434bd3750966..579fd98afed6 100644 --- a/arch/m68k/configs/m5475evb_defconfig +++ b/arch/m68k/configs/m5475evb_defconfig @@ -1,6 +1,5 @@ # CONFIG_SWAP is not set CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSCTL_SYSCALL=y # CONFIG_KALLSYMS is not set # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set diff --git 
a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index cb4aa23a2bf4..be41df2a81fb 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -17,7 +17,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig index 90ee0084d786..409dc9a43f58 100644 --- a/arch/mips/configs/loongson3_defconfig +++ b/arch/mips/configs/loongson3_defconfig @@ -21,7 +21,6 @@ CONFIG_SCHED_AUTOGROUP=y CONFIG_SYSFS_DEPRECATED=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_MACH_LOONGSON64=y CONFIG_LOONGSON_MACH3X=y diff --git a/arch/mips/configs/malta_qemu_32r6_defconfig b/arch/mips/configs/malta_qemu_32r6_defconfig index e6c600dc1814..614af02d83e6 100644 --- a/arch/mips/configs/malta_qemu_32r6_defconfig +++ b/arch/mips/configs/malta_qemu_32r6_defconfig @@ -5,7 +5,6 @@ CONFIG_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=15 -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y CONFIG_MIPS_MALTA=y diff --git a/arch/mips/configs/maltaaprp_defconfig b/arch/mips/configs/maltaaprp_defconfig index 82b44b774553..9c051f8fd330 100644 --- a/arch/mips/configs/maltaaprp_defconfig +++ b/arch/mips/configs/maltaaprp_defconfig @@ -5,7 +5,6 @@ CONFIG_AUDIT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=15 -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y CONFIG_MIPS_MALTA=y diff --git a/arch/mips/configs/maltasmvp_defconfig b/arch/mips/configs/maltasmvp_defconfig index 4190fc6189a0..2e90d97551d6 100644 --- a/arch/mips/configs/maltasmvp_defconfig +++ b/arch/mips/configs/maltasmvp_defconfig @@ -5,7 +5,6 @@ CONFIG_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=15 -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y 
CONFIG_MIPS_MALTA=y diff --git a/arch/mips/configs/maltasmvp_eva_defconfig b/arch/mips/configs/maltasmvp_eva_defconfig index a13c10e910ec..d1f7fdb27284 100644 --- a/arch/mips/configs/maltasmvp_eva_defconfig +++ b/arch/mips/configs/maltasmvp_eva_defconfig @@ -5,7 +5,6 @@ CONFIG_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=15 -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y CONFIG_MIPS_MALTA=y diff --git a/arch/mips/configs/maltaup_defconfig b/arch/mips/configs/maltaup_defconfig index b35f1fc690fb..48e5bd492452 100644 --- a/arch/mips/configs/maltaup_defconfig +++ b/arch/mips/configs/maltaup_defconfig @@ -6,7 +6,6 @@ CONFIG_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=15 -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y CONFIG_MIPS_MALTA=y diff --git a/arch/mips/configs/omega2p_defconfig b/arch/mips/configs/omega2p_defconfig index a39426e57e91..fc39ddf610a9 100644 --- a/arch/mips/configs/omega2p_defconfig +++ b/arch/mips/configs/omega2p_defconfig @@ -16,7 +16,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/mips/configs/qi_lb60_defconfig b/arch/mips/configs/qi_lb60_defconfig index d3f4d5248d9f..97c9a69d1528 100644 --- a/arch/mips/configs/qi_lb60_defconfig +++ b/arch/mips/configs/qi_lb60_defconfig @@ -2,7 +2,6 @@ CONFIG_SYSVIPC=y # CONFIG_CROSS_MEMORY_ATTACH is not set CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/mips/configs/vocore2_defconfig b/arch/mips/configs/vocore2_defconfig index 523b944fd527..a14f8ea5c386 100644 --- a/arch/mips/configs/vocore2_defconfig +++ b/arch/mips/configs/vocore2_defconfig @@ -16,7 +16,6 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y 
CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/nios2/configs/10m50_defconfig b/arch/nios2/configs/10m50_defconfig index 1137ef2ed3b0..a7967b4cfb6e 100644 --- a/arch/nios2/configs/10m50_defconfig +++ b/arch/nios2/configs/10m50_defconfig @@ -2,7 +2,6 @@ CONFIG_SYSVIPC=y CONFIG_NO_HZ_IDLE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSCTL_SYSCALL=y # CONFIG_ELF_CORE is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/nios2/configs/3c120_defconfig b/arch/nios2/configs/3c120_defconfig index a0f160ba7598..423a0c40a162 100644 --- a/arch/nios2/configs/3c120_defconfig +++ b/arch/nios2/configs/3c120_defconfig @@ -2,7 +2,6 @@ CONFIG_SYSVIPC=y CONFIG_NO_HZ_IDLE=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_LOG_BUF_SHIFT=14 -CONFIG_SYSCTL_SYSCALL=y # CONFIG_ELF_CORE is not set # CONFIG_EPOLL is not set # CONFIG_SIGNALFD is not set diff --git a/arch/parisc/configs/c8000_defconfig b/arch/parisc/configs/c8000_defconfig index 507f0644fcf8..db864b18962a 100644 --- a/arch/parisc/configs/c8000_defconfig +++ b/arch/parisc/configs/c8000_defconfig @@ -9,7 +9,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index 18b072a47a10..c7a5726728a4 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -8,7 +8,6 @@ CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_PERF_EVENTS=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/powerpc/configs/40x/klondike_defconfig b/arch/powerpc/configs/40x/klondike_defconfig index 4347a87088dc..579fa846839c 100644 --- a/arch/powerpc/configs/40x/klondike_defconfig +++ b/arch/powerpc/configs/40x/klondike_defconfig @@ -4,7 +4,6 @@ 
CONFIG_LOG_BUF_SHIFT=14 CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_BLK_DEV_INITRD=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y CONFIG_MODULES=y diff --git a/arch/sh/configs/rsk7264_defconfig b/arch/sh/configs/rsk7264_defconfig index 2b0572b497c1..78643191c99e 100644 --- a/arch/sh/configs/rsk7264_defconfig +++ b/arch/sh/configs/rsk7264_defconfig @@ -8,7 +8,6 @@ CONFIG_NAMESPACES=y CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_COUNTERS=y diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig index f378e56f9ce6..b6367af71d65 100644 --- a/arch/xtensa/configs/audio_kc705_defconfig +++ b/arch/xtensa/configs/audio_kc705_defconfig @@ -16,7 +16,6 @@ CONFIG_SCHED_AUTOGROUP=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig index 62f32a902568..f4eef6decd2a 100644 --- a/arch/xtensa/configs/cadence_csp_defconfig +++ b/arch/xtensa/configs/cadence_csp_defconfig @@ -21,7 +21,6 @@ CONFIG_INITRAMFS_SOURCE="$$KERNEL_INITRAMFS_SOURCE" # CONFIG_RD_LZO is not set # CONFIG_RD_LZ4 is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_PROFILING=y CONFIG_MODULES=y diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig index 8bebe07f1060..c925165cf760 100644 --- a/arch/xtensa/configs/generic_kc705_defconfig +++ b/arch/xtensa/configs/generic_kc705_defconfig @@ -16,7 +16,6 @@ CONFIG_SCHED_AUTOGROUP=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index 
4bb5b76d9524..d1c01742baf4 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set # CONFIG_PCI is not set diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig index 933ab2adf434..380e366730d5 100644 --- a/arch/xtensa/configs/nommu_kc705_defconfig +++ b/arch/xtensa/configs/nommu_kc705_defconfig @@ -21,7 +21,6 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_RD_LZO is not set # CONFIG_RD_LZ4 is not set CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_PERF_EVENTS=y CONFIG_MODULES=y diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig index e29c5b179a5b..d46b58f34098 100644 --- a/arch/xtensa/configs/smp_lx200_defconfig +++ b/arch/xtensa/configs/smp_lx200_defconfig @@ -16,7 +16,6 @@ CONFIG_SCHED_AUTOGROUP=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_PROFILING=y CONFIG_OPROFILE=y diff --git a/arch/xtensa/configs/virt_defconfig b/arch/xtensa/configs/virt_defconfig index bfc45a138e72..4fddd8512350 100644 --- a/arch/xtensa/configs/virt_defconfig +++ b/arch/xtensa/configs/virt_defconfig @@ -15,7 +15,6 @@ CONFIG_SCHED_AUTOGROUP=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y -CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS_ALL=y CONFIG_PERF_EVENTS=y CONFIG_XTENSA_VARIANT_DC233C=y diff --git a/init/Kconfig b/init/Kconfig index b4daad2bac23..a408116c7719 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1372,23 +1372,6 @@ config SYSFS_SYSCALL If unsure say Y here. -config SYSCTL_SYSCALL - bool "Sysctl syscall support" if EXPERT - depends on PROC_SYSCTL - default n - select SYSCTL - ---help--- - sys_sysctl uses binary paths that have been found challenging - to properly maintain and use. 
The interface in /proc/sys - using paths with ascii names is now the primary path to this - information. - - Almost nothing using the binary sysctl interface so if you are - trying to save some space it is probably safe to disable this, - making your kernel marginally smaller. - - If unsure say N here. - config FHANDLE bool "open by fhandle syscalls" if EXPERT select EXPORTFS diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 73c132095a7b..7d550cc76a3b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -18,1317 +18,12 @@ #include #include -#ifdef CONFIG_SYSCTL_SYSCALL - -struct bin_table; -typedef ssize_t bin_convert_t(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); - -static bin_convert_t bin_dir; -static bin_convert_t bin_string; -static bin_convert_t bin_intvec; -static bin_convert_t bin_ulongvec; -static bin_convert_t bin_uuid; -static bin_convert_t bin_dn_node_address; - -#define CTL_DIR bin_dir -#define CTL_STR bin_string -#define CTL_INT bin_intvec -#define CTL_ULONG bin_ulongvec -#define CTL_UUID bin_uuid -#define CTL_DNADR bin_dn_node_address - -#define BUFSZ 256 - -struct bin_table { - bin_convert_t *convert; - int ctl_name; - const char *procname; - const struct bin_table *child; -}; - -static const struct bin_table bin_random_table[] = { - { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, - { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, - { CTL_UUID, RANDOM_UUID, "uuid" }, - {} -}; - -static const struct bin_table bin_pty_table[] = { - { CTL_INT, PTY_MAX, "max" }, - { CTL_INT, PTY_NR, "nr" }, - {} -}; - -static const struct bin_table bin_kern_table[] = { - { CTL_STR, KERN_OSTYPE, "ostype" }, - { CTL_STR, KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { CTL_STR, KERN_VERSION, "version" }, - /* 
KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { CTL_STR, KERN_NODENAME, "hostname" }, - { CTL_STR, KERN_DOMAINNAME, "domainname" }, - - { CTL_INT, KERN_PANIC, "panic" }, - { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, - - { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, - { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, - { CTL_INT, KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { CTL_STR, KERN_MODPROBE, "modprobe" }, - { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, - { CTL_INT, KERN_ACCT, "acct" }, - /* KERN_PPC_L2CR "l2cr" no longer used */ - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { CTL_ULONG, KERN_SHMMAX, "shmmax" }, - { CTL_INT, KERN_MSGMAX, "msgmax" }, - { CTL_INT, KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { CTL_INT, KERN_SYSRQ, "sysrq" }, - { CTL_INT, KERN_MAX_THREADS, "threads-max" }, - { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, - { CTL_ULONG, KERN_SHMALL, "shmall" }, - { CTL_INT, KERN_MSGMNI, "msgmni" }, - { CTL_INT, KERN_SEM, "sem" }, - { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, - { CTL_INT, KERN_SHMMNI, "shmmni" }, - - { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, - { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, - - { CTL_STR, KERN_HOTPLUG, "hotplug", }, - { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, - /* KERN_TAINTED "tainted" no longer used */ - { CTL_INT, KERN_CADPID, "cad_pid" }, - { CTL_INT, KERN_PIDMAX, "pid_max" }, - { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, - { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" }, - { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, - { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { CTL_INT, 
KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, - { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, - { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - /* KERN_HZ_TIMER "hz_timer" no longer used */ - { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, - - { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, - /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ - { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, - { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, - { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" }, - {} -}; - -static const struct bin_table bin_vm_table[] = { - { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, - { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, - /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ - /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ - { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ - { CTL_INT, VM_SWAPPINESS, "swappiness" }, - { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, - { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, - { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, - { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { 
CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, - { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, - { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, - { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct bin_table bin_net_core_table[] = { - { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, - { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, - { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, - { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, - { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, - { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, - { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, - { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { CTL_INT, NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct bin_table bin_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct bin_table bin_net_ipv4_route_table[] = { - { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, - /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ - /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - 
{ CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_vars_table[] = { - { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, - { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - - { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { CTL_INT, 
NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, - {} -}; - -static const struct bin_table bin_net_neigh_vars_table[] = { - { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, - /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ - { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ - /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ - /* NET_NEIGH_LOCKTIME "locktime" no longer used */ - { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct bin_table bin_net_neigh_table[] = { - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, - { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, - {} -}; - -static const struct bin_table bin_net_ipv4_netfilter_table[] = { - { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer 
used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ - - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ - /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED 
"ip_conntrack_sctp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct bin_table bin_net_ipv4_table[] = { - {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, - - { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, - { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, - - { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, - { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { CTL_INT, NET_IPV4_TCP_RETRIES2, 
"tcp_retries2" }, - { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, - { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, - { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { CTL_INT, NET_TCP_FACK, "tcp_fack" }, - { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, - { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, - { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, - { CTL_INT, NET_TCP_MEM, "tcp_mem" }, - { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, - { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, - { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, - { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, - { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, - { CTL_INT, 
NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ - { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - - { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - - { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ - - { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - /* NET_TCP_BIC_BETA unused */ - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - /* NET_IPV4_IP_MASQ_DEBUG unused */ - /* NET_TCP_SYN_TAILDROP unused */ - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - /* NET_IPV4_ALWAYS_DEFRAG unused */ - {} -}; - -static const struct bin_table bin_net_ipx_table[] = { 
- { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct bin_table bin_net_atalk_table[] = { - { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct bin_table bin_net_netrom_table[] = { - { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { CTL_INT, NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct bin_table bin_net_ax25_param_table[] = { - { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, - { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, - { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { CTL_INT, 
NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, - { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, - { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, - { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct bin_table bin_net_ax25_table[] = { - { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, - {} -}; - -static const struct bin_table bin_net_rose_table[] = { - { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, - { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_var_table[] = { - { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, - { CTL_INT, NET_IPV6_MTU, "mtu" }, - { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, - { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, - { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { 
CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, - { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, - {} -}; - -static const struct bin_table bin_net_ipv6_route_table[] = { - /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ - { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct bin_table bin_net_ipv6_icmp_table[] = { - { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, - {} -}; - -static const struct bin_table bin_net_ipv6_table[] = { - { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table }, - { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV6_ROUTE, "route", 
bin_net_ipv6_route_table }, - { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table }, - { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" }, - { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, - { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, - { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, - { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, - { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, - { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, - {} -}; - -static const struct bin_table bin_net_x25_table[] = { - { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, - { CTL_INT, NET_X25_FORWARD, "x25_forward" }, - {} -}; - -static const struct bin_table bin_net_tr_table[] = { - { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" }, - {} -}; - - -static const struct bin_table bin_net_decnet_conf_vars[] = { - { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, - { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" }, - { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" }, - { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" }, - {} -}; - -static const struct bin_table bin_net_decnet_conf[] = { - { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars }, - { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars }, - { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars }, - {} -}; - -static const struct bin_table 
bin_net_decnet_table[] = { - { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf }, - { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" }, - { CTL_STR, NET_DECNET_NODE_NAME, "node_name" }, - { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" }, - { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" }, - { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" }, - { CTL_INT, NET_DECNET_DI_COUNT, "di_count" }, - { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" }, - { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, - { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, - { CTL_INT, NET_DECNET_MEM, "decnet_mem" }, - { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" }, - { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" }, - { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" }, - {} -}; - -static const struct bin_table bin_net_sctp_table[] = { - { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" }, - { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" }, - { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" }, - { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, - { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, - { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, - { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, - { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, - { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, - { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" }, - { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, - { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" }, - { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" }, - { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, - { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, - { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, - { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, - {} -}; - -static const struct bin_table bin_net_llc_llc2_timeout_table[] = { - { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" }, - { CTL_INT, NET_LLC2_P_TIMEOUT, "p" }, - { CTL_INT, NET_LLC2_REJ_TIMEOUT, 
"rej" }, - { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" }, - {} -}; - -static const struct bin_table bin_net_llc_station_table[] = { - { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, - {} -}; - -static const struct bin_table bin_net_llc_llc2_table[] = { - { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table }, - {} -}; - -static const struct bin_table bin_net_llc_table[] = { - { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table }, - { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table }, - {} -}; - -static const struct bin_table bin_net_netfilter_table[] = { - { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, - /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */ - /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */ - /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */ - /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */ - /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, - { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, - /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no 
longer used */ - { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, - { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, - /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */ - /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */ - { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, - { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, - { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, - - {} -}; - -static const struct bin_table bin_net_table[] = { - { CTL_DIR, NET_CORE, "core", bin_net_core_table }, - /* NET_ETHER not used */ - /* NET_802 not used */ - { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table }, - { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table }, - { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table }, - { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table }, - { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table }, - { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table }, - /* NET_BRIDGE "bridge" no longer used */ - 
{ CTL_DIR, NET_ROSE, "rose", bin_net_rose_table }, - { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table }, - { CTL_DIR, NET_X25, "x25", bin_net_x25_table }, - { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table }, - { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table }, - /* NET_ECONET not used */ - { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table }, - { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, - { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, - /* NET_DCCP "dccp" no longer used */ - /* NET_IRDA "irda" no longer used */ - { CTL_INT, 2089, "nf_conntrack_max" }, - {} -}; - -static const struct bin_table bin_fs_quota_table[] = { - { CTL_INT, FS_DQ_LOOKUPS, "lookups" }, - { CTL_INT, FS_DQ_DROPS, "drops" }, - { CTL_INT, FS_DQ_READS, "reads" }, - { CTL_INT, FS_DQ_WRITES, "writes" }, - { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" }, - { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" }, - { CTL_INT, FS_DQ_FREE, "free_dquots" }, - { CTL_INT, FS_DQ_SYNCS, "syncs" }, - { CTL_INT, FS_DQ_WARNINGS, "warnings" }, - {} -}; - -static const struct bin_table bin_fs_xfs_table[] = { - { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" }, - { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" }, - { CTL_INT, XFS_PANIC_MASK, "panic_mask" }, - - { CTL_INT, XFS_ERRLEVEL, "error_level" }, - { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, - { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" }, - { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" }, - { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" }, - { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" }, - { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" }, - { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, - { CTL_INT, XFS_ROTORSTEP, "rotorstep" }, - { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" }, - { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" }, - { CTL_INT, XFS_STATS_CLEAR, "stats_clear" }, - {} -}; - -static const struct bin_table bin_fs_ocfs2_nm_table[] = { - { CTL_STR, 1, "hb_ctl_path" }, - {} -}; - -static const 
struct bin_table bin_fs_ocfs2_table[] = { - { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table }, - {} -}; - -static const struct bin_table bin_inotify_table[] = { - { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, - { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, - { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, - {} -}; - -static const struct bin_table bin_fs_table[] = { - { CTL_INT, FS_NRINODE, "inode-nr" }, - { CTL_INT, FS_STATINODE, "inode-state" }, - /* FS_MAXINODE unused */ - /* FS_NRDQUOT unused */ - /* FS_MAXDQUOT unused */ - /* FS_NRFILE "file-nr" no longer used */ - { CTL_INT, FS_MAXFILE, "file-max" }, - { CTL_INT, FS_DENTRY, "dentry-state" }, - /* FS_NRSUPER unused */ - /* FS_MAXUPSER unused */ - { CTL_INT, FS_OVERFLOWUID, "overflowuid" }, - { CTL_INT, FS_OVERFLOWGID, "overflowgid" }, - { CTL_INT, FS_LEASES, "leases-enable" }, - { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" }, - { CTL_INT, FS_LEASE_TIME, "lease-break-time" }, - { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table }, - { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table }, - { CTL_ULONG, FS_AIO_NR, "aio-nr" }, - { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" }, - { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table }, - { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table }, - { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" }, - {} -}; - -static const struct bin_table bin_ipmi_table[] = { - { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, - {} -}; - -static const struct bin_table bin_mac_hid_files[] = { - /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ - /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, - { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, - /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ - {} -}; - -static const struct bin_table bin_raid_table[] = { - { CTL_INT, 
DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, - { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, - {} -}; - -static const struct bin_table bin_scsi_table[] = { - { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" }, - {} -}; - -static const struct bin_table bin_dev_table[] = { - /* DEV_CDROM "cdrom" no longer used */ - /* DEV_HWMON unused */ - /* DEV_PARPORT "parport" no longer used */ - { CTL_DIR, DEV_RAID, "raid", bin_raid_table }, - { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files }, - { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table }, - { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table }, - {} -}; - -static const struct bin_table bin_bus_isa_table[] = { - { CTL_INT, BUS_ISA_MEM_BASE, "membase" }, - { CTL_INT, BUS_ISA_PORT_BASE, "portbase" }, - { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" }, - {} -}; - -static const struct bin_table bin_bus_table[] = { - { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table }, - {} -}; - - -static const struct bin_table bin_s390dbf_table[] = { - { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, - { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, - {} -}; - -static const struct bin_table bin_sunrpc_table[] = { - /* CTL_RPCDEBUG "rpc_debug" no longer used */ - /* CTL_NFSDEBUG "nfs_debug" no longer used */ - /* CTL_NFSDDEBUG "nfsd_debug" no longer used */ - /* CTL_NLMDEBUG "nlm_debug" no longer used */ - - { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, - { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, - { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" }, - { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" }, - {} -}; - -static const struct bin_table bin_pm_table[] = { - /* frv specific */ - /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */ - { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" }, - { CTL_INT, 3 /* CTL_PM_P0 */, "p0" }, - { CTL_INT, 4 /* CTL_PM_CM */, "cm" }, - {} -}; - -static const struct bin_table bin_root_table[] = { - { CTL_DIR, CTL_KERN, "kernel", bin_kern_table }, - { CTL_DIR, CTL_VM, 
"vm", bin_vm_table }, - { CTL_DIR, CTL_NET, "net", bin_net_table }, - /* CTL_PROC not used */ - { CTL_DIR, CTL_FS, "fs", bin_fs_table }, - /* CTL_DEBUG "debug" no longer used */ - { CTL_DIR, CTL_DEV, "dev", bin_dev_table }, - { CTL_DIR, CTL_BUS, "bus", bin_bus_table }, - { CTL_DIR, CTL_ABI, "abi" }, - /* CTL_CPU not used */ - /* CTL_ARLAN "arlan" no longer used */ - { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table }, - { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table }, - { CTL_DIR, CTL_PM, "pm", bin_pm_table }, - {} -}; - -static ssize_t bin_dir(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOTDIR; -} - - -static ssize_t bin_string(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t result, copied = 0; - - if (oldval && oldlen) { - char __user *lastp; - loff_t pos = 0; - int ch; - - result = vfs_read(file, oldval, oldlen, &pos); - if (result < 0) - goto out; - - copied = result; - lastp = oldval + copied - 1; - - result = -EFAULT; - if (get_user(ch, lastp)) - goto out; - - /* Trim off the trailing newline */ - if (ch == '\n') { - result = -EFAULT; - if (put_user('\0', lastp)) - goto out; - copied -= 1; - } - } - - if (newval && newlen) { - loff_t pos = 0; - - result = vfs_write(file, newval, newlen, &pos); - if (result < 0) - goto out; - } - - result = copied; -out: - return result; -} - -static ssize_t bin_intvec(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t copied = 0; - char *buffer; - ssize_t result; - - result = -ENOMEM; - buffer = kmalloc(BUFSZ, GFP_KERNEL); - if (!buffer) - goto out; - - if (oldval && oldlen) { - unsigned __user *vec = oldval; - size_t length = oldlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - result = kernel_read(file, buffer, BUFSZ - 1, &pos); - if (result < 0) - goto out_kfree; - - str = buffer; - end = str + result; - *end++ = '\0'; - 
for (i = 0; i < length; i++) { - unsigned long value; - - value = simple_strtoul(str, &str, 10); - while (isspace(*str)) - str++; - - result = -EFAULT; - if (put_user(value, vec + i)) - goto out_kfree; - - copied += sizeof(*vec); - if (!isdigit(*str)) - break; - } - } - - if (newval && newlen) { - unsigned __user *vec = newval; - size_t length = newlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - str = buffer; - end = str + BUFSZ; - for (i = 0; i < length; i++) { - unsigned long value; - - result = -EFAULT; - if (get_user(value, vec + i)) - goto out_kfree; - - str += scnprintf(str, end - str, "%lu\t", value); - } - - result = kernel_write(file, buffer, str - buffer, &pos); - if (result < 0) - goto out_kfree; - } - result = copied; -out_kfree: - kfree(buffer); -out: - return result; -} - -static ssize_t bin_ulongvec(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t copied = 0; - char *buffer; - ssize_t result; - - result = -ENOMEM; - buffer = kmalloc(BUFSZ, GFP_KERNEL); - if (!buffer) - goto out; - - if (oldval && oldlen) { - unsigned long __user *vec = oldval; - size_t length = oldlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - result = kernel_read(file, buffer, BUFSZ - 1, &pos); - if (result < 0) - goto out_kfree; - - str = buffer; - end = str + result; - *end++ = '\0'; - for (i = 0; i < length; i++) { - unsigned long value; - - value = simple_strtoul(str, &str, 10); - while (isspace(*str)) - str++; - - result = -EFAULT; - if (put_user(value, vec + i)) - goto out_kfree; - - copied += sizeof(*vec); - if (!isdigit(*str)) - break; - } - } - - if (newval && newlen) { - unsigned long __user *vec = newval; - size_t length = newlen / sizeof(*vec); - char *str, *end; - int i; - loff_t pos = 0; - - str = buffer; - end = str + BUFSZ; - for (i = 0; i < length; i++) { - unsigned long value; - - result = -EFAULT; - if (get_user(value, vec + i)) - goto out_kfree; - - str += 
scnprintf(str, end - str, "%lu\t", value); - } - - result = kernel_write(file, buffer, str - buffer, &pos); - if (result < 0) - goto out_kfree; - } - result = copied; -out_kfree: - kfree(buffer); -out: - return result; -} - -static ssize_t bin_uuid(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t result, copied = 0; - - /* Only supports reads */ - if (oldval && oldlen) { - char buf[UUID_STRING_LEN + 1]; - uuid_t uuid; - loff_t pos = 0; - - result = kernel_read(file, buf, sizeof(buf) - 1, &pos); - if (result < 0) - goto out; - - buf[result] = '\0'; - - result = -EIO; - if (uuid_parse(buf, &uuid)) - goto out; - - if (oldlen > 16) - oldlen = 16; - - result = -EFAULT; - if (copy_to_user(oldval, &uuid, oldlen)) - goto out; - - copied = oldlen; - } - result = copied; -out: - return result; -} - -static ssize_t bin_dn_node_address(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - ssize_t result, copied = 0; - - if (oldval && oldlen) { - char buf[15], *nodep; - unsigned long area, node; - __le16 dnaddr; - loff_t pos = 0; - - result = kernel_read(file, buf, sizeof(buf) - 1, &pos); - if (result < 0) - goto out; - - buf[result] = '\0'; - - /* Convert the decnet address to binary */ - result = -EIO; - nodep = strchr(buf, '.'); - if (!nodep) - goto out; - ++nodep; - - area = simple_strtoul(buf, NULL, 10); - node = simple_strtoul(nodep, NULL, 10); - - result = -EIO; - if ((area > 63)||(node > 1023)) - goto out; - - dnaddr = cpu_to_le16((area << 10) | node); - - result = -EFAULT; - if (put_user(dnaddr, (__le16 __user *)oldval)) - goto out; - - copied = sizeof(dnaddr); - } - - if (newval && newlen) { - __le16 dnaddr; - char buf[15]; - int len; - loff_t pos = 0; - - result = -EINVAL; - if (newlen != sizeof(dnaddr)) - goto out; - - result = -EFAULT; - if (get_user(dnaddr, (__le16 __user *)newval)) - goto out; - - len = scnprintf(buf, sizeof(buf), "%hu.%hu", - le16_to_cpu(dnaddr) 
>> 10, - le16_to_cpu(dnaddr) & 0x3ff); - - result = kernel_write(file, buf, len, &pos); - if (result < 0) - goto out; - } - - result = copied; -out: - return result; -} - -static const struct bin_table *get_sysctl(const int *name, int nlen, char *path) -{ - const struct bin_table *table = &bin_root_table[0]; - int ctl_name; - - /* The binary sysctl tables have a small maximum depth so - * there is no danger of overflowing our path as it PATH_MAX - * bytes long. - */ - memcpy(path, "sys/", 4); - path += 4; - -repeat: - if (!nlen) - return ERR_PTR(-ENOTDIR); - ctl_name = *name; - name++; - nlen--; - for ( ; table->convert; table++) { - int len = 0; - - /* - * For a wild card entry map from ifindex to network - * device name. - */ - if (!table->ctl_name) { -#ifdef CONFIG_NET - struct net *net = current->nsproxy->net_ns; - struct net_device *dev; - dev = dev_get_by_index(net, ctl_name); - if (dev) { - len = strlen(dev->name); - memcpy(path, dev->name, len); - dev_put(dev); - } -#endif - /* Use the well known sysctl number to proc name mapping */ - } else if (ctl_name == table->ctl_name) { - len = strlen(table->procname); - memcpy(path, table->procname, len); - } - if (len) { - path += len; - if (table->child) { - *path++ = '/'; - table = table->child; - goto repeat; - } - *path = '\0'; - return table; - } - } - return ERR_PTR(-ENOTDIR); -} - -static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep) -{ - char *tmp, *result; - - result = ERR_PTR(-ENOMEM); - tmp = __getname(); - if (tmp) { - const struct bin_table *table = get_sysctl(name, nlen, tmp); - result = tmp; - *tablep = table; - if (IS_ERR(table)) { - __putname(tmp); - result = ERR_CAST(table); - } - } - return result; -} - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - const struct bin_table *table = NULL; - struct vfsmount *mnt; - struct file *file; - ssize_t result; - char *pathname; - int 
flags; - - pathname = sysctl_getname(name, nlen, &table); - result = PTR_ERR(pathname); - if (IS_ERR(pathname)) - goto out; - - /* How should the sysctl be accessed? */ - if (oldval && oldlen && newval && newlen) { - flags = O_RDWR; - } else if (newval && newlen) { - flags = O_WRONLY; - } else if (oldval && oldlen) { - flags = O_RDONLY; - } else { - result = 0; - goto out_putname; - } - - mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); - result = PTR_ERR(file); - if (IS_ERR(file)) - goto out_putname; - - result = table->convert(file, oldval, oldlen, newval, newlen); - - fput(file); -out_putname: - __putname(pathname); -out: - return result; -} - - -#else /* CONFIG_SYSCTL_SYSCALL */ - static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { return -ENOSYS; } -#endif /* CONFIG_SYSCTL_SYSCALL */ - - static void deprecated_sysctl_warning(const int *name, int nlen) { int i; -- cgit v1.2.3-59-g8ed1b From d0f010434124598988ba1c97fbb0e4e820ff5d8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 26 Nov 2019 15:01:06 -0800 Subject: bpf: Fix static checker warning kernel/bpf/btf.c:4023 btf_distill_func_proto() error: potentially dereferencing uninitialized 't'. kernel/bpf/btf.c 4012 nargs = btf_type_vlen(func); 4013 if (nargs >= MAX_BPF_FUNC_ARGS) { 4014 bpf_log(log, 4015 "The function %s has %d arguments. Too many.\n", 4016 tname, nargs); 4017 return -EINVAL; 4018 } 4019 ret = __get_type_size(btf, func->type, &t); ^^ t isn't initialized for the first -EINVAL return This is unlikely path, since BTF should have been validated at this point. Fix it by returning 'void' BTF. 
Reported-by: Dan Carpenter Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191126230106.237179-1-ast@kernel.org --- kernel/bpf/btf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 40efde5eedcb..bd5e11881ba3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3976,8 +3976,10 @@ static int __get_type_size(struct btf *btf, u32 btf_id, t = btf_type_by_id(btf, btf_id); while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!t) + if (!t) { + *bad_type = btf->types[0]; return -EINVAL; + } if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); -- cgit v1.2.3-59-g8ed1b From ce27709b8162e5c501bc54292b8bf6bdecc4bbd4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 Nov 2019 20:35:08 -0800 Subject: bpf: Fix build in minimal configurations Some kconfigs can have BPF enabled without a single valid program type. In such configurations the build will fail with: ./kernel/bpf/btf.c:3466:1: error: empty enum is invalid Fix it by adding unused value to the enum. 
Reported-by: Randy Dunlap Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/bpf/20191128043508.2346723-1-ast@kernel.org --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd5e11881ba3..7d40da240891 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3463,6 +3463,7 @@ enum { __ctx_convert##_id, #include #undef BPF_PROG_TYPE + __ctx_convert_unused, /* to avoid empty enum in extreme .config */ }; static u8 bpf_ctx_convert_map[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ -- cgit v1.2.3-59-g8ed1b From 36a8015f89e40f7c9c91cc7e6d028fa288dad27b Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Tue, 26 Nov 2019 17:17:13 +0200 Subject: PM / QoS: Restore DEV_PM_QOS_MIN/MAX_FREQUENCY Support for adding per-device frequency limits was removed in commit 2aac8bdf7a0f ("PM: QoS: Drop frequency QoS types from device PM QoS") after cpufreq switched to use a new "freq_constraints" construct. Restore support for per-device freq limits but base this upon freq_constraints. This is primarily meant to be used by the devfreq subsystem. This removes the "static" marking on freq_qos_apply but does not export it for modules. Signed-off-by: Leonard Crestez Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/qos.c | 73 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/pm_qos.h | 12 ++++++++ kernel/power/qos.c | 4 ++- 3 files changed, 82 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index 350dcafd751f..8e93167f1783 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -115,10 +115,20 @@ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) spin_lock_irqsave(&dev->power.lock, flags); - if (type == DEV_PM_QOS_RESUME_LATENCY) { + switch (type) { + case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); - } else { + break; + case DEV_PM_QOS_MIN_FREQUENCY: + ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE + : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); + break; + case DEV_PM_QOS_MAX_FREQUENCY: + ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE + : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); + break; + default: WARN_ON(1); ret = 0; } @@ -159,6 +169,10 @@ static int apply_constraint(struct dev_pm_qos_request *req, req->dev->power.set_latency_tolerance(req->dev, value); } break; + case DEV_PM_QOS_MIN_FREQUENCY: + case DEV_PM_QOS_MAX_FREQUENCY: + ret = freq_qos_apply(&req->data.freq, action, value); + break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); @@ -209,6 +223,8 @@ static int dev_pm_qos_constraints_allocate(struct device *dev) c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; + freq_constraints_init(&qos->freq); + INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); @@ -269,6 +285,20 @@ void dev_pm_qos_constraints_destroy(struct device *dev) memset(req, 0, sizeof(*req)); } + c = &qos->freq.min_freq; + plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { + apply_constraint(req, 
PM_QOS_REMOVE_REQ, + PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); + } + + c = &qos->freq.max_freq; + plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { + apply_constraint(req, PM_QOS_REMOVE_REQ, + PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); + } + f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); @@ -314,11 +344,22 @@ static int __dev_pm_qos_add_request(struct device *dev, ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); - if (!ret) { - req->dev = dev; - req->type = type; + if (ret) + return ret; + + req->dev = dev; + req->type = type; + if (req->type == DEV_PM_QOS_MIN_FREQUENCY) + ret = freq_qos_add_request(&dev->power.qos->freq, + &req->data.freq, + FREQ_QOS_MIN, value); + else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) + ret = freq_qos_add_request(&dev->power.qos->freq, + &req->data.freq, + FREQ_QOS_MAX, value); + else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); - } + return ret; } @@ -382,6 +423,10 @@ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; + case DEV_PM_QOS_MIN_FREQUENCY: + case DEV_PM_QOS_MAX_FREQUENCY: + curr_value = req->data.freq.pnode.prio; + break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; @@ -507,6 +552,14 @@ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; + case DEV_PM_QOS_MIN_FREQUENCY: + ret = freq_qos_add_notifier(&dev->power.qos->freq, + FREQ_QOS_MIN, notifier); + break; + case DEV_PM_QOS_MAX_FREQUENCY: + ret = freq_qos_add_notifier(&dev->power.qos->freq, + FREQ_QOS_MAX, notifier); + break; default: WARN_ON(1); ret = -EINVAL; @@ -546,6 +599,14 @@ int 
dev_pm_qos_remove_notifier(struct device *dev, ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; + case DEV_PM_QOS_MIN_FREQUENCY: + ret = freq_qos_remove_notifier(&dev->power.qos->freq, + FREQ_QOS_MIN, notifier); + break; + case DEV_PM_QOS_MAX_FREQUENCY: + ret = freq_qos_remove_notifier(&dev->power.qos->freq, + FREQ_QOS_MAX, notifier); + break; default: WARN_ON(1); ret = -EINVAL; diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 678fec6da5b9..19eafca5680e 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -34,6 +34,8 @@ enum pm_qos_flags_status { #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT PM_QOS_LATENCY_ANY #define PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS PM_QOS_LATENCY_ANY_NS #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 +#define PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE 0 +#define PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE FREQ_QOS_MAX_DEFAULT_VALUE #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) @@ -101,6 +103,8 @@ struct freq_qos_request { enum dev_pm_qos_req_type { DEV_PM_QOS_RESUME_LATENCY = 1, DEV_PM_QOS_LATENCY_TOLERANCE, + DEV_PM_QOS_MIN_FREQUENCY, + DEV_PM_QOS_MAX_FREQUENCY, DEV_PM_QOS_FLAGS, }; @@ -109,6 +113,7 @@ struct dev_pm_qos_request { union { struct plist_node pnode; struct pm_qos_flags_request flr; + struct freq_qos_request freq; } data; struct device *dev; }; @@ -116,6 +121,7 @@ struct dev_pm_qos_request { struct dev_pm_qos { struct pm_qos_constraints resume_latency; struct pm_qos_constraints latency_tolerance; + struct freq_constraints freq; struct pm_qos_flags flags; struct dev_pm_qos_request *resume_latency_req; struct dev_pm_qos_request *latency_tolerance_req; @@ -214,6 +220,10 @@ static inline s32 dev_pm_qos_read_value(struct device *dev, switch (type) { case DEV_PM_QOS_RESUME_LATENCY: return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; + case DEV_PM_QOS_MIN_FREQUENCY: + return PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE; + case 
DEV_PM_QOS_MAX_FREQUENCY: + return PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE; default: WARN_ON(1); return 0; @@ -293,6 +303,8 @@ int freq_qos_add_request(struct freq_constraints *qos, enum freq_qos_req_type type, s32 value); int freq_qos_update_request(struct freq_qos_request *req, s32 new_value); int freq_qos_remove_request(struct freq_qos_request *req); +int freq_qos_apply(struct freq_qos_request *req, + enum pm_qos_req_action action, s32 value); int freq_qos_add_notifier(struct freq_constraints *qos, enum freq_qos_req_type type, diff --git a/kernel/power/qos.c b/kernel/power/qos.c index a45cba7df0ae..83edf8698118 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -714,8 +714,10 @@ s32 freq_qos_read_value(struct freq_constraints *qos, * @req: Constraint request to apply. * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. + * + * This is only meant to be called from inside pm_qos, not drivers. */ -static int freq_qos_apply(struct freq_qos_request *req, +int freq_qos_apply(struct freq_qos_request *req, enum pm_qos_req_action action, s32 value) { int ret; -- cgit v1.2.3-59-g8ed1b From ff68dac6d65cd1347dad5d780dd8c90f29dc1b0b Mon Sep 17 00:00:00 2001 From: Gaowei Pu Date: Sat, 30 Nov 2019 17:51:03 -0800 Subject: mm/mmap.c: use IS_ERR_VALUE to check return value of get_unmapped_area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_unmapped_area() returns an address or -errno on failure. Historically we have checked for the failure by offset_in_page() which is correct but quite hard to read. Newer code started using IS_ERR_VALUE which is much easier to read. Convert remaining users of offset_in_page as well. 
[mhocko@suse.com: rewrite changelog] [mhocko@kernel.org: fix mremap.c and uprobes.c sites also] Link: http://lkml.kernel.org/r/20191012102512.28051-1-pugaowei@gmail.com Signed-off-by: Gaowei Pu Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Wei Yang Cc: Konstantin Khlebnikov Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Cc: Mike Kravetz Cc: Rik van Riel Cc: Qian Cai Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- mm/mmap.c | 9 +++++---- mm/mremap.c | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c74761004ee5..ece7e13f6e4a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { + if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } diff --git a/mm/mmap.c b/mm/mmap.c index 311b08f780ce..b9d0c2f3f6bf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1417,7 +1417,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (offset_in_page(addr)) + if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { @@ -2981,15 +2981,16 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; + unsigned long mapped_addr; /* Until we need other flags, refuse anything except VM_EXEC. 
*/ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (offset_in_page(error)) - return error; + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; error = mlock_future_check(mm, mm->def_flags, len); if (error) diff --git a/mm/mremap.c b/mm/mremap.c index 1fc8a29fbe3f..122938dcec15 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -558,7 +558,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (offset_in_page(ret)) + if (IS_ERR_VALUE(ret)) goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, @@ -706,7 +706,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (offset_in_page(new_addr)) { + if (IS_ERR_VALUE(new_addr)) { ret = new_addr; goto out; } -- cgit v1.2.3-59-g8ed1b From eafb149ed73a8bb8359c0ce027b98acd4e95b070 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:57 -0800 Subject: fork: support VMAP_STACK with KASAN_VMALLOC Supporting VMAP_STACK with KASAN_VMALLOC is straightforward: - clear the shadow region of vmapped stacks when swapping them in - tweak Kconfig to allow VMAP_STACK to be turned on with KASAN Link: http://lkml.kernel.org/r/20191031093909.9228-4-dja@axtens.net Signed-off-by: Daniel Axtens Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Christophe Leroy Cc: Mark Rutland Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 9 +++++---- kernel/fork.c | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 
17c42bc36321..ec07f9ba1152 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -843,16 +843,17 @@ config HAVE_ARCH_VMAP_STACK config VMAP_STACK default y bool "Use a virtually-mapped stack" - depends on HAVE_ARCH_VMAP_STACK && !KASAN + depends on HAVE_ARCH_VMAP_STACK + depends on !KASAN || KASAN_VMALLOC ---help--- Enable this if you want the use virtually-mapped kernel stacks with guard pages. This causes kernel stack overflows to be caught immediately rather than causing difficult-to-diagnose corruption. - This is presently incompatible with KASAN because KASAN expects - the stack to map directly to the KASAN shadow map using a formula - that is incorrect if the stack is in vmalloc space. + To use this with KASAN, the architecture must support backing + virtual mappings with real shadow memory, and KASAN_VMALLOC must + be enabled. config ARCH_OPTIONAL_KERNEL_RWX def_bool n diff --git a/kernel/fork.c b/kernel/fork.c index 0f0bac8318dd..21c6c1e29b98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include @@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; + /* Clear the KASAN shadow of the stack. */ + kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -- cgit v1.2.3-59-g8ed1b From 204cb79ad42f015312a5bbd7012d09c93d9b46fb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:56:08 -0800 Subject: kernel: sysctl: make drop_caches write-only Currently, the drop_caches proc file and sysctl read back the last value written, suggesting this is somehow a stateful setting instead of a one-time command. Make it write-only, like e.g. compact_memory. While mitigating a VM problem at scale in our fleet, there was confusion about whether writing to this file will permanently switch the kernel into a non-caching mode. 
This influences the decision making in a tense situation, where tens of people are trying to fix tens of thousands of affected machines: Do we need a rollback strategy? What are the performance implications of operating in a non-caching state for several days? It also caused confusion when the kernel team said we may need to write the file several times to make sure it's effective ("But it already reads back 3?"). Link: http://lkml.kernel.org/r/20191031221602.9375-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Chris Down Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b6f2f35d0bcf..70665934d53e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, .extra2 = &four, -- cgit v1.2.3-59-g8ed1b From 6c3edaf9fd6a3be7fb5bc6931897c24cd3848f84 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 29 Nov 2019 20:52:18 -0800 Subject: tracing: Introduce trace event injection We have been trying to use rasdaemon to monitor hardware errors like correctable memory errors. rasdaemon uses trace events to monitor various hardware errors. In order to test it, we have to inject some hardware errors, unfortunately not all of them provide error injections. MCE does provide a way to inject MCE errors, but errors like PCI error and devlink error don't, it is not easy to add error injection to each of them. Instead, it is relatively easier to just allow users to inject trace events in a generic way so that all trace events can be injected. 
This patch introduces trace event injection, where a new 'inject' is added to each tracepoint directory. Users could write into this file with key=value pairs to specify the value of each fields of the trace event, all unspecified fields are set to zero values by default. For example, for the net/net_dev_queue tracepoint, we can inject: INJECT=/sys/kernel/debug/tracing/events/net/net_dev_queue/inject echo "" > $INJECT echo "name='test'" > $INJECT echo "name='test' len=1024" > $INJECT cat /sys/kernel/debug/tracing/trace ... <...>-614 [000] .... 36.571483: net_dev_queue: dev= skbaddr=00000000fbf338c2 len=0 <...>-614 [001] .... 136.588252: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=0 <...>-614 [001] .N.. 208.431878: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=1024 Triggers could be triggered as usual too: echo "stacktrace if len == 1025" > /sys/kernel/debug/tracing/events/net/net_dev_queue/trigger echo "len=1025" > $INJECT cat /sys/kernel/debug/tracing/trace ... bash-614 [000] .... 36.571483: net_dev_queue: dev= skbaddr=00000000fbf338c2 len=0 bash-614 [001] .... 136.588252: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=0 bash-614 [001] .N.. 208.431878: net_dev_queue: dev=test skbaddr=00000000fbf338c2 len=1024 bash-614 [001] .N.1 284.236349: => event_inject_write => vfs_write => ksys_write => do_syscall_64 => entry_SYSCALL_64_after_hwframe The only thing that can't be injected is string pointers as they require constant string pointers, this can't be done at run time. 
Link: http://lkml.kernel.org/r/20191130045218.18979-1-xiyou.wangcong@gmail.com Cc: Ingo Molnar Signed-off-by: Cong Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 9 + kernel/trace/Makefile | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 6 + kernel/trace/trace_events_inject.c | 331 +++++++++++++++++++++++++++++++++++++ 5 files changed, 348 insertions(+) create mode 100644 kernel/trace/trace_events_inject.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f67620499faa..29a9c5058b62 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -672,6 +672,15 @@ config HIST_TRIGGERS See Documentation/trace/histogram.rst. If in doubt, say N. +config TRACE_EVENT_INJECT + bool "Trace event injection" + depends on TRACING + help + Allow user-space to inject a specific trace event into the ring + buffer. This is mainly used for testing purpose. + + If unsure, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c2b2148bb1d2..0e63db62225f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ca7fccafbcbb..63bf60f79398 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1601,6 +1601,7 @@ extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; +extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS extern int 
register_trigger_hist_cmd(void); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6b3a69e9aa6a..c6de3cebc127 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2044,6 +2044,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); +#ifdef CONFIG_TRACE_EVENT_INJECT + if (call->event.type && call->class->reg) + trace_create_file("inject", 0200, file->dir, file, + &event_inject_fops); +#endif + return 0; } diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c new file mode 100644 index 000000000000..d43710718ee5 --- /dev/null +++ b/kernel/trace/trace_events_inject.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_inject - trace event injection + * + * Copyright (C) 2019 Cong Wang + */ + +#include +#include +#include +#include +#include + +#include "trace.h" + +static int +trace_inject_entry(struct trace_event_file *file, void *rec, int len) +{ + struct trace_event_buffer fbuffer; + struct ring_buffer *buffer; + int written = 0; + void *entry; + + rcu_read_lock_sched(); + buffer = file->tr->trace_buffer.buffer; + entry = trace_event_buffer_reserve(&fbuffer, file, len); + if (entry) { + memcpy(entry, rec, len); + written = len; + trace_event_buffer_commit(&fbuffer); + } + rcu_read_unlock_sched(); + + return written; +} + +static int +parse_field(char *str, struct trace_event_call *call, + struct ftrace_event_field **pf, u64 *pv) +{ + struct ftrace_event_field *field; + char *field_name; + int s, i = 0; + int len; + u64 val; + + if (!str[i]) + return 0; + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; + while (isalnum(str[i]) || str[i] == '_') + i++; + len = i - s; + if (!len) + return -EINVAL; + + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; + field = trace_find_event_field(call, 
field_name); + kfree(field_name); + if (!field) + return -ENOENT; + + *pf = field; + while (isspace(str[i])) + i++; + if (str[i] != '=') + return -EINVAL; + i++; + while (isspace(str[i])) + i++; + s = i; + if (isdigit(str[i]) || str[i] == '-') { + char *num, c; + int ret; + + /* Make sure the field is not a string */ + if (is_string_field(field)) + return -EINVAL; + + if (str[i] == '-') + i++; + + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + num = str + s; + c = str[i]; + if (c != '\0' && !isspace(c)) + return -EINVAL; + str[i] = '\0'; + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num, 0, &val); + else + ret = kstrtoull(num, 0, &val); + str[i] = c; + if (ret) + return ret; + + *pv = val; + return i; + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) + return -EINVAL; + + for (i++; str[i]; i++) { + if (str[i] == '\\' && str[i + 1]) { + i++; + continue; + } + if (str[i] == q) + break; + } + if (!str[i]) + return -EINVAL; + + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) + return -EINVAL; + + *pv = (unsigned long)(str + s); + str[i] = 0; + /* go past the last quote */ + i++; + return i; + } + + return -EINVAL; +} + +static int trace_get_entry_size(struct trace_event_call *call) +{ + struct ftrace_event_field *field; + struct list_head *head; + int size = 0; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (field->size + field->offset > size) + size = field->size + field->offset; + } + + return size; +} + +static void *trace_alloc_entry(struct trace_event_call *call, int *size) +{ + int entry_size = trace_get_entry_size(call); + struct ftrace_event_field *field; + struct list_head *head; + void *entry = NULL; + + /* We need an extra '\0' at the end. 
*/ + entry = kzalloc(entry_size + 1, GFP_KERNEL); + if (!entry) + return NULL; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (!is_string_field(field)) + continue; + if (field->filter_type == FILTER_STATIC_STRING) + continue; + if (field->filter_type == FILTER_DYN_STRING) { + u32 *str_item; + int str_loc = entry_size & 0xffff; + + str_item = (u32 *)(entry + field->offset); + *str_item = str_loc; /* string length is 0. */ + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = ""; + } + } + + *size = entry_size + 1; + return entry; +} + +#define INJECT_STRING "STATIC STRING CAN NOT BE INJECTED" + +/* Caller is responsible to free the *pentry. */ +static int parse_entry(char *str, struct trace_event_call *call, void **pentry) +{ + struct ftrace_event_field *field; + unsigned long irq_flags; + void *entry = NULL; + int entry_size; + u64 val; + int len; + + entry = trace_alloc_entry(call, &entry_size); + *pentry = entry; + if (!entry) + return -ENOMEM; + + local_save_flags(irq_flags); + tracing_generic_entry_update(entry, call->event.type, irq_flags, + preempt_count()); + + while ((len = parse_field(str, call, &field, &val)) > 0) { + if (is_function_field(field)) + return -EINVAL; + + if (is_string_field(field)) { + char *addr = (char *)(unsigned long) val; + + if (field->filter_type == FILTER_STATIC_STRING) { + strlcpy(entry + field->offset, addr, field->size); + } else if (field->filter_type == FILTER_DYN_STRING) { + int str_len = strlen(addr) + 1; + int str_loc = entry_size & 0xffff; + u32 *str_item; + + entry_size += str_len; + *pentry = krealloc(entry, entry_size, GFP_KERNEL); + if (!*pentry) { + kfree(entry); + return -ENOMEM; + } + entry = *pentry; + + strlcpy(entry + (entry_size - str_len), addr, str_len); + str_item = (u32 *)(entry + field->offset); + *str_item = (str_len << 16) | str_loc; + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = INJECT_STRING; + } + } 
else { + switch (field->size) { + case 1: { + u8 tmp = (u8) val; + + memcpy(entry + field->offset, &tmp, 1); + break; + } + case 2: { + u16 tmp = (u16) val; + + memcpy(entry + field->offset, &tmp, 2); + break; + } + case 4: { + u32 tmp = (u32) val; + + memcpy(entry + field->offset, &tmp, 4); + break; + } + case 8: + memcpy(entry + field->offset, &val, 8); + break; + default: + return -EINVAL; + } + } + + str += len; + } + + if (len < 0) + return len; + + return entry_size; +} + +static ssize_t +event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_call *call; + struct trace_event_file *file; + int err = -ENODEV, size; + void *entry = NULL; + char *buf; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); + strim(buf); + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) { + call = file->event_call; + size = parse_entry(buf, call, &entry); + if (size < 0) + err = size; + else + err = trace_inject_entry(file, entry, size); + } + mutex_unlock(&event_mutex); + + kfree(entry); + kfree(buf); + + if (err < 0) + return err; + + *ppos += err; + return cnt; +} + +static ssize_t +event_inject_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + return -EPERM; +} + +const struct file_operations event_inject_fops = { + .open = tracing_open_generic, + .read = event_inject_read, + .write = event_inject_write, +}; -- cgit v1.2.3-59-g8ed1b From a356646a56857c2e5ad875beec734d7145ecd49a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Dec 2019 16:25:27 -0500 Subject: tracing: Do not create directories if lockdown is in affect If lockdown is disabling tracing on boot up, it prevents the tracing files from even being created. But when that happens, there's several places that will give a warning that the files were not created as that is usually a sign of a bug.
Add in strategic locations where a check is made to see if tracing is disabled by lockdown, and if it is, do not go further, and fail silently (but print that tracing is disabled by lockdown, without doing a WARN_ON()). Cc: Matthew Garrett Fixes: 17911ff38aa5 ("tracing: Add locked_down checks to the open calls of files created for tracefs") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 ++++++ kernel/trace/trace.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 66358d66c933..4bf050fcfe3b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include /* for self test */ @@ -5068,6 +5069,11 @@ static __init int test_ringbuffer(void) int cpu; int ret = 0; + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Lockdown is enabled, skipping ring buffer tests\n"); + return 0; + } + pr_info("Running ring buffer tests...\n"); buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02a23a6e5e00..23459d53d576 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1888,6 +1888,12 @@ int __init register_tracer(struct tracer *type) return -1; } + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Can not register tracer %s due to lockdown\n", + type->name); + return -EPERM; + } + mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -8789,6 +8795,11 @@ struct dentry *tracing_init_dentry(void) { struct trace_array *tr = &global_trace; + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Tracing disabled due to lockdown\n"); + return ERR_PTR(-EPERM); + } + /* The top level trace array uses NULL as parent */ if (tr->dir) return NULL; @@ -9231,6 +9242,12 @@ __init static int tracer_alloc_buffers(void) int ring_buf_size; int ret = -ENOMEM; + + 
if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warning("Tracing disabled due to lockdown\n"); + return -EPERM; + } + /* * Make sure we don't accidently add more trace options * than we have bits for. -- cgit v1.2.3-59-g8ed1b From 1a50cb80f219c44adb6265f5071b81fc3c1deced Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 4 Dec 2019 16:50:39 -0800 Subject: kernel/notifier.c: intercept duplicate registrations to avoid infinite loops Registering the same notifier to a hook repeatedly can cause the hook list to form a ring or lose other members of the list. case1: An infinite loop in notifier_chain_register() can cause soft lockup atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test2); case2: An infinite loop in notifier_chain_register() can cause soft lockup atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_call_chain(&test_notifier_list, 0, NULL); case3: lose other hook test2 atomic_notifier_chain_register(&test_notifier_list, &test1); atomic_notifier_chain_register(&test_notifier_list, &test2); atomic_notifier_chain_register(&test_notifier_list, &test1); case4: Unregister returns 0, but the hook is still in the linked list, and it is not really registered. If you call notifier_call_chain after ko is unloaded, it will trigger oops. If the system is configured with softlockup_panic and the same hook is repeatedly registered on the panic_notifier_list, it will cause a loop panic. Add a check in notifier_chain_register(), intercepting duplicate registrations to avoid infinite loops Link: http://lkml.kernel.org/r/1568861888-34045-2-git-send-email-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Vasily Averin Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Cc: Anna Schumaker Cc: Arjan van de Ven Cc: J. Bruce Fields Cc: Chuck Lever Cc: David S. 
Miller Cc: Jeff Layton Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Nadia Derbey Cc: "Paul E. McKenney" Cc: Sam Protsenko Cc: Alan Stern Cc: Thomas Gleixner Cc: Trond Myklebust Cc: Viresh Kumar Cc: Xiaoming Ni Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index d9f5081d578d..30bedb8be6dd 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -23,7 +23,10 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { - WARN_ONCE(((*nl) == n), "double register detected"); + if (unlikely((*nl) == n)) { + WARN(1, "double register detected"); + return 0; + } if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); -- cgit v1.2.3-59-g8ed1b From 5adaabb65a267d890b29193af2dbc38a3b85bbf2 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 4 Dec 2019 16:50:43 -0800 Subject: kernel/notifier.c: remove notifier_chain_cond_register() The only difference between notifier_chain_cond_register() and notifier_chain_register() is the lack of warning hints for duplicate registrations. Use notifier_chain_register() instead of notifier_chain_cond_register() to avoid duplicate code Link: http://lkml.kernel.org/r/1568861888-34045-3-git-send-email-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Andrew Morton Cc: Alan Stern Cc: Alexey Dobriyan Cc: Andy Lutomirski Cc: Anna Schumaker Cc: Arjan van de Ven Cc: Chuck Lever Cc: David S. Miller Cc: Ingo Molnar Cc: J. Bruce Fields Cc: Jeff Layton Cc: Nadia Derbey Cc: "Paul E. 
McKenney" Cc: Sam Protsenko Cc: Thomas Gleixner Cc: Trond Myklebust Cc: Vasily Averin Cc: Viresh Kumar Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 30bedb8be6dd..e3d221f092fe 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -36,21 +36,6 @@ static int notifier_chain_register(struct notifier_block **nl, return 0; } -static int notifier_chain_cond_register(struct notifier_block **nl, - struct notifier_block *n) -{ - while ((*nl) != NULL) { - if ((*nl) == n) - return 0; - if (n->priority > (*nl)->priority) - break; - nl = &((*nl)->next); - } - n->next = *nl; - rcu_assign_pointer(*nl, n); - return 0; -} - static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { @@ -252,7 +237,7 @@ int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, int ret; down_write(&nh->rwsem); - ret = notifier_chain_cond_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n); up_write(&nh->rwsem); return ret; } -- cgit v1.2.3-59-g8ed1b From 260a2679e5cbfb3d8a4cf6cd1cb6f57e89c7e543 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 4 Dec 2019 16:50:47 -0800 Subject: kernel/notifier.c: remove blocking_notifier_chain_cond_register() blocking_notifier_chain_cond_register() does not consider system_booting state, which is the only difference between this function and blocking_notifier_chain_register(). This can be a bug and is a piece of duplicate code. Delete blocking_notifier_chain_cond_register() Link: http://lkml.kernel.org/r/1568861888-34045-4-git-send-email-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Andrew Morton Cc: Alan Stern Cc: Alexey Dobriyan Cc: Andy Lutomirski Cc: Anna Schumaker Cc: Arjan van de Ven Cc: Chuck Lever Cc: David S. Miller Cc: Ingo Molnar Cc: J.
Bruce Fields Cc: Jeff Layton Cc: Nadia Derbey Cc: "Paul E. McKenney" Cc: Sam Protsenko Cc: Thomas Gleixner Cc: Trond Myklebust Cc: Vasily Averin Cc: Viresh Kumar Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/notifier.h | 4 ---- kernel/notifier.c | 23 ----------------------- net/sunrpc/rpc_pipe.c | 2 +- 3 files changed, 1 insertion(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 0096a05395e3..018947611483 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -150,10 +150,6 @@ extern int raw_notifier_chain_register(struct raw_notifier_head *nh, extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *nb); -extern int blocking_notifier_chain_cond_register( - struct blocking_notifier_head *nh, - struct notifier_block *nb); - extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, diff --git a/kernel/notifier.c b/kernel/notifier.c index e3d221f092fe..63d7501ac638 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -220,29 +220,6 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); -/** - * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain, only if not already - * present in the chain. - * Must be called in process context. - * - * Currently always returns zero. 
- */ -int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, - struct notifier_block *n) -{ - int ret; - - down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); - up_write(&nh->rwsem); - return ret; -} -EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); - /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index b71a39ded930..39e14d5edaf1 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -51,7 +51,7 @@ static BLOCKING_NOTIFIER_HEAD(rpc_pipefs_notifier_list); int rpc_pipefs_notifier_register(struct notifier_block *nb) { - return blocking_notifier_chain_cond_register(&rpc_pipefs_notifier_list, nb); + return blocking_notifier_chain_register(&rpc_pipefs_notifier_list, nb); } EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_register); -- cgit v1.2.3-59-g8ed1b From ef70eff9dea66f38f8c2c2dcc7fe4b7a2bbb4921 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 4 Dec 2019 16:50:50 -0800 Subject: kernel/profile.c: use cpumask_available to check for NULL cpumask When building with clang + -Wtautological-pointer-compare, these instances pop up: kernel/profile.c:339:6: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare] if (prof_cpu_mask != NULL) ^~~~~~~~~~~~~ ~~~~ kernel/profile.c:376:6: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare] if (prof_cpu_mask != NULL) ^~~~~~~~~~~~~ ~~~~ kernel/profile.c:406:26: warning: comparison of array 'prof_cpu_mask' not equal to a null pointer is always true [-Wtautological-pointer-compare] if (!user_mode(regs) && prof_cpu_mask != NULL && ^~~~~~~~~~~~~ ~~~~ 3 warnings generated. 
This can be addressed with the cpumask_available helper, introduced in commit f7e30f01a9e2 ("cpumask: Add helper cpumask_available()") to fix warnings like this while keeping the code the same. Link: https://github.com/ClangBuiltLinux/linux/issues/747 Link: http://lkml.kernel.org/r/20191022191957.9554-1-natechancellor@gmail.com Signed-off-by: Nathan Chancellor Reviewed-by: Andrew Morton Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index af7c94bf5fa1..4b144b02ca5d 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -336,7 +336,7 @@ static int profile_dead_cpu(unsigned int cpu) struct page *page; int i; - if (prof_cpu_mask != NULL) + if (cpumask_available(prof_cpu_mask)) cpumask_clear_cpu(cpu, prof_cpu_mask); for (i = 0; i < 2; i++) { @@ -373,7 +373,7 @@ static int profile_prepare_cpu(unsigned int cpu) static int profile_online_cpu(unsigned int cpu) { - if (prof_cpu_mask != NULL) + if (cpumask_available(prof_cpu_mask)) cpumask_set_cpu(cpu, prof_cpu_mask); return 0; @@ -403,7 +403,7 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (!user_mode(regs) && prof_cpu_mask != NULL && + if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); } -- cgit v1.2.3-59-g8ed1b From 5e1aada08cd19ea652b2d32a250501d09b02ff2e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Dec 2019 16:50:53 -0800 Subject: kernel/sys.c: avoid copying possible padding bytes in copy_to_user Initialization is not guaranteed to zero padding bytes so use an explicit memset instead to avoid leaking any kernel content in any possible padding bytes. 
Link: http://lkml.kernel.org/r/dfa331c00881d61c8ee51577a082d8bebd61805c.camel@perches.com Signed-off-by: Joe Perches Cc: Dan Carpenter Cc: Julia Lawall Cc: Thomas Gleixner Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d3aef31e24dc..a9331f101883 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1279,11 +1279,13 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { - struct oldold_utsname tmp = {}; + struct oldold_utsname tmp; if (!name) return -EFAULT; + memset(&tmp, 0, sizeof(tmp)); + down_read(&uts_sem); memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); -- cgit v1.2.3-59-g8ed1b From 964975ac6677c97ae61ec9d6969dd5d03f18d1c3 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 4 Dec 2019 16:52:03 -0800 Subject: lib/genalloc.c: rename addr_in_gen_pool to gen_pool_has_addr Follow the kernel conventions, rename addr_in_gen_pool to gen_pool_has_addr. 
[sjhuang@iluvatar.ai: fix Documentation/ too] Link: http://lkml.kernel.org/r/20181229015914.5573-1-sjhuang@iluvatar.ai Link: http://lkml.kernel.org/r/20181228083950.20398-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Reviewed-by: Andrew Morton Cc: Russell King Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/genalloc.rst | 2 +- arch/arm/mm/dma-mapping.c | 2 +- drivers/misc/sram-exec.c | 2 +- include/linux/genalloc.h | 2 +- kernel/dma/remap.c | 2 +- lib/genalloc.c | 6 +++--- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst index 098a46f55798..a5af2cbf58a5 100644 --- a/Documentation/core-api/genalloc.rst +++ b/Documentation/core-api/genalloc.rst @@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future. :functions: gen_pool_for_each_chunk .. kernel-doc:: lib/genalloc.c - :functions: addr_in_gen_pool + :functions: gen_pool_has_addr .. 
kernel-doc:: lib/genalloc.c :functions: gen_pool_avail diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 1df6eb42f22e..e822af0d9219 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -529,7 +529,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page) static bool __in_atomic_pool(void *start, size_t size) { - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } static int __free_from_pool(void *start, size_t size) diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c index 426ad912b441..d054e2842a5f 100644 --- a/drivers/misc/sram-exec.c +++ b/drivers/misc/sram-exec.c @@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src, if (!part) return NULL; - if (!addr_in_gen_pool(pool, (unsigned long)dst, size)) + if (!gen_pool_has_addr(pool, (unsigned long)dst, size)) return NULL; base = (unsigned long)part->base; diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 4bd583bd6934..5b14a0f38124 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -206,7 +206,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid, const char *name); extern struct gen_pool *gen_pool_get(struct device *dev, const char *name); -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size); #ifdef CONFIG_OF diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d47bd40fc0f5..d14cbc83986a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -178,7 +178,7 @@ bool dma_in_atomic_pool(void *start, size_t size) if (unlikely(!atomic_pool)) return false; - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); + return gen_pool_has_addr(atomic_pool, (unsigned long)start, size); } void *dma_alloc_from_pool(size_t size, struct page 
**ret_page, gfp_t flags) diff --git a/lib/genalloc.c b/lib/genalloc.c index af9a57422186..7f1244b5294a 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -540,7 +540,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool, EXPORT_SYMBOL(gen_pool_for_each_chunk); /** - * addr_in_gen_pool - checks if an address falls within the range of a pool + * gen_pool_has_addr - checks if an address falls within the range of a pool * @pool: the generic memory pool * @start: start address * @size: size of the region @@ -548,7 +548,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk); * Check if the range of addresses falls within the specified pool. Returns * true if the entire range is contained in the pool and false otherwise. */ -bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, +bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start, size_t size) { bool found = false; @@ -567,7 +567,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, rcu_read_unlock(); return found; } -EXPORT_SYMBOL(addr_in_gen_pool); +EXPORT_SYMBOL(gen_pool_has_addr); /** * gen_pool_avail - get available free space of the pool -- cgit v1.2.3-59-g8ed1b From eec028c9386ed1a692aa01a85b55952202b41619 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 4 Dec 2019 16:52:43 -0800 Subject: kcov: remote coverage support Patch series "kcov: collect coverage from usb and vhost", v3. This patchset extends kcov to allow collecting coverage from background kernel threads. This extension requires custom annotations for each of the places where coverage collection is desired. This patchset implements this for hub events in the USB subsystem and for vhost workers. See the first patch description for details about the kcov extension. The other two patches apply this kcov extension to USB and vhost.
Examples of other subsystems that might potentially benefit from this when custom annotations are added (the list is based on process_one_work() callers for bugs recently reported by syzbot): 1. fs: writeback wb_workfn() worker, 2. net: addrconf_dad_work()/addrconf_verify_work() workers, 3. net: neigh_periodic_work() worker, 4. net/p9: p9_write_work()/p9_read_work() workers, 5. block: blk_mq_run_work_fn() worker. These patches have been used to enable coverage-guided USB fuzzing with syzkaller for the last few years, see the details here: https://github.com/google/syzkaller/blob/master/docs/linux/external_fuzzing_usb.md This patchset has been pushed to the public Linux kernel Gerrit instance: https://linux-review.googlesource.com/c/linux/kernel/git/torvalds/linux/+/1524 This patch (of 3): Add background thread coverage collection ability to kcov. With KCOV_ENABLE coverage is collected only for syscalls that are issued from the current process. With KCOV_REMOTE_ENABLE it's possible to collect coverage for arbitrary parts of the kernel code, provided that those parts are annotated with kcov_remote_start()/kcov_remote_stop(). This allows to collect coverage from two types of kernel background threads: the global ones, that are spawned during kernel boot in a limited number of instances (e.g. one USB hub_event() worker thread is spawned per USB HCD); and the local ones, that are spawned when a user interacts with some kernel interface (e.g. vhost workers). To enable collecting coverage from a global background thread, a unique global handle must be assigned and passed to the corresponding kcov_remote_start() call. Then a userspace process can pass a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the kcov_remote_arg struct. This will attach the used kcov device to the code sections, that are referenced by those handles. 
Since there might be many local background threads spawned from different userspace processes, we can't use a single global handle per annotation. Instead, the userspace process passes a non-zero handle through the common_handle field of the kcov_remote_arg struct. This common handle gets saved to the kcov_handle field in the current task_struct and needs to be passed to the newly spawned threads via custom annotations. Those threads should in turn be annotated with kcov_remote_start()/kcov_remote_stop(). Internally kcov stores handles as u64 integers. The top byte of a handle is used to denote the id of a subsystem that this handle belongs to, and the lower 4 bytes are used to denote the id of a thread instance within that subsystem. A reserved value 0 is used as a subsystem id for common handles as they don't belong to a particular subsystem. The bytes 4-7 are currently reserved and must be zero. In the future the number of bytes used for the subsystem or handle ids might be increased. When a particular userspace process collects coverage via a common handle, kcov will collect coverage for each code section that is annotated to use the common handle obtained as kcov_handle from the current task_struct. However non-common handles allow to collect coverage selectively from different subsystems. Link: http://lkml.kernel.org/r/e90e315426a384207edbec1d6aa89e43008e4caf.1572366574.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: "Michael S.
Tsirkin" Cc: Jason Wang Cc: Arnd Bergmann Cc: Steven Rostedt Cc: David Windsor Cc: Elena Reshetova Cc: Anders Roxell Cc: Alexander Potapenko Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kcov.rst | 129 +++++++++ include/linux/kcov.h | 23 ++ include/linux/sched.h | 8 + include/uapi/linux/kcov.h | 28 ++ kernel/kcov.c | 547 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 700 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 42b612677799..36890b026e77 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -34,6 +34,7 @@ Profiling data will only become accessible once debugfs has been mounted:: Coverage collection ------------------- + The following program demonstrates coverage collection from within a test program using kcov: @@ -128,6 +129,7 @@ only need to enable coverage (disable happens automatically on thread end). Comparison operands collection ------------------------------ + Comparison operands collection is similar to coverage collection: .. code-block:: c @@ -202,3 +204,130 @@ Comparison operands collection is similar to coverage collection: Note that the kcov modes (coverage collection or comparison operands) are mutually exclusive. + +Remote coverage collection +-------------------------- + +With KCOV_ENABLE coverage is collected only for syscalls that are issued +from the current process. With KCOV_REMOTE_ENABLE it's possible to collect +coverage for arbitrary parts of the kernel code, provided that those parts +are annotated with kcov_remote_start()/kcov_remote_stop(). + +This allows to collect coverage from two types of kernel background +threads: the global ones, that are spawned during kernel boot in a limited +number of instances (e.g. 
one USB hub_event() worker thread is spawned per +USB HCD); and the local ones, that are spawned when a user interacts with +some kernel interface (e.g. vhost workers). + +To enable collecting coverage from a global background thread, a unique +global handle must be assigned and passed to the corresponding +kcov_remote_start() call. Then a userspace process can pass a list of such +handles to the KCOV_REMOTE_ENABLE ioctl in the handles array field of the +kcov_remote_arg struct. This will attach the used kcov device to the code +sections, that are referenced by those handles. + +Since there might be many local background threads spawned from different +userspace processes, we can't use a single global handle per annotation. +Instead, the userspace process passes a non-zero handle through the +common_handle field of the kcov_remote_arg struct. This common handle gets +saved to the kcov_handle field in the current task_struct and needs to be +passed to the newly spawned threads via custom annotations. Those threads +should in turn be annotated with kcov_remote_start()/kcov_remote_stop(). + +Internally kcov stores handles as u64 integers. The top byte of a handle +is used to denote the id of a subsystem that this handle belongs to, and +the lower 4 bytes are used to denote the id of a thread instance within +that subsystem. A reserved value 0 is used as a subsystem id for common +handles as they don't belong to a particular subsystem. The bytes 4-6 are +currently reserved and must be zero. In the future the number of bytes +used for the subsystem or handle ids might be increased. + +When a particular userspace process collects coverage via a common +handle, kcov will collect coverage for each code section that is annotated +to use the common handle obtained as kcov_handle from the current +task_struct. However, non-common handles allow collecting coverage +selectively from different subsystems. + +.. 
code-block:: c + + struct kcov_remote_arg { + unsigned trace_mode; + unsigned area_size; + unsigned num_handles; + uint64_t common_handle; + uint64_t handles[0]; + }; + + #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) + #define KCOV_DISABLE _IO('c', 101) + #define KCOV_REMOTE_ENABLE _IOW('c', 102, struct kcov_remote_arg) + + #define COVER_SIZE (64 << 10) + + #define KCOV_TRACE_PC 0 + + #define KCOV_SUBSYSTEM_COMMON (0x00ull << 56) + #define KCOV_SUBSYSTEM_USB (0x01ull << 56) + + #define KCOV_SUBSYSTEM_MASK (0xffull << 56) + #define KCOV_INSTANCE_MASK (0xffffffffull) + + static inline __u64 kcov_remote_handle(__u64 subsys, __u64 inst) + { + if (subsys & ~KCOV_SUBSYSTEM_MASK || inst & ~KCOV_INSTANCE_MASK) + return 0; + return subsys | inst; + } + + #define KCOV_COMMON_ID 0x42 + #define KCOV_USB_BUS_NUM 1 + + int main(int argc, char **argv) + { + int fd; + unsigned long *cover, n, i; + struct kcov_remote_arg *arg; + + fd = open("/sys/kernel/debug/kcov", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) + perror("ioctl"), exit(1); + cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if ((void*)cover == MAP_FAILED) + perror("mmap"), exit(1); + + /* Enable coverage collection via common handle and from USB bus #1. */ + arg = calloc(1, sizeof(*arg) + sizeof(uint64_t)); + if (!arg) + perror("calloc"), exit(1); + arg->trace_mode = KCOV_TRACE_PC; + arg->area_size = COVER_SIZE; + arg->num_handles = 1; + arg->common_handle = kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, + KCOV_COMMON_ID); + arg->handles[0] = kcov_remote_handle(KCOV_SUBSYSTEM_USB, + KCOV_USB_BUS_NUM); + if (ioctl(fd, KCOV_REMOTE_ENABLE, arg)) + perror("ioctl"), free(arg), exit(1); + free(arg); + + /* + * Here the user needs to trigger execution of a kernel code section + * that is either annotated with the common handle, or to trigger some + * activity on USB bus #1. 
+ */ + sleep(2); + + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) + printf("0x%lx\n", cover[i + 1]); + if (ioctl(fd, KCOV_DISABLE, 0)) + perror("ioctl"), exit(1); + if (munmap(cover, COVER_SIZE * sizeof(unsigned long))) + perror("munmap"), exit(1); + if (close(fd)) + perror("close"), exit(1); + return 0; + } diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b76a1807028d..a10e84707d82 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -37,12 +37,35 @@ do { \ (t)->kcov_mode &= ~KCOV_IN_CTXSW; \ } while (0) +/* See Documentation/dev-tools/kcov.rst for usage details. */ +void kcov_remote_start(u64 handle); +void kcov_remote_stop(void); +u64 kcov_common_handle(void); + +static inline void kcov_remote_start_common(u64 id) +{ + kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, id)); +} + +static inline void kcov_remote_start_usb(u64 id) +{ + kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, id)); +} + #else static inline void kcov_task_init(struct task_struct *t) {} static inline void kcov_task_exit(struct task_struct *t) {} static inline void kcov_prepare_switch(struct task_struct *t) {} static inline void kcov_finish_switch(struct task_struct *t) {} +static inline void kcov_remote_start(u64 handle) {} +static inline void kcov_remote_stop(void) {} +static inline u64 kcov_common_handle(void) +{ + return 0; +} +static inline void kcov_remote_start_common(u64 id) {} +static inline void kcov_remote_start_usb(u64 id) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0cd97d9dd021..467d26046416 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1210,6 +1210,8 @@ struct task_struct { #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV + /* See kernel/kcov.c for more details. 
*/ + /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; @@ -1221,6 +1223,12 @@ struct task_struct { /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; + + /* KCOV common handle for remote coverage collection: */ + u64 kcov_handle; + + /* KCOV sequence number: */ + int kcov_sequence; #endif #ifdef CONFIG_MEMCG diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 9529867717a8..409d3ad1e6e2 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -4,9 +4,24 @@ #include +/* + * Argument for KCOV_REMOTE_ENABLE ioctl, see Documentation/dev-tools/kcov.rst + * and the comment before kcov_remote_start() for usage details. + */ +struct kcov_remote_arg { + unsigned int trace_mode; /* KCOV_TRACE_PC or KCOV_TRACE_CMP */ + unsigned int area_size; /* Length of coverage buffer in words */ + unsigned int num_handles; /* Size of handles array */ + __u64 common_handle; + __u64 handles[0]; +}; + +#define KCOV_REMOTE_MAX_HANDLES 0x100 + #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) #define KCOV_ENABLE _IO('c', 100) #define KCOV_DISABLE _IO('c', 101) +#define KCOV_REMOTE_ENABLE _IOW('c', 102, struct kcov_remote_arg) enum { /* @@ -32,4 +47,17 @@ enum { #define KCOV_CMP_SIZE(n) ((n) << 1) #define KCOV_CMP_MASK KCOV_CMP_SIZE(3) +#define KCOV_SUBSYSTEM_COMMON (0x00ull << 56) +#define KCOV_SUBSYSTEM_USB (0x01ull << 56) + +#define KCOV_SUBSYSTEM_MASK (0xffull << 56) +#define KCOV_INSTANCE_MASK (0xffffffffull) + +static inline __u64 kcov_remote_handle(__u64 subsys, __u64 inst) +{ + if (subsys & ~KCOV_SUBSYSTEM_MASK || inst & ~KCOV_INSTANCE_MASK) + return 0; + return subsys | inst; +} + #endif /* _LINUX_KCOV_IOCTLS_H */ diff --git a/kernel/kcov.c b/kernel/kcov.c index 2ee38727844a..f50354202dbe 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -21,8 +22,11 @@ #include #include #include +#include #include 
+#define kcov_debug(fmt, ...) pr_debug("%s: " fmt, __func__, ##__VA_ARGS__) + /* Number of 64-bit words written per one comparison: */ #define KCOV_WORDS_PER_CMP 4 @@ -44,19 +48,100 @@ struct kcov { * Reference counter. We keep one for: * - opened file descriptor * - task with enabled coverage (we can't unwire it from another task) + * - each code section for remote coverage collection */ refcount_t refcount; /* The lock protects mode, size, area and t. */ spinlock_t lock; enum kcov_mode mode; - /* Size of arena (in long's for KCOV_MODE_TRACE). */ - unsigned size; + /* Size of arena (in long's). */ + unsigned int size; /* Coverage buffer shared with user space. */ void *area; /* Task for which we collect coverage, or NULL. */ struct task_struct *t; + /* Collecting coverage from remote (background) threads. */ + bool remote; + /* Size of remote area (in long's). */ + unsigned int remote_size; + /* + * Sequence is incremented each time kcov is reenabled, used by + * kcov_remote_stop(), see the comment there. + */ + int sequence; }; +struct kcov_remote_area { + struct list_head list; + unsigned int size; +}; + +struct kcov_remote { + u64 handle; + struct kcov *kcov; + struct hlist_node hnode; +}; + +static DEFINE_SPINLOCK(kcov_remote_lock); +static DEFINE_HASHTABLE(kcov_remote_map, 4); +static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); + +/* Must be called with kcov_remote_lock locked. 
*/ +static struct kcov_remote *kcov_remote_find(u64 handle) +{ + struct kcov_remote *remote; + + hash_for_each_possible(kcov_remote_map, remote, hnode, handle) { + if (remote->handle == handle) + return remote; + } + return NULL; +} + +static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle) +{ + struct kcov_remote *remote; + + if (kcov_remote_find(handle)) + return ERR_PTR(-EEXIST); + remote = kmalloc(sizeof(*remote), GFP_ATOMIC); + if (!remote) + return ERR_PTR(-ENOMEM); + remote->handle = handle; + remote->kcov = kcov; + hash_add(kcov_remote_map, &remote->hnode, handle); + return remote; +} + +/* Must be called with kcov_remote_lock locked. */ +static struct kcov_remote_area *kcov_remote_area_get(unsigned int size) +{ + struct kcov_remote_area *area; + struct list_head *pos; + + kcov_debug("size = %u\n", size); + list_for_each(pos, &kcov_remote_areas) { + area = list_entry(pos, struct kcov_remote_area, list); + if (area->size == size) { + list_del(&area->list); + kcov_debug("rv = %px\n", area); + return area; + } + } + kcov_debug("rv = NULL\n"); + return NULL; +} + +/* Must be called with kcov_remote_lock locked. */ +static void kcov_remote_area_put(struct kcov_remote_area *area, + unsigned int size) +{ + kcov_debug("area = %px, size = %u\n", area, size); + INIT_LIST_HEAD(&area->list); + area->size = size; + list_add(&area->list, &kcov_remote_areas); +} + static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -73,7 +158,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru * in_interrupt() returns false (e.g. preempt_schedule_irq()). * READ_ONCE()/barrier() effectively provides load-acquire wrt * interrupts, there are paired barrier()/WRITE_ONCE() in - * kcov_ioctl_locked(). + * kcov_start(). 
*/ barrier(); return mode == needed_mode; @@ -227,6 +312,78 @@ void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) EXPORT_SYMBOL(__sanitizer_cov_trace_switch); #endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ +static void kcov_start(struct task_struct *t, unsigned int size, + void *area, enum kcov_mode mode, int sequence) +{ + kcov_debug("t = %px, size = %u, area = %px\n", t, size, area); + /* Cache in task struct for performance. */ + t->kcov_size = size; + t->kcov_area = area; + /* See comment in check_kcov_mode(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, mode); + t->kcov_sequence = sequence; +} + +static void kcov_stop(struct task_struct *t) +{ + WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); + barrier(); + t->kcov_size = 0; + t->kcov_area = NULL; +} + +static void kcov_task_reset(struct task_struct *t) +{ + kcov_stop(t); + t->kcov = NULL; + t->kcov_sequence = 0; + t->kcov_handle = 0; +} + +void kcov_task_init(struct task_struct *t) +{ + kcov_task_reset(t); + t->kcov_handle = current->kcov_handle; +} + +static void kcov_reset(struct kcov *kcov) +{ + kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; + kcov->remote = false; + kcov->remote_size = 0; + kcov->sequence++; +} + +static void kcov_remote_reset(struct kcov *kcov) +{ + int bkt; + struct kcov_remote *remote; + struct hlist_node *tmp; + + spin_lock(&kcov_remote_lock); + hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) { + if (remote->kcov != kcov) + continue; + kcov_debug("removing handle %llx\n", remote->handle); + hash_del(&remote->hnode); + kfree(remote); + } + /* Do reset before unlock to prevent races with kcov_remote_start(). 
*/ + kcov_reset(kcov); + spin_unlock(&kcov_remote_lock); +} + +static void kcov_disable(struct task_struct *t, struct kcov *kcov) +{ + kcov_task_reset(t); + if (kcov->remote) + kcov_remote_reset(kcov); + else + kcov_reset(kcov); +} + static void kcov_get(struct kcov *kcov) { refcount_inc(&kcov->refcount); @@ -235,20 +392,12 @@ static void kcov_get(struct kcov *kcov) static void kcov_put(struct kcov *kcov) { if (refcount_dec_and_test(&kcov->refcount)) { + kcov_remote_reset(kcov); vfree(kcov->area); kfree(kcov); } } -void kcov_task_init(struct task_struct *t) -{ - WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); - barrier(); - t->kcov_size = 0; - t->kcov_area = NULL; - t->kcov = NULL; -} - void kcov_task_exit(struct task_struct *t) { struct kcov *kcov; @@ -256,15 +405,36 @@ void kcov_task_exit(struct task_struct *t) kcov = t->kcov; if (kcov == NULL) return; + spin_lock(&kcov->lock); + kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t); + /* + * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t, + * which comes down to: + * WARN_ON(!kcov->remote && kcov->t != t); + * + * For KCOV_REMOTE_ENABLE devices, the exiting task is either: + * 1. A remote task between kcov_remote_start() and kcov_remote_stop(). + * In this case we should print a warning right away, since a task + * shouldn't be exiting when it's in a kcov coverage collection + * section. Here t points to the task that is collecting remote + * coverage, and t->kcov->t points to the thread that created the + * kcov device. Which means that to detect this case we need to + * check that t != t->kcov->t, and this gives us the following: + * WARN_ON(kcov->remote && kcov->t != t); + * + * 2. 
The task that created kcov exiting without calling KCOV_DISABLE, + * and then again we can make sure that t->kcov->t == t: + * WARN_ON(kcov->remote && kcov->t != t); + * + * By combining all three checks into one we get: + */ if (WARN_ON(kcov->t != t)) { spin_unlock(&kcov->lock); return; } /* Just to not leave dangling references behind. */ - kcov_task_init(t); - kcov->t = NULL; - kcov->mode = KCOV_MODE_INIT; + kcov_disable(t, kcov); spin_unlock(&kcov->lock); kcov_put(kcov); } @@ -313,6 +483,7 @@ static int kcov_open(struct inode *inode, struct file *filep) if (!kcov) return -ENOMEM; kcov->mode = KCOV_MODE_DISABLED; + kcov->sequence = 1; refcount_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; @@ -325,6 +496,20 @@ static int kcov_close(struct inode *inode, struct file *filep) return 0; } +static int kcov_get_mode(unsigned long arg) +{ + if (arg == KCOV_TRACE_PC) + return KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + return KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; +} + /* * Fault in a lazily-faulted vmalloc area before it can be used by * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the @@ -340,14 +525,35 @@ static void kcov_fault_in_area(struct kcov *kcov) READ_ONCE(area[offset]); } +static inline bool kcov_check_handle(u64 handle, bool common_valid, + bool uncommon_valid, bool zero_valid) +{ + if (handle & ~(KCOV_SUBSYSTEM_MASK | KCOV_INSTANCE_MASK)) + return false; + switch (handle & KCOV_SUBSYSTEM_MASK) { + case KCOV_SUBSYSTEM_COMMON: + return (handle & KCOV_INSTANCE_MASK) ? 
+ common_valid : zero_valid; + case KCOV_SUBSYSTEM_USB: + return uncommon_valid; + default: + return false; + } + return false; +} + static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { struct task_struct *t; unsigned long size, unused; + int mode, i; + struct kcov_remote_arg *remote_arg; + struct kcov_remote *remote; switch (cmd) { case KCOV_INIT_TRACE: + kcov_debug("KCOV_INIT_TRACE\n"); /* * Enable kcov in trace mode and setup buffer size. * Must happen before anything else. @@ -366,6 +572,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: + kcov_debug("KCOV_ENABLE\n"); /* * Enable coverage for the current task. * At this point user must have been enabled trace mode, @@ -378,29 +585,20 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, t = current; if (kcov->t != NULL || t->kcov != NULL) return -EBUSY; - if (arg == KCOV_TRACE_PC) - kcov->mode = KCOV_MODE_TRACE_PC; - else if (arg == KCOV_TRACE_CMP) -#ifdef CONFIG_KCOV_ENABLE_COMPARISONS - kcov->mode = KCOV_MODE_TRACE_CMP; -#else - return -ENOTSUPP; -#endif - else - return -EINVAL; + mode = kcov_get_mode(arg); + if (mode < 0) + return mode; kcov_fault_in_area(kcov); - /* Cache in task struct for performance. */ - t->kcov_size = kcov->size; - t->kcov_area = kcov->area; - /* See comment in check_kcov_mode(). */ - barrier(); - WRITE_ONCE(t->kcov_mode, kcov->mode); + kcov->mode = mode; + kcov_start(t, kcov->size, kcov->area, kcov->mode, + kcov->sequence); t->kcov = kcov; kcov->t = t; - /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + /* Put either in kcov_task_exit() or in KCOV_DISABLE. */ kcov_get(kcov); return 0; case KCOV_DISABLE: + kcov_debug("KCOV_DISABLE\n"); /* Disable coverage for the current task. 
*/ unused = arg; if (unused != 0 || current->kcov != kcov) @@ -408,11 +606,65 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, t = current; if (WARN_ON(kcov->t != t)) return -EINVAL; - kcov_task_init(t); - kcov->t = NULL; - kcov->mode = KCOV_MODE_INIT; + kcov_disable(t, kcov); kcov_put(kcov); return 0; + case KCOV_REMOTE_ENABLE: + kcov_debug("KCOV_REMOTE_ENABLE\n"); + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) + return -EINVAL; + t = current; + if (kcov->t != NULL || t->kcov != NULL) + return -EBUSY; + remote_arg = (struct kcov_remote_arg *)arg; + mode = kcov_get_mode(remote_arg->trace_mode); + if (mode < 0) + return mode; + if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->mode = mode; + t->kcov = kcov; + kcov->t = t; + kcov->remote = true; + kcov->remote_size = remote_arg->area_size; + spin_lock(&kcov_remote_lock); + for (i = 0; i < remote_arg->num_handles; i++) { + kcov_debug("handle %llx\n", remote_arg->handles[i]); + if (!kcov_check_handle(remote_arg->handles[i], + false, true, false)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return -EINVAL; + } + remote = kcov_remote_add(kcov, remote_arg->handles[i]); + if (IS_ERR(remote)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return PTR_ERR(remote); + } + } + if (remote_arg->common_handle) { + kcov_debug("common handle %llx\n", + remote_arg->common_handle); + if (!kcov_check_handle(remote_arg->common_handle, + true, false, false)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return -EINVAL; + } + remote = kcov_remote_add(kcov, + remote_arg->common_handle); + if (IS_ERR(remote)) { + spin_unlock(&kcov_remote_lock); + kcov_disable(t, kcov); + return PTR_ERR(remote); + } + t->kcov_handle = remote_arg->common_handle; + } + spin_unlock(&kcov_remote_lock); + /* Put either in kcov_task_exit() or in KCOV_DISABLE. 
*/ + kcov_get(kcov); + return 0; default: return -ENOTTY; } @@ -422,11 +674,35 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct kcov *kcov; int res; + struct kcov_remote_arg *remote_arg = NULL; + unsigned int remote_num_handles; + unsigned long remote_arg_size; + + if (cmd == KCOV_REMOTE_ENABLE) { + if (get_user(remote_num_handles, (unsigned __user *)(arg + + offsetof(struct kcov_remote_arg, num_handles)))) + return -EFAULT; + if (remote_num_handles > KCOV_REMOTE_MAX_HANDLES) + return -EINVAL; + remote_arg_size = struct_size(remote_arg, handles, + remote_num_handles); + remote_arg = memdup_user((void __user *)arg, remote_arg_size); + if (IS_ERR(remote_arg)) + return PTR_ERR(remote_arg); + if (remote_arg->num_handles != remote_num_handles) { + kfree(remote_arg); + return -EINVAL; + } + arg = (unsigned long)remote_arg; + } kcov = filep->private_data; spin_lock(&kcov->lock); res = kcov_ioctl_locked(kcov, cmd, arg); spin_unlock(&kcov->lock); + + kfree(remote_arg); + return res; } @@ -438,6 +714,207 @@ static const struct file_operations kcov_fops = { .release = kcov_close, }; +/* + * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section + * of code in a kernel background thread to allow kcov to be used to collect + * coverage from that part of code. + * + * The handle argument of kcov_remote_start() identifies a code section that is + * used for coverage collection. A userspace process passes this handle to + * KCOV_REMOTE_ENABLE ioctl to make the used kcov device start collecting + * coverage for the code section identified by this handle. + * + * The usage of these annotations in the kernel code is different depending on + * the type of the kernel thread whose code is being annotated. + * + * For global kernel threads that are spawned in a limited number of instances + * (e.g. one USB hub_event() worker thread is spawned per USB HCD), each + * instance must be assigned a unique 4-byte instance id. 
The instance id is + * then combined with a 1-byte subsystem id to get a handle via + * kcov_remote_handle(subsystem_id, instance_id). + * + * For local kernel threads that are spawned from system call handlers when a + * user interacts with some kernel interface (e.g. vhost workers), a handle is + * passed from a userspace process as the common_handle field of the + * kcov_remote_arg struct (note, that the user must generate a handle by using + * kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an + * arbitrary 4-byte non-zero number as the instance id). This common handle + * then gets saved into the task_struct of the process that issued the + * KCOV_REMOTE_ENABLE ioctl. When this process issues system calls that spawn + * kernel threads, the common handle must be retrieved via kcov_common_handle() + * and passed to the spawned threads via custom annotations. Those kernel + * threads must in turn be annotated with kcov_remote_start(common_handle) and + * kcov_remote_stop(). All of the threads that are spawned by the same process + * obtain the same handle, hence the name "common". + * + * See Documentation/dev-tools/kcov.rst for more details. + * + * Internally, this function looks up the kcov device associated with the + * provided handle, allocates an area for coverage collection, and saves the + * pointers to kcov and area into the current task_struct to allow coverage to + * be collected via __sanitizer_cov_trace_pc(). + * In turn, kcov_remote_stop() clears those pointers from task_struct to stop + * collecting coverage and copies all collected coverage into the kcov area. 
+ */ +void kcov_remote_start(u64 handle) +{ + struct kcov_remote *remote; + void *area; + struct task_struct *t; + unsigned int size; + enum kcov_mode mode; + int sequence; + + if (WARN_ON(!kcov_check_handle(handle, true, true, true))) + return; + if (WARN_ON(!in_task())) + return; + t = current; + /* + * Check that kcov_remote_start is not called twice + * nor called by user tasks (with enabled kcov). + */ + if (WARN_ON(t->kcov)) + return; + + kcov_debug("handle = %llx\n", handle); + + spin_lock(&kcov_remote_lock); + remote = kcov_remote_find(handle); + if (!remote) { + kcov_debug("no remote found"); + spin_unlock(&kcov_remote_lock); + return; + } + /* Put in kcov_remote_stop(). */ + kcov_get(remote->kcov); + t->kcov = remote->kcov; + /* + * Read kcov fields before unlock to prevent races with + * KCOV_DISABLE / kcov_remote_reset(). + */ + size = remote->kcov->remote_size; + mode = remote->kcov->mode; + sequence = remote->kcov->sequence; + area = kcov_remote_area_get(size); + spin_unlock(&kcov_remote_lock); + + if (!area) { + area = vmalloc(size * sizeof(unsigned long)); + if (!area) { + t->kcov = NULL; + kcov_put(remote->kcov); + return; + } + } + /* Reset coverage size. 
*/ + *(u64 *)area = 0; + + kcov_debug("area = %px, size = %u", area, size); + + kcov_start(t, size, area, mode, sequence); + +} +EXPORT_SYMBOL(kcov_remote_start); + +static void kcov_move_area(enum kcov_mode mode, void *dst_area, + unsigned int dst_area_size, void *src_area) +{ + u64 word_size = sizeof(unsigned long); + u64 count_size, entry_size_log; + u64 dst_len, src_len; + void *dst_entries, *src_entries; + u64 dst_occupied, dst_free, bytes_to_move, entries_moved; + + kcov_debug("%px %u <= %px %lu\n", + dst_area, dst_area_size, src_area, *(unsigned long *)src_area); + + switch (mode) { + case KCOV_MODE_TRACE_PC: + dst_len = READ_ONCE(*(unsigned long *)dst_area); + src_len = *(unsigned long *)src_area; + count_size = sizeof(unsigned long); + entry_size_log = __ilog2_u64(sizeof(unsigned long)); + break; + case KCOV_MODE_TRACE_CMP: + dst_len = READ_ONCE(*(u64 *)dst_area); + src_len = *(u64 *)src_area; + count_size = sizeof(u64); + BUILD_BUG_ON(!is_power_of_2(KCOV_WORDS_PER_CMP)); + entry_size_log = __ilog2_u64(sizeof(u64) * KCOV_WORDS_PER_CMP); + break; + default: + WARN_ON(1); + return; + } + + /* As arm can't divide u64 integers use log of entry size. */ + if (dst_len > ((dst_area_size * word_size - count_size) >> + entry_size_log)) + return; + dst_occupied = count_size + (dst_len << entry_size_log); + dst_free = dst_area_size * word_size - dst_occupied; + bytes_to_move = min(dst_free, src_len << entry_size_log); + dst_entries = dst_area + dst_occupied; + src_entries = src_area + count_size; + memcpy(dst_entries, src_entries, bytes_to_move); + entries_moved = bytes_to_move >> entry_size_log; + + switch (mode) { + case KCOV_MODE_TRACE_PC: + WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); + break; + case KCOV_MODE_TRACE_CMP: + WRITE_ONCE(*(u64 *)dst_area, dst_len + entries_moved); + break; + default: + break; + } +} + +/* See the comment before kcov_remote_start() for usage details. 
*/ +void kcov_remote_stop(void) +{ + struct task_struct *t = current; + struct kcov *kcov = t->kcov; + void *area = t->kcov_area; + unsigned int size = t->kcov_size; + int sequence = t->kcov_sequence; + + if (!kcov) { + kcov_debug("no kcov found\n"); + return; + } + + kcov_stop(t); + t->kcov = NULL; + + spin_lock(&kcov->lock); + /* + * KCOV_DISABLE could have been called between kcov_remote_start() + * and kcov_remote_stop(), hence the check. + */ + kcov_debug("move if: %d == %d && %d\n", + sequence, kcov->sequence, (int)kcov->remote); + if (sequence == kcov->sequence && kcov->remote) + kcov_move_area(kcov->mode, kcov->area, kcov->size, area); + spin_unlock(&kcov->lock); + + spin_lock(&kcov_remote_lock); + kcov_remote_area_put(area, size); + spin_unlock(&kcov_remote_lock); + + kcov_put(kcov); +} +EXPORT_SYMBOL(kcov_remote_stop); + +/* See the comment before kcov_remote_start() for usage details. */ +u64 kcov_common_handle(void) +{ + return current->kcov_handle; +} +EXPORT_SYMBOL(kcov_common_handle); + static int __init kcov_init(void) { /* -- cgit v1.2.3-59-g8ed1b From e9eeec58c992c47b394e4f829e4f81b923b0a322 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 4 Dec 2019 17:06:06 -0800 Subject: bpf: Fix a bug when getting subprog 0 jited image in check_attach_btf_id For jited bpf program, if the subprogram count is 1, i.e., there is no callees in the program, prog->aux->func will be NULL and prog->bpf_func points to image address of the program. If there is more than one subprogram, prog->aux->func is populated, and subprogram 0 can be accessed through either prog->bpf_func or prog->aux->func[0]. Other subprograms should be accessed through prog->aux->func[subprog_id]. This patch fixed a bug in check_attach_btf_id(), where prog->aux->func[subprog_id] is used to access any subprogram which caused a segfault like below: [79162.619208] BUG: kernel NULL pointer dereference, address: 0000000000000000 ...... [79162.634255] Call Trace: [79162.634974] ? 
_cond_resched+0x15/0x30 [79162.635686] ? kmem_cache_alloc_trace+0x162/0x220 [79162.636398] ? selinux_bpf_prog_alloc+0x1f/0x60 [79162.637111] bpf_prog_load+0x3de/0x690 [79162.637809] __do_sys_bpf+0x105/0x1740 [79162.638488] do_syscall_64+0x5b/0x180 [79162.639147] entry_SYSCALL_64_after_hwframe+0x44/0xa9 ...... Fixes: 5b92a28aae4d ("bpf: Support attaching tracing BPF program to other BPF programs") Reported-by: Eelco Chaudron Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191205010606.177774-1-yhs@fb.com --- kernel/bpf/verifier.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a0482e1c4a77..034ef81f935b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9636,7 +9636,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) ret = -EINVAL; goto out; } - addr = (long) tgt_prog->aux->func[subprog]->bpf_func; + if (subprog == 0) + addr = (long) tgt_prog->bpf_func; + else + addr = (long) tgt_prog->aux->func[subprog]->bpf_func; } else { addr = kallsyms_lookup_name(tname); if (!addr) { -- cgit v1.2.3-59-g8ed1b