diff options
Diffstat (limited to 'kernel')
82 files changed, 3873 insertions, 1557 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 4cb4130ced32..c332eb9d4841 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ +obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit.c b/kernel/audit.c index 87f31bf1f0a0..8c201f414226 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -880,7 +880,7 @@ main_queue: return 0; } -int audit_send_list(void *_dest) +int audit_send_list_thread(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; @@ -924,19 +924,30 @@ out_kfree_skb: return NULL; } +static void audit_free_reply(struct audit_reply *reply) +{ + if (!reply) + return; + + if (reply->skb) + kfree_skb(reply->skb); + if (reply->net) + put_net(reply->net); + kfree(reply); +} + static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct sock *sk = audit_get_sk(reply->net); audit_ctl_lock(); audit_ctl_unlock(); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(sk, reply->skb, reply->portid, 0); - put_net(reply->net); - kfree(reply); + netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0); + reply->skb = NULL; + audit_free_reply(reply); return 0; } @@ -950,35 +961,32 @@ static int audit_send_reply_thread(void *arg) * @payload: payload data * @size: payload size * - * Allocates an skb, builds the netlink message, and sends it to the port id. - * No failure notifications. + * Allocates a skb, builds the netlink message, and sends it to the port id. */ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - struct net *net = sock_net(NETLINK_CB(request_skb).sk); - struct sk_buff *skb; struct task_struct *tsk; - struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), - GFP_KERNEL); + struct audit_reply *reply; + reply = kzalloc(sizeof(*reply), GFP_KERNEL); if (!reply) return; - skb = audit_make_reply(seq, type, done, multi, payload, size); - if (!skb) - goto out; - - reply->net = get_net(net); + reply->skb = audit_make_reply(seq, type, done, multi, payload, size); + if (!reply->skb) + goto err; + reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); reply->portid = NETLINK_CB(request_skb).portid; - reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (!IS_ERR(tsk)) - return; - kfree_skb(skb); -out: - kfree(reply); + if (IS_ERR(tsk)) + goto err; + + return; + +err: + audit_free_reply(reply); } /* @@ -1525,20 +1533,60 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_unlock(); } +/* Log information about who is connecting to the audit multicast socket */ +static void audit_log_multicast(int group, const char *op, int err) +{ + const struct cred *cred; + struct tty_struct *tty; + char comm[sizeof(current->comm)]; + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER); + if (!ab) + return; + + cred = current_cred(); + tty = audit_get_tty(); + audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", + task_pid_nr(current), + from_kuid(&init_user_ns, cred->uid), + from_kuid(&init_user_ns, audit_get_loginuid(current)), + tty ? tty_name(tty) : "(none)", + audit_get_sessionid(current)); + audit_put_tty(tty); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_d_path_exe(ab, current->mm); /* exe= */ + audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err); + audit_log_end(ab); +} + /* Run custom bind function on netlink socket group connect or bind requests. */ -static int audit_bind(struct net *net, int group) +static int audit_multicast_bind(struct net *net, int group) { + int err = 0; + if (!capable(CAP_AUDIT_READ)) - return -EPERM; + err = -EPERM; + audit_log_multicast(group, "connect", err); + return err; +} - return 0; +static void audit_multicast_unbind(struct net *net, int group) +{ + audit_log_multicast(group, "disconnect", 0); } static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, - .bind = audit_bind, + .bind = audit_multicast_bind, + .unbind = audit_multicast_unbind, .flags = NL_CFG_F_NONROOT_RECV, .groups = AUDIT_NLGRP_MAX, }; diff --git a/kernel/audit.h b/kernel/audit.h index 2eed4d231624..f0233dc40b17 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -229,7 +229,7 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *_dest); +int audit_send_list_thread(void *_dest); extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 026e34da4ace..a10e2997aa6c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1161,11 +1161,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) { - u32 portid = NETLINK_CB(request_skb).portid; - struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; - int err = 0; /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for @@ -1173,25 +1170,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); + dest = kmalloc(sizeof(*dest), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(net); - dest->portid = portid; + dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk)); + dest->portid = NETLINK_CB(request_skb).portid; skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); - tsk = kthread_run(audit_send_list, dest, "audit_send_list"); + tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list"); if (IS_ERR(tsk)) { skb_queue_purge(&dest->q); + put_net(dest->net); kfree(dest); - err = PTR_ERR(tsk); + return PTR_ERR(tsk); } - return err; + return 0; } int audit_comparator(u32 left, u32 op, u32 right) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 814406a35db1..468a23390457 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -130,6 +130,17 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; +struct audit_nfcfgop_tab { + enum audit_nfcfgop op; + const char *s; +}; + +static const struct audit_nfcfgop_tab audit_nfcfgs[] = { + { AUDIT_XT_OP_REGISTER, "register" }, + { AUDIT_XT_OP_REPLACE, "replace" }, + { AUDIT_XT_OP_UNREGISTER, "unregister" }, +}; + static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; @@ -2542,6 +2553,26 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } +void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, + enum audit_nfcfgop op) +{ + struct audit_buffer *ab; + char comm[sizeof(current->comm)]; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_NETFILTER_CFG); + if (!ab) + return; + audit_log_format(ab, "table=%s family=%u entries=%u op=%s", + name, af, nentries, audit_nfcfgs[op].s); + + audit_log_format(ab, " pid=%u", task_pid_nr(current)); + audit_log_task_context(ab); /* subj= */ + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); + audit_log_end(ab); +} +EXPORT_SYMBOL_GPL(__audit_log_nfcfg); + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index af52ca658c73..9df4cc9a2907 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -262,10 +262,10 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA_DIGEST_WORDS]; - u32 ws[SHA_WORKSPACE_WORDS]; + u32 digest[SHA1_DIGEST_WORDS]; + u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; @@ -277,7 +277,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha_init(digest); + sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation @@ -308,8 +308,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; - bsize = round_up(psize, SHA_MESSAGE_BYTES); - blocks = bsize / SHA_MESSAGE_BYTES; + bsize = round_up(psize, SHA1_BLOCK_SIZE); + blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); @@ -320,12 +320,12 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { - sha_transform(digest, todo, ws); - todo += SHA_MESSAGE_BYTES; + sha1_transform(digest, todo, ws); + todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; - for (i = 0; i < SHA_DIGEST_WORDS; i++) + for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e14a842d7e0d..4d530b1d5683 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> +#include <asm/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> #include <linux/bpf-netns.h> @@ -287,27 +288,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index b05f1dd58a62..812a61afd538 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/compat.c b/kernel/compat.c index 843dd17e6078..b8d2800bb4b7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -199,7 +199,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_read_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -211,11 +211,11 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, } if (nr_compat_longs) unsafe_get_user(*mask, umask++, Efault); - user_access_end(); + user_read_access_end(); return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -228,7 +228,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!user_access_begin(umask, bitmap_size / 8)) + if (!user_write_access_begin(umask, bitmap_size / 8)) return -EFAULT; while (nr_compat_longs > 1) { @@ -239,10 +239,10 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } if (nr_compat_longs) unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); - user_access_end(); + user_write_access_end(); return 0; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 2371292f30b0..6ff2578ecf17 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3,6 +3,7 @@ * * This code is licenced under the GPL. */ +#include <linux/sched/mm.h> #include <linux/proc_fs.h> #include <linux/smp.h> #include <linux/init.h> @@ -432,7 +433,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) /* * On x86 it's required to boot all logical CPUs at least once so * that the init code can get a chance to set CR4.MCE on each - * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any + * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any * core will shutdown the machine. */ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); @@ -564,6 +565,21 @@ static int bringup_cpu(unsigned int cpu) return bringup_wait_for_ap(cpu); } +static int finish_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + struct mm_struct *mm = idle->active_mm; + + /* + * idle_task_exit() will have switched to &init_mm, now + * clean up any remaining active_mm state. + */ + if (mm != &init_mm) + idle->active_mm = &init_mm; + mmdrop(mm); + return 0; +} + /* * Hotplug state machine related functions */ @@ -1327,7 +1343,7 @@ void bringup_nonboot_cpus(unsigned int setup_max_cpus) #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int __freeze_secondary_cpus(int primary, bool suspend) +int freeze_secondary_cpus(int primary) { int cpu, error = 0; @@ -1352,7 +1368,7 @@ int __freeze_secondary_cpus(int primary, bool suspend) if (cpu == primary) continue; - if (suspend && pm_wakeup_pending()) { + if (pm_wakeup_pending()) { pr_info("Wakeup pending. Abort CPU freeze\n"); error = -EBUSY; break; @@ -1376,8 +1392,8 @@ int __freeze_secondary_cpus(int primary, bool suspend) /* * Make sure the CPUs won't be enabled by someone else. We need to do - * this even in case of failure as all disable_nonboot_cpus() users are - * supposed to do enable_nonboot_cpus() on the failure path. + * this even in case of failure as all freeze_secondary_cpus() users are + * supposed to do thaw_secondary_cpus() on the failure path. */ cpu_hotplug_disabled++; @@ -1385,15 +1401,15 @@ int __freeze_secondary_cpus(int primary, bool suspend) return error; } -void __weak arch_enable_nonboot_cpus_begin(void) +void __weak arch_thaw_secondary_cpus_begin(void) { } -void __weak arch_enable_nonboot_cpus_end(void) +void __weak arch_thaw_secondary_cpus_end(void) { } -void enable_nonboot_cpus(void) +void thaw_secondary_cpus(void) { int cpu, error; @@ -1405,7 +1421,7 @@ void enable_nonboot_cpus(void) pr_info("Enabling non-boot CPUs ...\n"); - arch_enable_nonboot_cpus_begin(); + arch_thaw_secondary_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); @@ -1418,7 +1434,7 @@ void enable_nonboot_cpus(void) pr_warn("Error taking CPU%d up: %d\n", cpu, error); } - arch_enable_nonboot_cpus_end(); + arch_thaw_secondary_cpus_end(); cpumask_clear(frozen_cpus); out: @@ -1549,7 +1565,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup.single = bringup_cpu, - .teardown.single = NULL, + .teardown.single = finish_cpu, .cant_stop = true, }, /* Final state before CPU kills itself */ diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 9c23ae074b40..92da32275af5 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -6,12 +6,6 @@ #include <linux/export.h> /* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; - -/* * stores the physical address of elf header of crash image * * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2b7c9b67931d..ef94e906f05a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -67,9 +67,7 @@ static int kgdb_break_asap; struct debuggerinfo_struct kgdb_info[NR_CPUS]; -/** - * kgdb_connected - Is a host GDB connected to us? - */ +/* kgdb_connected - Is a host GDB connected to us? */ int kgdb_connected; EXPORT_SYMBOL_GPL(kgdb_connected); @@ -532,6 +530,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) if (exception_level > 1) { dump_stack(); + kgdb_io_module_registered = false; panic("Recursive entry to debugger"); } @@ -668,6 +667,8 @@ return_normal: if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) goto kgdb_restore; + atomic_inc(&ignore_console_lock_warning); + /* Call the I/O driver's pre_exception routine */ if (dbg_io_ops->pre_exception) dbg_io_ops->pre_exception(); @@ -740,6 +741,8 @@ cpu_master_loop: if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); + atomic_dec(&ignore_console_lock_warning); + if (!kgdb_single_step) { raw_spin_unlock(&dbg_slave_lock); /* Wait till all the CPUs have quit from the debugger. */ @@ -946,6 +949,14 @@ void kgdb_panic(const char *msg) kgdb_breakpoint(); } +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + pr_crit("Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + void __weak kgdb_arch_late(void) { } @@ -956,6 +967,9 @@ void __init dbg_late_init(void) if (kgdb_io_module_registered) kgdb_arch_late(); kdb_init(KDB_INIT_FULL); + + if (kgdb_io_module_registered && kgdb_break_asap) + kgdb_initial_breakpoint(); } static int @@ -1051,14 +1065,6 @@ void kgdb_schedule_breakpoint(void) } EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - pr_crit("Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - /** * kgdb_register_io_module - register KGDB IO module * @new_dbg_io_ops: the io ops vector @@ -1067,15 +1073,22 @@ static void kgdb_initial_breakpoint(void) */ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) { + struct kgdb_io *old_dbg_io_ops; int err; spin_lock(&kgdb_registration_lock); - if (dbg_io_ops) { - spin_unlock(&kgdb_registration_lock); + old_dbg_io_ops = dbg_io_ops; + if (old_dbg_io_ops) { + if (!old_dbg_io_ops->deinit) { + spin_unlock(&kgdb_registration_lock); - pr_err("Another I/O driver is already registered with KGDB\n"); - return -EBUSY; + pr_err("KGDB I/O driver %s can't replace %s.\n", + new_dbg_io_ops->name, old_dbg_io_ops->name); + return -EBUSY; + } + pr_info("Replacing I/O driver %s with %s\n", + old_dbg_io_ops->name, new_dbg_io_ops->name); } if (new_dbg_io_ops->init) { @@ -1090,12 +1103,18 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops) { + old_dbg_io_ops->deinit(); + return 0; + } + pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name); /* Arm KGDB now. */ kgdb_register_callbacks(); - if (kgdb_break_asap) + if (kgdb_break_asap && + (!dbg_is_early || IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG))) kgdb_initial_breakpoint(); return 0; @@ -1125,6 +1144,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) spin_unlock(&kgdb_registration_lock); + if (old_dbg_io_ops->deinit) + old_dbg_io_ops->deinit(); + pr_info("Unregistered I/O driver %s, debugger disabled\n", old_dbg_io_ops->name); } @@ -1165,7 +1187,8 @@ static int __init opt_kgdb_wait(char *str) kgdb_break_asap = 1; kdb_init(KDB_INIT_EARLY); - if (kgdb_io_module_registered) + if (kgdb_io_module_registered && + IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG)) kgdb_initial_breakpoint(); return 0; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 515379cbf209..ec190569f690 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -62,7 +62,7 @@ int kdb_grep_trailing; /* * Kernel debugger state flags */ -int kdb_flags; +unsigned int kdb_flags; /* * kdb_lock protects updates to kdb_initial_cpu. Used to @@ -418,8 +418,7 @@ int kdb_set(int argc, const char **argv) argv[2]); return 0; } - kdb_flags = (kdb_flags & - ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK)) | (debugflags << KDB_DEBUG_FLAG_SHIFT); return 0; @@ -1108,7 +1107,8 @@ static int handle_ctrl_cmd(char *cmd) switch (*cmd) { case CTRL_P: if (cmdptr != cmd_tail) - cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + cmdptr = (cmdptr + KDB_CMD_HISTORY_COUNT - 1) % + KDB_CMD_HISTORY_COUNT; strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); return 1; case CTRL_N: @@ -2081,7 +2081,8 @@ static int kdb_env(int argc, const char **argv) } if (KDB_DEBUG(MASK)) - kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + kdb_printf("KDBDEBUG=0x%x\n", + (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT); return 0; } diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..914ff5a58dd5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -20,23 +20,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. * Cannot be used in non-sleeping contexts @@ -44,15 +27,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; + void *vaddr; - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -62,24 +42,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = size >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index bdb1533ada81..334d48b16c36 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -16,7 +16,7 @@ struct callchain_cpus_entries { struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; + struct perf_callchain_entry *cpu_entries[]; }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; diff --git a/kernel/events/core.c b/kernel/events/core.c index 468139611e06..2e330f330303 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -95,11 +95,11 @@ static void remote_function(void *data) * @info: the function call argument * * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly + * be on the current CPU, which just calls the function directly. This will + * retry due to any failures in smp_call_function_single(), such as if the + * task_cpu() goes offline concurrently. * - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away + * returns @func return value or -ESRCH when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -112,11 +112,16 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) }; int ret; - do { - ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - if (!ret) - ret = data.ret; - } while (ret == -EAGAIN); + for (;;) { + ret = smp_call_function_single(task_cpu(p), remote_function, + &data, 1); + ret = !ret ? data.ret : -EAGAIN; + + if (ret != -EAGAIN) + break; + + cond_resched(); + } return ret; } @@ -9402,7 +9407,7 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9462,7 +9467,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -11509,7 +11514,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index f16f66b6b655..fcbf5616a441 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -55,7 +55,7 @@ struct perf_buffer { void *aux_priv; struct perf_event_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[]; }; extern void rb_free(struct perf_buffer *rb); diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..c300253a7b8e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -228,8 +228,9 @@ repeat: goto repeat; } -void rcuwait_wake_up(struct rcuwait *w) +int rcuwait_wake_up(struct rcuwait *w) { + int ret = 0; struct task_struct *task; rcu_read_lock(); @@ -237,7 +238,7 @@ void rcuwait_wake_up(struct rcuwait *w) /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called - * rcuwait_trywake() in the first place. Pairs with set_current_state() + * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE @@ -249,8 +250,10 @@ void rcuwait_wake_up(struct rcuwait *w) task = rcu_dereference(w->task); if (task) - wake_up_process(task); + ret = wake_up_process(task); rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); @@ -708,8 +711,12 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - profile_task_exit(tsk); - kcov_task_exit(tsk); + /* + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ WARN_ON(blk_needs_flush_plug(tsk)); @@ -727,6 +734,16 @@ void __noreturn do_exit(long code) */ set_fs(USER_DS); + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + profile_task_exit(tsk); + kcov_task_exit(tsk); + ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); @@ -744,13 +761,6 @@ void __noreturn do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); @@ -1558,7 +1568,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1567,10 +1577,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } @@ -1685,7 +1695,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!user_access_begin(infop, sizeof(*infop))) + if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); @@ -1694,10 +1704,10 @@ COMPAT_SYSCALL_DEFINE5(waitid, unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); - user_access_end(); + user_write_access_end(); return err; Efault: - user_access_end(); + user_write_access_end(); return -EFAULT; } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 48ed22774efa..be98e94cb3cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -94,6 +94,7 @@ #include <linux/thread_info.h> #include <linux/stackleak.h> #include <linux/kasan.h> +#include <linux/scs.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -456,6 +457,8 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { + scs_release(tsk); + #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, @@ -840,6 +843,8 @@ void __init fork_init(void) NULL, free_vm_stack_cache); #endif + scs_init(); + lockdep_init_task(&init_task); uprobes_init(); } @@ -899,6 +904,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (err) goto free_stack; + err = scs_prepare(tsk, node); + if (err) + goto free_stack; + #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -1683,6 +1692,11 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + p->trc_reader_nesting = 0; + p->trc_reader_special.s = 0; + INIT_LIST_HEAD(&p->trc_holdout_list); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } struct pid *pidfd_pid(const struct file *file) diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..b4b9f960b610 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -486,10 +486,13 @@ static u64 get_inode_sequence_number(struct inode *inode) * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: + * * ( inode->i_sequence, page->index, offset_within_page ) + * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: + * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 20d501af4f2e..d63c324895ea 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -72,6 +72,7 @@ config IRQ_DOMAIN config IRQ_SIM bool select IRQ_WORK + select IRQ_DOMAIN # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index b992f88c5613..48006608baf0 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,14 +1,31 @@ // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl> + * Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com> */ -#include <linux/slab.h> -#include <linux/irq_sim.h> #include <linux/irq.h> +#include <linux/irq_sim.h> +#include <linux/irq_work.h> +#include <linux/interrupt.h> +#include <linux/slab.h> + +struct irq_sim_work_ctx { + struct irq_work work; + int irq_base; + unsigned int irq_count; + unsigned long *pending; + struct irq_domain *domain; +}; + +struct irq_sim_irq_ctx { + int irqnum; + bool enabled; + struct irq_sim_work_ctx *work_ctx; +}; struct irq_sim_devres { - struct irq_sim *sim; + struct irq_domain *domain; }; static void irq_sim_irqmask(struct irq_data *data) @@ -36,159 +53,205 @@ static int irq_sim_set_type(struct irq_data *data, unsigned int type) return 0; } +static int irq_sim_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) + *state = test_bit(hwirq, irq_ctx->work_ctx->pending); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int irq_sim_set_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool state) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + switch (which) { + case IRQCHIP_STATE_PENDING: + if (irq_ctx->enabled) { + assign_bit(hwirq, irq_ctx->work_ctx->pending, state); + if (state) + irq_work_queue(&irq_ctx->work_ctx->work); + } + break; + default: + return -EINVAL; + } + + return 0; +} + static struct irq_chip irq_sim_irqchip = { - .name = "irq_sim", - .irq_mask = irq_sim_irqmask, - .irq_unmask = irq_sim_irqunmask, - .irq_set_type = irq_sim_set_type, + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, + .irq_get_irqchip_state = irq_sim_get_irqchip_state, + .irq_set_irqchip_state = irq_sim_set_irqchip_state, }; static void irq_sim_handle_irq(struct irq_work *work) { struct irq_sim_work_ctx *work_ctx; unsigned int offset = 0; - struct irq_sim *sim; int irqnum; work_ctx = container_of(work, struct irq_sim_work_ctx, work); - sim = container_of(work_ctx, struct irq_sim, work_ctx); - while (!bitmap_empty(work_ctx->pending, sim->irq_count)) { + while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) { offset = find_next_bit(work_ctx->pending, - sim->irq_count, offset); + work_ctx->irq_count, offset); clear_bit(offset, work_ctx->pending); - irqnum = irq_sim_irqnum(sim, offset); + irqnum = irq_find_mapping(work_ctx->domain, offset); handle_simple_irq(irq_to_desc(irqnum)); } } +static int irq_sim_domain_map(struct irq_domain *domain, + unsigned int virq, irq_hw_number_t hw) +{ + struct irq_sim_work_ctx *work_ctx = domain->host_data; + struct irq_sim_irq_ctx *irq_ctx; + + irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL); + if (!irq_ctx) + return -ENOMEM; + + irq_set_chip(virq, &irq_sim_irqchip); + irq_set_chip_data(virq, irq_ctx); + irq_set_handler(virq, handle_simple_irq); + irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + irq_ctx->work_ctx = work_ctx; + + return 0; +} + +static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq) +{ + struct irq_sim_irq_ctx *irq_ctx; + struct irq_data *irqd; + + irqd = irq_domain_get_irq_data(domain, virq); + irq_ctx = irq_data_get_irq_chip_data(irqd); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(irqd); + kfree(irq_ctx); +} + +static const struct irq_domain_ops irq_sim_domain_ops = { + .map = irq_sim_domain_map, + .unmap = irq_sim_domain_unmap, +}; + /** - * irq_sim_init - Initialize the interrupt simulator: allocate a range of - * dummy interrupts. + * irq_domain_create_sim - Create a new interrupt simulator irq_domain and + * allocate a range of dummy interrupts. * - * @sim: The interrupt simulator object to initialize. - * @num_irqs: Number of interrupts to allocate + * @fnode: struct fwnode_handle to be associated with this domain. + * @num_irqs: Number of interrupts to allocate. * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, + unsigned int num_irqs) { - int i; + struct irq_sim_work_ctx *work_ctx; - sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); - if (!sim->irqs) - return -ENOMEM; + work_ctx = kmalloc(sizeof(*work_ctx), GFP_KERNEL); + if (!work_ctx) + goto err_out; - sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); - if (sim->irq_base < 0) { - kfree(sim->irqs); - return sim->irq_base; - } + work_ctx->pending = bitmap_zalloc(num_irqs, GFP_KERNEL); + if (!work_ctx->pending) + goto err_free_work_ctx; - sim->work_ctx.pending = bitmap_zalloc(num_irqs, GFP_KERNEL); - if (!sim->work_ctx.pending) { - kfree(sim->irqs); - irq_free_descs(sim->irq_base, num_irqs); - return -ENOMEM; - } + work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs, + &irq_sim_domain_ops, + work_ctx); + if (!work_ctx->domain) + goto err_free_bitmap; - for (i = 0; i < num_irqs; i++) { - sim->irqs[i].irqnum = sim->irq_base + i; - sim->irqs[i].enabled = false; - irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); - irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); - irq_set_handler(sim->irq_base + i, &handle_simple_irq); - irq_modify_status(sim->irq_base + i, - IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); - } + work_ctx->irq_count = num_irqs; + init_irq_work(&work_ctx->work, irq_sim_handle_irq); - init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); - sim->irq_count = num_irqs; + return work_ctx->domain; - return sim->irq_base; +err_free_bitmap: + bitmap_free(work_ctx->pending); +err_free_work_ctx: + kfree(work_ctx); +err_out: + return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL_GPL(irq_sim_init); +EXPORT_SYMBOL_GPL(irq_domain_create_sim); /** - * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt - * descriptors and allocated memory. + * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free + * the interrupt descriptors and allocated memory. * - * @sim: The interrupt simulator to tear down. + * @domain: The interrupt simulator domain to tear down. */ -void irq_sim_fini(struct irq_sim *sim) +void irq_domain_remove_sim(struct irq_domain *domain) { - irq_work_sync(&sim->work_ctx.work); - bitmap_free(sim->work_ctx.pending); - irq_free_descs(sim->irq_base, sim->irq_count); - kfree(sim->irqs); + struct irq_sim_work_ctx *work_ctx = domain->host_data; + + irq_work_sync(&work_ctx->work); + bitmap_free(work_ctx->pending); + kfree(work_ctx); + + irq_domain_remove(domain); } -EXPORT_SYMBOL_GPL(irq_sim_fini); +EXPORT_SYMBOL_GPL(irq_domain_remove_sim); -static void devm_irq_sim_release(struct device *dev, void *res) +static void devm_irq_domain_release_sim(struct device *dev, void *res) { struct irq_sim_devres *this = res; - irq_sim_fini(this->sim); + irq_domain_remove_sim(this->domain); } /** - * irq_sim_init - Initialize the interrupt simulator for a managed device. + * devm_irq_domain_create_sim - Create a new interrupt simulator for + * a managed device. * * @dev: Device to initialize the simulator object for. - * @sim: The interrupt simulator object to initialize. + * @fnode: struct fwnode_handle to be associated with this domain. * @num_irqs: Number of interrupts to allocate * - * On success: return the base of the allocated interrupt range. - * On failure: a negative errno. + * On success: return a new irq_domain object. + * On failure: a negative errno wrapped with ERR_PTR(). */ -int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, - unsigned int num_irqs) +struct irq_domain *devm_irq_domain_create_sim(struct device *dev, + struct fwnode_handle *fwnode, + unsigned int num_irqs) { struct irq_sim_devres *dr; - int rv; - dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + dr = devres_alloc(devm_irq_domain_release_sim, + sizeof(*dr), GFP_KERNEL); if (!dr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - rv = irq_sim_init(sim, num_irqs); - if (rv < 0) { + dr->domain = irq_domain_create_sim(fwnode, num_irqs); + if (IS_ERR(dr->domain)) { devres_free(dr); - return rv; + return dr->domain; } - dr->sim = sim; devres_add(dev, dr); - - return rv; -} -EXPORT_SYMBOL_GPL(devm_irq_sim_init); - -/** - * irq_sim_fire - Enqueue an interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt which should be fired. - */ -void irq_sim_fire(struct irq_sim *sim, unsigned int offset) -{ - if (sim->irqs[offset].enabled) { - set_bit(offset, sim->work_ctx.pending); - irq_work_queue(&sim->work_ctx.work); - } -} -EXPORT_SYMBOL_GPL(irq_sim_fire); - -/** - * irq_sim_irqnum - Get the allocated number of a dummy interrupt. - * - * @sim: The interrupt simulator object. - * @offset: Offset of the simulated interrupt for which to retrieve - * the number. - */ -int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) -{ - return sim->irqs[offset].irqnum; + return dr->domain; } -EXPORT_SYMBOL_GPL(irq_sim_irqnum); +EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 35b8d97c3a1d..a4c2c915511d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -132,14 +132,13 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { - struct device_node *of_node = to_of_node(fwnode); struct irqchip_fwid *fwid; struct irq_domain *domain; static atomic_t unknown_domains; domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), - GFP_KERNEL, of_node_to_nid(of_node)); + GFP_KERNEL, of_node_to_nid(to_of_node(fwnode))); if (!domain) return NULL; @@ -162,30 +161,16 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } -#ifdef CONFIG_ACPI - } else if (is_acpi_device_node(fwnode)) { - struct acpi_buffer buf = { - .length = ACPI_ALLOCATE_BUFFER, - }; - acpi_handle handle; - - handle = acpi_device_handle(to_acpi_device_node(fwnode)); - if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { - domain->name = buf.pointer; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } - - domain->fwnode = fwnode; -#endif - } else if (of_node) { + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || + is_software_node(fwnode)) { char *name; /* - * DT paths contain '/', which debugfs is legitimately + * fwnode paths contain '/', which debugfs is legitimately * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... */ - name = kasprintf(GFP_KERNEL, "%pOF", of_node); + name = kasprintf(GFP_KERNEL, "%pfw", fwnode); if (!name) { kfree(domain); return NULL; @@ -210,7 +195,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; } - of_node_get(of_node); + fwnode_handle_get(fwnode); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); @@ -259,7 +244,7 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); - of_node_put(irq_domain_get_of_node(domain)); + fwnode_handle_put(domain->fwnode); if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) kfree(domain->name); kfree(domain); @@ -1047,6 +1032,18 @@ int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, return virq; } +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); + #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy @@ -1248,18 +1245,6 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, EXPORT_SYMBOL(irq_domain_set_info); /** - * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data - * @irq_data: The pointer to irq_data - */ -void irq_domain_reset_irq_data(struct irq_data *irq_data) -{ - irq_data->hwirq = 0; - irq_data->chip = &no_irq_chip; - irq_data->chip_data = NULL; -} -EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); - -/** * irq_domain_free_irqs_common - Clear irq_data and free the parent * @domain: Interrupt domain to match * @virq: IRQ number to start with diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 453a8a0f4804..761911168438 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2619,6 +2619,8 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2696,6 +2698,8 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 48b5d1b6af4d..eca83965b631 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); /* * If the work is already pending, no need to raise the IPI. * The pairing atomic_fetch_andnot() in irq_work_run() makes sure @@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &work->llnode); } else { __irq_work_queue_local(work); } @@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void) return true; } +void irq_work_single(void *arg) +{ + struct irq_work *work = arg; + int flags; + + /* + * Clear the PENDING bit, after this point the @work + * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. + */ + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + + lockdep_irq_work_enter(work); + work->func(work); + lockdep_irq_work_exit(work); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); +} + static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; @@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) { - int flags; - /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. - */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); - - lockdep_irq_work_enter(work); - work->func(work); - lockdep_irq_work_exit(work); - /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. - */ - flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); - } + llist_for_each_entry_safe(work, tmp, llnode, llnode) + irq_work_single(work); } /* diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ffbe03a45c16..0fbdee78266b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2179,6 +2179,24 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end) return 0; } +/* Remove all symbols in given area from kprobe blacklist */ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end) +{ + struct kprobe_blacklist_entry *ent, *n; + + list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) { + if (ent->start_addr < start || ent->start_addr >= end) + continue; + list_del(&ent->list); + kfree(ent); + } +} + +static void kprobe_remove_ksym_blacklist(unsigned long entry) +{ + kprobe_remove_area_blacklist(entry, entry + 1); +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; @@ -2211,10 +2229,62 @@ static int __init populate_kprobe_blacklist(unsigned long *start, /* Symbols in __kprobes_text are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); + if (ret) + return ret; + + /* Symbols in noinstr section are blacklisted */ + ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, + (unsigned long)__noinstr_text_end); return ret ? : arch_populate_kprobe_blacklist(); } +static void add_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_add_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_add_area_blacklist(start, end); + } +} + +static void remove_module_kprobe_blacklist(struct module *mod) +{ + unsigned long start, end; + int i; + + if (mod->kprobe_blacklist) { + for (i = 0; i < mod->num_kprobe_blacklist; i++) + kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]); + } + + start = (unsigned long)mod->kprobes_text_start; + if (start) { + end = start + mod->kprobes_text_size; + kprobe_remove_area_blacklist(start, end); + } + + start = (unsigned long)mod->noinstr_text_start; + if (start) { + end = start + mod->noinstr_text_size; + kprobe_remove_area_blacklist(start, end); + } +} + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2225,6 +2295,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) { + mutex_lock(&kprobe_mutex); + add_module_kprobe_blacklist(mod); + mutex_unlock(&kprobe_mutex); + } if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2255,6 +2330,8 @@ static int kprobes_module_callback(struct notifier_block *nb, kill_kprobe(p); } } + if (val == MODULE_STATE_GOING) + remove_module_kprobe_blacklist(mod); mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } @@ -2420,6 +2497,7 @@ static const struct file_operations debugfs_kprobes_operations = { /* kprobes/blacklist -- shows which functions can not be probed */ static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&kprobe_mutex); return seq_list_start(&kprobe_blacklist, *pos); } @@ -2446,10 +2524,15 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) return 0; } +static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v) +{ + mutex_unlock(&kprobe_mutex); +} + static const struct seq_operations kprobe_blacklist_seq_ops = { .start = kprobe_blacklist_seq_start, .next = kprobe_blacklist_seq_next, - .stop = kprobe_seq_stop, /* Reuse void function */ + .stop = kprobe_blacklist_seq_stop, .show = kprobe_blacklist_seq_show, }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ac10db66cc63..4c057dd8e93b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -393,25 +393,6 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } -/* - * Split the recrursion counter in two to readily detect 'off' vs recursion. - */ -#define LOCKDEP_RECURSION_BITS 16 -#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) -#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) - -void lockdep_off(void) -{ - current->lockdep_recursion += LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion -= LOCKDEP_OFF; -} -EXPORT_SYMBOL(lockdep_on); - static inline void lockdep_recursion_finish(void) { if (WARN_ON_ONCE(--current->lockdep_recursion)) @@ -489,7 +470,7 @@ struct lock_trace { struct hlist_node hash_entry; u32 hash; u32 nr_entries; - unsigned long entries[0] __aligned(sizeof(unsigned long)); + unsigned long entries[] __aligned(sizeof(unsigned long)); }; #define LOCK_TRACE_SIZE_IN_LONGS \ (sizeof(struct lock_trace) / sizeof(unsigned long)) @@ -3635,13 +3616,10 @@ mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) /* * Hardirqs will be enabled: */ -static void __trace_hardirqs_on_caller(unsigned long ip) +static void __trace_hardirqs_on_caller(void) { struct task_struct *curr = current; - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - /* * We are going to turn hardirqs on, so set the * usage bit for all held locks: @@ -3654,15 +3632,19 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); } -void lockdep_hardirqs_on(unsigned long ip) +/** + * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts + * @ip: Caller address + * + * Invoked before a possible transition to RCU idle from exit to user or + * guest mode. This ensures that all RCU operations are done before RCU + * stops watching. After the RCU transition lockdep_hardirqs_on() has to be + * invoked to set the final state. + */ +void lockdep_hardirqs_on_prepare(unsigned long ip) { if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -3698,20 +3680,62 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; + current->hardirq_chain_key = current->curr_chain_key; + current->lockdep_recursion++; - __trace_hardirqs_on_caller(ip); + __trace_hardirqs_on_caller(); lockdep_recursion_finish(); } -NOKPROBE_SYMBOL(lockdep_hardirqs_on); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare); + +void noinstr lockdep_hardirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || curr->lockdep_recursion)) + return; + + if (curr->hardirqs_enabled) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * Ensure the lock stack remained unchanged between + * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on(). + */ + DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key != + current->curr_chain_key); + + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_on_events); +} +EXPORT_SYMBOL_GPL(lockdep_hardirqs_on); /* * Hardirqs were disabled: */ -void lockdep_hardirqs_off(unsigned long ip) +void noinstr lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - if (unlikely(!debug_locks || current->lockdep_recursion)) + if (unlikely(!debug_locks || curr->lockdep_recursion)) return; /* @@ -3729,10 +3753,11 @@ void lockdep_hardirqs_off(unsigned long ip) curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; debug_atomic_inc(hardirqs_off_events); - } else + } else { debug_atomic_inc(redundant_hardirqs_off); + } } -NOKPROBE_SYMBOL(lockdep_hardirqs_off); +EXPORT_SYMBOL_GPL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -4408,8 +4433,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, dump_stack(); } -static int match_held_lock(const struct held_lock *hlock, - const struct lockdep_map *lock) +static noinstr int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; @@ -4696,7 +4721,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) return 0; } -static nokprobe_inline +static __always_inline int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; @@ -4956,7 +4981,7 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(const struct lockdep_map *lock, int read) +noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c9f090d64f00..cfdd5b93264d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -141,7 +141,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * set up. */ #ifndef CONFIG_DEBUG_RT_MUTEXES -# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) # define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) @@ -202,7 +201,6 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #else -# define rt_mutex_cmpxchg_relaxed(l,c,n) (0) # define rt_mutex_cmpxchg_acquire(l,c,n) (0) # define rt_mutex_cmpxchg_release(l,c,n) (0) diff --git a/kernel/module.c b/kernel/module.c index 8833e848b73c..be5413903d20 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2403,7 +2403,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(sname, ".init")) + || module_init_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2436,7 +2436,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(sname, ".init")) + || !module_init_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2771,6 +2771,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_init_section(const char *name) +{ + return strstarts(name, ".init"); +} + bool __weak module_exit_section(const char *name) { return strstarts(name, ".exit"); @@ -2949,8 +2954,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; @@ -3153,6 +3157,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) } #endif + mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1, + &mod->noinstr_text_size); + #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), @@ -3197,6 +3204,13 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ei_funcs), &mod->num_ei_funcs); #endif +#ifdef CONFIG_KPROBES + mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1, + &mod->kprobes_text_size); + mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist", + sizeof(unsigned long), + &mod->num_kprobe_blacklist); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ed9882108cd2..b03df67621d0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,8 @@ #include <net/net_namespace.h> #include <linux/ipc_namespace.h> #include <linux/time_namespace.h> +#include <linux/fs_struct.h> +#include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> @@ -257,37 +259,296 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) +static int check_setns_flags(unsigned long flags) { - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; - struct file *file; - struct ns_common *ns; - int err; + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | + CLONE_NEWCGROUP))) + return -EINVAL; - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); +#ifndef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + return -EINVAL; +#endif +#ifndef CONFIG_PID_NS + if (flags & CLONE_NEWPID) + return -EINVAL; +#endif +#ifndef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) + return -EINVAL; +#endif +#ifndef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + return -EINVAL; +#endif +#ifndef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) + return -EINVAL; +#endif +#ifndef CONFIG_NET_NS + if (flags & CLONE_NEWNET) + return -EINVAL; +#endif - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) - goto out; + return 0; +} + +static void put_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + + if (flags & CLONE_NEWUSER) + put_cred(nsset_cred(nsset)); + /* + * We only created a temporary copy if we attached to more than just + * the mount namespace. + */ + if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) + free_fs_struct(nsset->fs); + if (nsset->nsproxy) + free_nsproxy(nsset->nsproxy); +} + +static int prepare_nsset(unsigned flags, struct nsset *nsset) +{ + struct task_struct *me = current; + + nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs); + if (IS_ERR(nsset->nsproxy)) + return PTR_ERR(nsset->nsproxy); - new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); + if (flags & CLONE_NEWUSER) + nsset->cred = prepare_creds(); + else + nsset->cred = current_cred(); + if (!nsset->cred) goto out; + + /* Only create a temporary copy of fs_struct if we really need to. */ + if (flags == CLONE_NEWNS) { + nsset->fs = me->fs; + } else if (flags & CLONE_NEWNS) { + nsset->fs = copy_fs_struct(me->fs); + if (!nsset->fs) + goto out; } - err = ns->ops->install(new_nsproxy, ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; + nsset->flags = flags; + return 0; + +out: + put_nsset(nsset); + return -ENOMEM; +} + +static inline int validate_ns(struct nsset *nsset, struct ns_common *ns) +{ + return ns->ops->install(nsset, ns); +} + +/* + * This is the inverse operation to unshare(). + * Ordering is equivalent to the standard ordering used everywhere else + * during unshare and process creation. The switch to the new set of + * namespaces occurs at the point of no return after installation of + * all requested namespaces was successful in commit_nsset(). + */ +static int validate_nsset(struct nsset *nsset, struct pid *pid) +{ + int ret = 0; + unsigned flags = nsset->flags; + struct user_namespace *user_ns = NULL; + struct pid_namespace *pid_ns = NULL; + struct nsproxy *nsp; + struct task_struct *tsk; + + /* Take a "snapshot" of the target task's namespaces. */ + rcu_read_lock(); + tsk = pid_task(pid, PIDTYPE_PID); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + return -EPERM; + } + + task_lock(tsk); + nsp = tsk->nsproxy; + if (nsp) + get_nsproxy(nsp); + task_unlock(tsk); + if (!nsp) { + rcu_read_unlock(); + return -ESRCH; + } + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + pid_ns = task_active_pid_ns(tsk); + if (unlikely(!pid_ns)) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_pid_ns(pid_ns); + } +#endif + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + user_ns = get_user_ns(__task_cred(tsk)->user_ns); +#endif + rcu_read_unlock(); + + /* + * Install requested namespaces. The caller will have + * verified earlier that the requested namespaces are + * supported on this kernel. We don't report errors here + * if a namespace is requested that isn't supported. + */ +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + ret = validate_ns(nsset, &user_ns->ns); + if (ret) + goto out; + } +#endif + + if (flags & CLONE_NEWNS) { + ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns)); + if (ret) + goto out; + } + +#ifdef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) { + ret = validate_ns(nsset, &nsp->uts_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) { + ret = validate_ns(nsset, &nsp->ipc_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + ret = validate_ns(nsset, &pid_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) { + ret = validate_ns(nsset, &nsp->cgroup_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_NET_NS + if (flags & CLONE_NEWNET) { + ret = validate_ns(nsset, &nsp->net_ns->ns); + if (ret) + goto out; + } +#endif + +out: + if (pid_ns) + put_pid_ns(pid_ns); + if (nsp) + put_nsproxy(nsp); + put_user_ns(user_ns); + + return ret; +} + +/* + * This is the point of no return. There are just a few namespaces + * that do some actual work here and it's sufficiently minimal that + * a separate ns_common operation seems unnecessary for now. + * Unshare is doing the same thing. If we'll end up needing to do + * more in a given namespace or a helper here is ultimately not + * exported anymore a simple commit handler for each namespace + * should be added to ns_common. + */ +static void commit_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + struct task_struct *me = current; + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + /* transfer ownership */ + commit_creds(nsset_cred(nsset)); + nsset->cred = NULL; + } +#endif + + /* We only need to commit if we have used a temporary fs_struct. */ + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) { + set_fs_root(me->fs, &nsset->fs->root); + set_fs_pwd(me->fs, &nsset->fs->pwd); } - switch_task_namespaces(tsk, new_nsproxy); - perf_event_namespaces(tsk); +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + exit_sem(me); +#endif + + /* transfer ownership */ + switch_task_namespaces(me, nsset->nsproxy); + nsset->nsproxy = NULL; +} + +SYSCALL_DEFINE2(setns, int, fd, int, flags) +{ + struct file *file; + struct ns_common *ns = NULL; + struct nsset nsset = {}; + int err = 0; + + file = fget(fd); + if (!file) + return -EBADF; + + if (proc_ns_file(file)) { + ns = get_proc_ns(file_inode(file)); + if (flags && (ns->ops->type != flags)) + err = -EINVAL; + flags = ns->ops->type; + } else if (!IS_ERR(pidfd_pid(file))) { + err = check_setns_flags(flags); + } else { + err = -EBADF; + } + if (err) + goto out; + + err = prepare_nsset(flags, &nsset); + if (err) + goto out; + + if (proc_ns_file(file)) + err = validate_ns(&nsset, ns); + else + err = validate_nsset(&nsset, file->private_data); + if (!err) { + commit_nsset(&nsset); + perf_event_namespaces(current); + } + put_nsset(&nsset); out: fput(file); return err; diff --git a/kernel/padata.c b/kernel/padata.c index a6afa12fb75e..aae789896616 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -703,7 +703,7 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -718,7 +718,7 @@ static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, node); + pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node); if (!pinst_has_cpu(pinst, cpu)) return 0; @@ -734,8 +734,9 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, &pinst->node); - cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, + &pinst->cpu_dead_node); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node); #endif WARN_ON(!list_empty(&pinst->pslist)); @@ -939,9 +940,10 @@ static struct padata_instance *padata_alloc(const char *name, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, + &pinst->cpu_online_node); cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, - &pinst->node); + &pinst->cpu_dead_node); #endif put_online_cpus(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 3ccaba5f15c0..0e5ac162c3a8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int pidns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index c208566c844b..4d0e6e815a2b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -80,6 +80,18 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.rst>. +config HIBERNATION_SNAPSHOT_DEV + bool "Userspace snapshot device" + depends on HIBERNATION + default y + ---help--- + Device used by the uswsusp tools. + + Say N if no snapshotting from userspace is needed, this also + reduces the attack surface of the kernel. + + If in doubt, say Y. + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index e7e47d9be1e5..5899260a8bef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o +obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30bd28d1d418..02ec716a4927 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -67,6 +67,18 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +static atomic_t hibernate_atomic = ATOMIC_INIT(1); + +bool hibernate_acquire(void) +{ + return atomic_add_unless(&hibernate_atomic, -1, 0); +} + +void hibernate_release(void) +{ + atomic_inc(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); @@ -704,7 +716,7 @@ int hibernate(void) lock_system_sleep(); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } @@ -775,7 +787,7 @@ int hibernate(void) Exit: __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); - atomic_inc(&snapshot_device_available); + hibernate_release(); Unlock: unlock_system_sleep(); pr_info("hibernation exit\n"); @@ -880,7 +892,7 @@ static int software_resume(void) goto Unlock; /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; swsusp_close(FMODE_READ); goto Unlock; @@ -911,7 +923,7 @@ static int software_resume(void) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); pr_info("resume failed (%d)\n", error); - atomic_inc(&snapshot_device_available); + hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&system_transition_mutex); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7cdc64dc2373..ba2094db6294 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -154,8 +154,8 @@ extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -/* If unset, the snapshot device cannot be open. */ -extern atomic_t snapshot_device_available; +extern bool hibernate_acquire(void); +extern void hibernate_release(void); extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7959449765d9..d5eedc2baa2a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,9 +35,13 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; + struct inode *bd_inode; } snapshot_state; -atomic_t snapshot_device_available = ATOMIC_INIT(1); +int is_hibernate_resume_dev(const struct inode *bd_inode) +{ + return hibernation_available() && snapshot_state.bd_inode == bd_inode; +} static int snapshot_open(struct inode *inode, struct file *filp) { @@ -49,13 +53,13 @@ static int snapshot_open(struct inode *inode, struct file *filp) lock_system_sleep(); - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); + hibernate_release(); error = -ENOSYS; goto Unlock; } @@ -92,11 +96,12 @@ static int snapshot_open(struct inode *inode, struct file *filp) __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) - atomic_inc(&snapshot_device_available); + hibernate_release(); data->frozen = false; data->ready = false; data->platform_support = false; + data->bd_inode = NULL; Unlock: unlock_system_sleep(); @@ -112,6 +117,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; + data->bd_inode = NULL; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -122,7 +128,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); + hibernate_release(); unlock_system_sleep(); @@ -204,6 +210,7 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { + struct block_device *bdev; sector_t offset; dev_t swdev; @@ -234,9 +241,12 @@ static int snapshot_set_swap_area(struct snapshot_data *data, data->swap = -1; return -EINVAL; } - data->swap = swap_type_of(swdev, offset, NULL); + data->swap = swap_type_of(swdev, offset, &bdev); if (data->swap < 0) return -ENODEV; + + data->bd_inode = bdev->bd_inode; + bdput(bdev); return 0; } diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index 11f19c466af5..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -6,6 +6,7 @@ struct console_cmdline { char name[16]; /* Name of the driver */ int index; /* Minor dev. to use */ + bool user_specified; /* Specified by command line vs. platform */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE char *brl_options; /* Options for braille driver */ diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index b2b0f526f249..660f9a6bf73a 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,9 +6,11 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 + +#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 extern raw_spinlock_t logbuf_lock; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 471f649b5868..3132d6f860a8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,6 +280,7 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; +static bool has_preferred_console; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -974,6 +975,16 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) user->idx = log_next_idx; user->seq = log_next_seq; break; + case SEEK_CUR: + /* + * It isn't supported due to the record nature of this + * interface: _SET _DATA and _END point to very specific + * record positions, while _CUR would be more useful in case + * of a byte-based log. Because of that, return the default + * errno value for invalid seek operation. + */ + ret = -ESPIPE; + break; default: ret = -EINVAL; } @@ -2140,7 +2151,7 @@ asmlinkage __visible void early_printk(const char *fmt, ...) #endif static int __add_preferred_console(char *name, int idx, char *options, - char *brl_options) + char *brl_options, bool user_specified) { struct console_cmdline *c; int i; @@ -2155,6 +2166,8 @@ static int __add_preferred_console(char *name, int idx, char *options, if (strcmp(c->name, name) == 0 && c->index == idx) { if (!brl_options) preferred_console = i; + if (user_specified) + c->user_specified = true; return 0; } } @@ -2164,6 +2177,7 @@ static int __add_preferred_console(char *name, int idx, char *options, preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; + c->user_specified = user_specified; braille_set_options(c, brl_options); c->index = idx; @@ -2190,6 +2204,9 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; + if (str[0] == 0) + return 1; + if (_braille_console_setup(&str, &brl_options)) return 1; @@ -2218,7 +2235,7 @@ static int __init console_setup(char *str) idx = simple_strtoul(s, NULL, 10); *s = 0; - __add_preferred_console(buf, idx, options, brl_options); + __add_preferred_console(buf, idx, options, brl_options, true); console_set_on_cmdline = 1; return 1; } @@ -2239,7 +2256,7 @@ __setup("console=", console_setup); */ int add_preferred_console(char *name, int idx, char *options) { - return __add_preferred_console(name, idx, options, NULL); + return __add_preferred_console(name, idx, options, NULL, false); } bool console_suspend_enabled = true; @@ -2438,9 +2455,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); + len = snprintf(text, sizeof(text), + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; @@ -2652,6 +2669,63 @@ static int __init keep_bootcon_setup(char *str) early_param("keep_bootcon", keep_bootcon_setup); /* + * This is called by register_console() to try to match + * the newly registered console with any of the ones selected + * by either the command line or add_preferred_console() and + * setup/enable it. + * + * Care need to be taken with consoles that are statically + * enabled such as netconsole + */ +static int try_enable_new_console(struct console *newcon, bool user_specified) +{ + struct console_cmdline *c; + int i; + + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { + if (c->user_specified != user_specified) + continue; + if (!newcon->match || + newcon->match(newcon, c->name, c->index, c->options) != 0) { + /* default matching */ + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); + if (strcmp(c->name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != c->index) + continue; + if (newcon->index < 0) + newcon->index = c->index; + + if (_braille_register_console(newcon, c)) + return 0; + + if (newcon->setup && + newcon->setup(newcon, c->options) != 0) + return -EIO; + } + newcon->flags |= CON_ENABLED; + if (i == preferred_console) { + newcon->flags |= CON_CONSDEV; + has_preferred_console = true; + } + return 0; + } + + /* + * Some consoles, such as pstore and netconsole, can be enabled even + * without matching. Accept the pre-enabled consoles only when match() + * and setup() had a change to be called. + */ + if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) + return 0; + + return -ENOENT; +} + +/* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the @@ -2672,11 +2746,9 @@ early_param("keep_bootcon", keep_bootcon_setup); */ void register_console(struct console *newcon) { - int i; unsigned long flags; struct console *bcon = NULL; - struct console_cmdline *c; - static bool has_preferred; + int err; for_each_console(bcon) { if (WARN(bcon == newcon, "console '%s%d' already registered\n", @@ -2701,15 +2773,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (!has_preferred || bcon || !console_drivers) - has_preferred = preferred_console >= 0; + if (!has_preferred_console || bcon || !console_drivers) + has_preferred_console = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ - if (!has_preferred) { + if (!has_preferred_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2717,47 +2789,20 @@ void register_console(struct console *newcon) newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; - has_preferred = true; + has_preferred_console = true; } } } - /* - * See if this console matches one we selected on - * the command line. - */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { - if (!newcon->match || - newcon->match(newcon, c->name, c->index, c->options) != 0) { - /* default matching */ - BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); - if (strcmp(c->name, newcon->name) != 0) - continue; - if (newcon->index >= 0 && - newcon->index != c->index) - continue; - if (newcon->index < 0) - newcon->index = c->index; - - if (_braille_register_console(newcon, c)) - return; - - if (newcon->setup && - newcon->setup(newcon, c->options) != 0) - break; - } + /* See if this console matches one we selected on the command line */ + err = try_enable_new_console(newcon, true); - newcon->flags |= CON_ENABLED; - if (i == preferred_console) { - newcon->flags |= CON_CONSDEV; - has_preferred = true; - } - break; - } + /* If not, try to match against the platform default(s) */ + if (err == -ENOENT) + err = try_enable_new_console(newcon, false); - if (!(newcon->flags & CON_ENABLED)) + /* printk() messages are not printed to the Braille console. */ + if (err || newcon->flags & CON_BRL) return; /* @@ -2779,6 +2824,8 @@ void register_console(struct console *newcon) console_drivers = newcon; if (newcon->next) newcon->next->flags &= ~CON_CONSDEV; + /* Ensure this flag is always set for the head of the list */ + newcon->flags |= CON_CONSDEV; } else { newcon->next = console_drivers->next; console_drivers->next = newcon; @@ -3144,6 +3191,23 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; + default: + return "Unknown"; + } +} +EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -3157,12 +3221,19 @@ void kmsg_dump(enum kmsg_dump_reason reason) struct kmsg_dumper *dumper; unsigned long flags; - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) + enum kmsg_dump_reason max_reason = dumper->max_reason; + + /* + * If client has not provided a specific max_reason, default + * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. + */ + if (max_reason == KMSG_DUMP_UNDEF) { + max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : + KMSG_DUMP_OOPS; + } + if (reason > max_reason) continue; /* initialize iterator with data about the stored records */ @@ -3360,7 +3431,7 @@ out: EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** - * kmsg_dump_rewind_nolock - reset the interator (unlocked version) + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and @@ -3378,7 +3449,7 @@ void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) } /** - * kmsg_dump_rewind - reset the interator + * kmsg_dump_rewind - reset the iterator * @dumper: registered kmsg dumper * * Reset the dumper's iterator so that kmsg_dump_get_line() and diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index d9a659a686f3..4242403316bb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -10,6 +10,7 @@ #include <linux/cpumask.h> #include <linux/irq_work.h> #include <linux/printk.h> +#include <linux/kprobes.h> #include "internal.h" @@ -293,14 +294,14 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void notrace printk_nmi_enter(void) +void noinstr printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } -void notrace printk_nmi_exit(void) +void noinstr printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); } /* diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1cc940fef17c..0ebe15a84985 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -70,13 +70,37 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config TASKS_RCU_GENERIC + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU + select SRCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + config TASKS_RCU def_bool PREEMPTION - select SRCU help This option enables a task-based RCU implementation that uses only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + user-mode execution as quiescent states. Not for manual selection. + +config TASKS_RUDE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + only context switch (including preemption) and user-mode + execution as quiescent states. It forces IPIs and context + switches on all online CPUs, including idle ones, so use + with caution. + +config TASKS_TRACE_RCU + def_bool 0 + help + This option enables a task-based RCU implementation that uses + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the CPU + hotplug code paths. It can force IPIs on online CPUs, including + idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -210,4 +234,22 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4aa02eee8f6c..452feae8de20 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -29,6 +29,8 @@ config RCU_PERF_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -46,6 +48,8 @@ config RCU_TORTURE_TEST select TORTURE_TEST select SRCU select TASKS_RCU + select TASKS_RUDE_RCU + select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 00ddc92c5774..cf66a3ccd757 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -431,6 +431,7 @@ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); +void show_rcu_tasks_gp_kthreads(void); void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ @@ -441,6 +442,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TASKS_RUDE_FLAVOR, + RCU_TASKS_TRACING_FLAVOR, RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR @@ -454,6 +457,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) @@ -471,6 +475,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) @@ -498,6 +503,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU +static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -507,6 +513,7 @@ static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a8d097d84d..16dd1e6b7c09 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,6 +88,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -635,7 +636,7 @@ kfree_perf_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -722,6 +723,8 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } + pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); if (kfree_reader_tasks == NULL) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5453bd557f43..efb792e13fca 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -20,7 +20,7 @@ #include <linux/err.h> #include <linux/spinlock.h> #include <linux/smp.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -45,12 +45,25 @@ #include <linux/sched/sysctl.h> #include <linux/oom.h> #include <linux/tick.h> +#include <linux/rcupdate_trace.h> #include "rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ @@ -102,6 +115,9 @@ torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_gp_kthread, 0, + "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -665,6 +681,11 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); } +static void synchronize_rcu_mult_test(void) +{ + synchronize_rcu_mult(call_rcu_tasks, call_rcu); +} + static struct rcu_torture_ops tasks_ops = { .ttype = RCU_TASKS_FLAVOR, .init = rcu_sync_torture_init, @@ -674,7 +695,7 @@ static struct rcu_torture_ops tasks_ops = { .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .fqs = NULL, @@ -725,6 +746,72 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +/* + * Definitions for rude RCU-tasks torture testing. + */ + +static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_rude_ops = { + .ttype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_rude_torture_deferred_free, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .call = call_rcu_tasks_rude, + .cb_barrier = rcu_barrier_tasks_rude, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks-rude" +}; + +/* + * Definitions for tracing RCU-tasks torture testing. + */ + +static int tasks_tracing_torture_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_tracing_torture_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_tracing_ops = { + .ttype = RCU_TASKS_TRACING_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_tracing_torture_read_lock, + .read_delay = srcu_read_delay, /* just reuse srcu's version. */ + .readunlock = tasks_tracing_torture_read_unlock, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_tracing_torture_deferred_free, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .call = call_rcu_tasks_trace, + .cb_barrier = rcu_barrier_tasks_trace, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .slow_gps = 1, + .name = "tasks-tracing" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -734,7 +821,7 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) static bool __maybe_unused torturing_tasks(void) { - return cur_ops == &tasks_ops; + return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops; } /* @@ -833,7 +920,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) @@ -843,7 +930,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { + while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ @@ -914,7 +1001,7 @@ rcu_torture_fqs(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { schedule_timeout_interruptible(1); } @@ -1147,6 +1234,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + unsigned long flags; int idxnew = -1; int idxold = *readstate; int statesnew = ~*readstate & newstate; @@ -1181,8 +1269,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_SCHED) rcu_read_unlock_sched(); - if (statesold & RCUTORTURE_RDR_RCU) + if (statesold & RCUTORTURE_RDR_RCU) { + bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + + if (lockit) + raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + if (lockit) + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -1283,6 +1378,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(srcu_ctlp) || + rcu_read_lock_trace_held() || torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ @@ -1444,9 +1540,9 @@ rcu_torture_stats_print(void) atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1536,6 +1632,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1544,6 +1641,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1599,6 +1697,7 @@ static int rcutorture_booster_init(unsigned int cpu) */ static int rcu_torture_stall(void *args) { + int idx; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); @@ -1607,26 +1706,37 @@ static int rcu_torture_stall(void *args) schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - rcu_read_lock(); + idx = cur_ops->readlock(); if (stall_cpu_irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); + raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ + if (stall_cpu_block) + schedule_timeout_uninterruptible(HZ); if (stall_cpu_irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); + cur_ops->readunlock(idx); } + pr_alert("rcu_torture_stall end.\n"); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -1636,7 +1746,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } @@ -1692,8 +1802,8 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd *rcu_fwds; -bool rcu_fwd_emergency_stop; +static struct rcu_fwd *rcu_fwds; +static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { @@ -2400,7 +2510,8 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &trivial_ops, + &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, + &tasks_tracing_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0c71505f0e19..6d3ef700fb0e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,6 +29,19 @@ #include "rcu.h" #include "rcu_segcblist.h" +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; @@ -1268,8 +1281,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1277,8 +1290,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h new file mode 100644 index 000000000000..ce23f6cc5043 --- /dev/null +++ b/kernel/rcu/tasks.h @@ -0,0 +1,1193 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Task-based RCU implementations. + * + * Copyright (C) 2020 Paul E. McKenney + */ + +#ifdef CONFIG_TASKS_RCU_GENERIC + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); +typedef void (*pregp_func_t)(void); +typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); +typedef void (*postscan_func_t)(struct list_head *hop); +typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); +typedef void (*postgp_func_t)(struct rcu_tasks *rtp); + +/** + * Definition for a Tasks-RCU-like mechanism. + * @cbs_head: Head of callback list. + * @cbs_tail: Tail pointer for callback list. + * @cbs_wq: Wait queue allowning new callback to get kthread's attention. + * @cbs_lock: Lock protecting callback list. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @gp_func: This flavor's grace-period-wait function. + * @gp_state: Grace period's most recent state transition (debugging). + * @gp_jiffies: Time of last @gp_state transition. + * @gp_start: Most recent grace-period start in jiffies. + * @n_gps: Number of grace periods completed since boot. + * @n_ipis: Number of IPIs sent to encourage grace periods to end. + * @n_ipis_fails: Number of IPI-send failures. + * @pregp_func: This flavor's pre-grace-period function (optional). + * @pertask_func: This flavor's per-task scan function (optional). + * @postscan_func: This flavor's post-task scan function (optional). + * @holdout_func: This flavor's holdout-list scan function (optional). + * @postgp_func: This flavor's post-grace-period function (optional). + * @call_func: This flavor's call_rcu()-equivalent function. + * @name: This flavor's textual name. + * @kname: This flavor's kthread name. + */ +struct rcu_tasks { + struct rcu_head *cbs_head; + struct rcu_head **cbs_tail; + struct wait_queue_head cbs_wq; + raw_spinlock_t cbs_lock; + int gp_state; + unsigned long gp_jiffies; + unsigned long gp_start; + unsigned long n_gps; + unsigned long n_ipis; + unsigned long n_ipis_fails; + struct task_struct *kthread_ptr; + rcu_tasks_gp_func_t gp_func; + pregp_func_t pregp_func; + pertask_func_t pertask_func; + postscan_func_t postscan_func; + holdouts_func_t holdouts_func; + postgp_func_t postgp_func; + call_rcu_func_t call_func; + char *name; + char *kname; +}; + +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_tail = &rt_name.cbs_head, \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ + .gp_func = gp, \ + .call_func = call, \ + .name = n, \ + .kname = #rt_name, \ +} + +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); + +/* Avoid IPIing CPUs early in the grace period. */ +#define RCU_TASK_IPI_DELAY (HZ / 2) +static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; +module_param(rcu_task_ipi_delay, int, 0644); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; +module_param(rcu_task_stall_timeout, int, 0644); + +/* RCU tasks grace-period state for debugging. */ +#define RTGS_INIT 0 +#define RTGS_WAIT_WAIT_CBS 1 +#define RTGS_WAIT_GP 2 +#define RTGS_PRE_WAIT_GP 3 +#define RTGS_SCAN_TASKLIST 4 +#define RTGS_POST_SCAN_TASKLIST 5 +#define RTGS_WAIT_SCAN_HOLDOUTS 6 +#define RTGS_SCAN_HOLDOUTS 7 +#define RTGS_POST_GP 8 +#define RTGS_WAIT_READERS 9 +#define RTGS_INVOKE_CBS 10 +#define RTGS_WAIT_CBS 11 +static const char * const rcu_tasks_gp_state_names[] = { + "RTGS_INIT", + "RTGS_WAIT_WAIT_CBS", + "RTGS_WAIT_GP", + "RTGS_PRE_WAIT_GP", + "RTGS_SCAN_TASKLIST", + "RTGS_POST_SCAN_TASKLIST", + "RTGS_WAIT_SCAN_HOLDOUTS", + "RTGS_SCAN_HOLDOUTS", + "RTGS_POST_GP", + "RTGS_WAIT_READERS", + "RTGS_INVOKE_CBS", + "RTGS_WAIT_CBS", +}; + +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +/* Record grace-period phase and time. */ +static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) +{ + rtp->gp_state = newstate; + rtp->gp_jiffies = jiffies; +} + +/* Return state name. */ +static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) +{ + int i = data_race(rtp->gp_state); // Let KCSAN detect update races + int j = READ_ONCE(i); // Prevent the compiler from reading twice + + if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names)) + return "???"; + return rcu_tasks_gp_state_names[j]; +} + +// Enqueue a callback for the specified flavor of Tasks RCU. +static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) +{ + unsigned long flags; + bool needwake; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + needwake = !rtp->cbs_head; + WRITE_ONCE(*rtp->cbs_tail, rhp); + rtp->cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + /* We can't create the thread unless interrupts are enabled. */ + if (needwake && READ_ONCE(rtp->kthread_ptr)) + wake_up(&rtp->cbs_wq); +} + +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(rtp->call_func); +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_tasks *rtp = arg; + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current, HK_FLAG_RCU); + WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + smp_mb__after_spinlock(); // Order updates vs. GP. + list = rtp->cbs_head; + rtp->cbs_head = NULL; + rtp->cbs_tail = &rtp->cbs_head; + raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + + /* If there were none, wait a bit and start over. */ + if (!list) { + wait_event_interruptible(rtp->cbs_wq, + READ_ONCE(rtp->cbs_head)); + if (!rtp->cbs_head) { + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); + schedule_timeout_interruptible(HZ/10); + } + continue; + } + + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rtp->gp_func(rtp); + rtp->n_gps++; + + /* Invoke the callbacks. */ + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + /* Paranoid sleep to keep this from entering a tight loop */ + schedule_timeout_uninterruptible(HZ/10); + + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + } +} + +/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) +{ + struct task_struct *t; + + t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name)) + return; + smp_mb(); /* Ensure others see full kthread. */ +} + +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + +/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ +static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) +{ + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + rtp->kname, + tasks_gp_state_getname(rtp), data_race(rtp->gp_state), + jiffies - data_race(rtp->gp_jiffies), + data_race(rtp->n_gps), + data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), + ".k"[!!data_race(rtp->kthread_ptr)], + ".C"[!!data_race(rtp->cbs_head)], + s); +} + +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + +//////////////////////////////////////////////////////////////////////// +// +// Shared code between task-list-scanning variants of Tasks RCU. + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g, *t; + unsigned long lastreport; + LIST_HEAD(holdouts); + int fract; + + set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); + rtp->pregp_func(); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in holdouts. + */ + set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + + set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); + rtp->postscan_func(&holdouts); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ + fract = 10; + + for (;;) { + bool firstreport; + bool needreport; + int rtst; + + if (list_empty(&holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); + rtp->holdouts_func(&holdouts, needreport, &firstreport); + } + + set_tasks_gp_state(rtp, RTGS_POST_GP); + rtp->postgp_func(rtp); +} + +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. + +/* Pre-grace-period preparation. */ +static void rcu_tasks_pregp_step(void) +{ + /* + * Wait for all pre-existing t->on_rq and t->nvcsw transitions + * to complete. Invoking synchronize_rcu() suffices because all + * these transitions occur with interrupts disabled. Without this + * synchronize_rcu(), a read-side critical section that started + * before the grace period might be incorrectly seen as having + * started after the grace period. + * + * This synchronize_rcu() also dispenses with the need for a + * memory barrier on the first store to t->rcu_tasks_holdout, + * as it forces the store to happen after the beginning of the + * grace period. + */ + synchronize_rcu(); +} + +/* Per-task initial processing. */ +static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) +{ + if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); + WRITE_ONCE(t->rcu_tasks_holdout, true); + list_add(&t->rcu_tasks_holdout_list, hop); + } +} + +/* Processing between scanning taskslist and draining the holdout list. */ +void rcu_tasks_postscan(struct list_head *hop) +{ + /* + * Wait for tasks that are in the process of exiting. This + * does only part of the job, ensuring that all tasks that were + * previously exiting reach the point where they have disabled + * preemption, allowing the later synchronize_rcu() to finish + * the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); +} + +/* See if tasks are still holding out, complain if so. */ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) +{ + int cpu; + + if (!READ_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || + !READ_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + WRITE_ONCE(t->rcu_tasks_holdout, false); + list_del_init(&t->rcu_tasks_holdout_list); + put_task_struct(t); + return; + } + rcu_request_urgent_qs_task(t); + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); + sched_show_task(t); +} + +/* Scan the holdout lists for tasks no longer holding out. */ +static void check_all_holdout_tasks(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *t, *t1; + + list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) { + check_holdout_task(t, needreport, firstreport); + cond_resched(); + } +} + +/* Finish off the Tasks-RCU grace period. */ +static void rcu_tasks_postgp(struct rcu_tasks *rtp) +{ + /* + * Because ->on_rq and ->nvcsw are not guaranteed to have a full + * memory barriers prior to them in the schedule() path, memory + * reordering on other CPUs could cause their RCU-tasks read-side + * critical sections to extend past the end of the grace period. + * However, because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_rcu() to force the + * needed ordering on all such CPUs. + * + * This synchronize_rcu() also confines all ->rcu_tasks_holdout + * accesses to be within the grace period, avoiding the need for + * memory barriers for ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_rcu() waits for exiting tasks + * to complete their final preempt_disable() region of execution, + * cleaning up after the synchronize_srcu() above. + */ + synchronize_rcu(); +} + +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, + * or transition to usermode execution. As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_tasks.pregp_func = rcu_tasks_pregp_step; + rcu_tasks.pertask_func = rcu_tasks_pertask; + rcu_tasks.postscan_func = rcu_tasks_postscan; + rcu_tasks.holdouts_func = check_all_holdout_tasks; + rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} +core_initcall(rcu_spawn_tasks_kthread); + +static void show_rcu_tasks_classic_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); +} + +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +{ + struct task_struct *t = current; + + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + preempt_enable(); + exit_tasks_rcu_finish_trace(t); +} + +#else /* #ifdef CONFIG_TASKS_RCU */ +static void show_rcu_tasks_classic_gp_kthread(void) { } +void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_RUDE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of +// passing an empty function to schedule_on_each_cpu(). This approach +// provides an asynchronous call_rcu_tasks_rude() API and batching +// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// This sends IPIs far and wide and induces otherwise unnecessary context +// switches on all online CPUs, whether idle or not. + +// Empty function to allow workqueues to force a context switch. +static void rcu_tasks_be_rude(struct work_struct *work) +{ +} + +// Wait for one rude RCU-tasks grace period. +static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) +{ + rtp->n_ipis += cpumask_weight(cpu_online_mask); + schedule_on_each_cpu(rcu_tasks_be_rude); +} + +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, + "RCU Tasks Rude"); + +/** + * call_rcu_tasks_rude() - Queue a callback rude task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_rude() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); + +/** + * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period + * + * Control will return to the caller some time after a rude rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +/** + * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_rude(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_rude(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} +core_initcall(rcu_spawn_tasks_rude_kthread); + +static void show_rcu_tasks_rude_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); +} + +#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ +static void show_rcu_tasks_rude_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instruction, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. The grace-period code can send IPIs to +// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. +// It is necessary to scan the full tasklist, much as for Tasks RCU. There +// is a single callback queue guarded by a single lock, again, much as for +// Tasks RCU. If needed, these downsides can be at least partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +atomic_t trc_n_readers_need_end; // Number of waited-for readers. +DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +// The number of detections of task quiescent state relying on +// heavyweight readers executing explicit memory barriers. +unsigned long n_heavy_reader_attempts; +unsigned long n_heavy_reader_updates; +unsigned long n_heavy_reader_ofl_updates; + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/* + * This irq_work handler allows rcu_read_unlock_trace() to be invoked + * while the scheduler locks are held. + */ +static void rcu_read_unlock_iw(struct irq_work *iwp) +{ + wake_up(&trc_wait); +} +static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); + +/* If we are the last reader, wake up the grace-period kthread. */ +void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) +{ + int nq = t->trc_reader_special.b.need_qs; + + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && + t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. + if (nq) + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_nesting, nesting); + if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) + irq_work_queue(&rcu_tasks_trace_iw); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; // Already on holdout list, so will check later. + } + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + if (likely(!t->trc_reader_nesting)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + // Mark as checked after decrement to avoid false + // positives on the above WARN_ON_ONCE(). + WRITE_ONCE(t->trc_reader_checked, true); + goto reset_ipi; + } + WRITE_ONCE(t->trc_reader_checked, true); + + // Get here if the task is in a read-side critical section. Set + // its state so that it will awaken the grace-period kthread upon + // exit from that critical section. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static bool trc_inspect_reader(struct task_struct *t, void *arg) +{ + int cpu = task_cpu(t); + bool in_qs = false; + bool ofl = cpu_is_offline(cpu); + + if (task_curr(t)) { + WARN_ON_ONCE(ofl & !is_idle_task(t)); + + // If no chance of heavyweight readers, do it the hard way. + if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + return false; + + // If heavyweight readers are enabled on the remote task, + // we can inspect its state despite its currently running. + // However, we cannot safely change its state. + n_heavy_reader_attempts++; + if (!ofl && // Check for "running" idle tasks on offline CPUs. + !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + return false; // No quiescent state, do it the hard way. + n_heavy_reader_updates++; + if (ofl) + n_heavy_reader_ofl_updates++; + in_qs = true; + } else { + in_qs = likely(!t->trc_reader_nesting); + } + + // Mark as checked. Because this is called from the grace-period + // kthread, also remove the task from the holdout list. + t->trc_reader_checked = true; + trc_del_holdout(t); + + if (in_qs) + return true; // Already in quiescent state, done!!! + + // The task is in a read-side critical section, so set up its + // state so that it will awaken the grace-period kthread upon exit + // from that critical section. + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + return true; +} + +/* Attempt to extract the state for the specified task. */ +static void trc_wait_for_one_reader(struct task_struct *t, + struct list_head *bhp) +{ + int cpu; + + // If a previous IPI is still in flight, let it complete. + if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI + return; + + // The current task had better be in a quiescent state. + if (t == current) { + t->trc_reader_checked = true; + trc_del_holdout(t); + WARN_ON_ONCE(t->trc_reader_nesting); + return; + } + + // Attempt to nail down the task for inspection. + get_task_struct(t); + if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + put_task_struct(t); + return; + } + put_task_struct(t); + + // If currently running, send an IPI, either way, add to list. + trc_add_holdout(t, bhp); + if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + // The task is currently running, so try IPIing it. + cpu = task_cpu(t); + + // If there is already an IPI outstanding, let it happen. + if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) + return; + + atomic_inc(&trc_n_readers_need_end); + per_cpu(trc_ipi_to_cpu, cpu) = true; + t->trc_ipi_to_cpu = cpu; + rcu_tasks_trace.n_ipis++; + if (smp_call_function_single(cpu, + trc_read_check_handler, t, 0)) { + // Just in case there is some other reason for + // failure than the target CPU being offline. + rcu_tasks_trace.n_ipis_fails++; + per_cpu(trc_ipi_to_cpu, cpu) = false; + t->trc_ipi_to_cpu = cpu; + if (atomic_dec_and_test(&trc_n_readers_need_end)) { + WARN_ON_ONCE(1); + wake_up(&trc_wait); + } + } + } +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // Allow for fast-acting IPIs. + atomic_set(&trc_n_readers_need_end, 1); + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, + struct list_head *hop) +{ + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + WRITE_ONCE(t->trc_reader_checked, false); + t->trc_ipi_to_cpu = -1; + trc_wait_for_one_reader(t, hop); +} + +/* + * Do intermediate processing between task and holdout scans and + * pick up the idle tasks. + */ +static void rcu_tasks_trace_postscan(struct list_head *hop) +{ + int cpu; + + for_each_possible_cpu(cpu) + rcu_tasks_trace_pertask(idle_task(cpu), hop); + + // Re-enable CPU hotplug now that the tasklist scan has completed. + cpus_read_unlock(); + + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + synchronize_rcu(); + // Any tasks that exit after this point will set ->trc_reader_checked. +} + +/* Show the state of a task stalling the current RCU tasks trace GP. */ +static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) +{ + int cpu; + + if (*firstreport) { + pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); + *firstreport = false; + } + // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). + cpu = task_cpu(t); + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], + ".i"[is_idle_task(t)], + ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], + t->trc_reader_nesting, + " N"[!!t->trc_reader_special.b.need_qs], + cpu); + sched_show_task(t); +} + +/* List stalled IPIs for RCU tasks trace. */ +static void show_stalled_ipi_trace(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + if (per_cpu(trc_ipi_to_cpu, cpu)) + pr_alert("\tIPI outstanding to CPU %d\n", cpu); +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *g, *t; + + // Disable CPU hotplug across the holdout list scan. + cpus_read_lock(); + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !READ_ONCE(t->trc_reader_checked)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (READ_ONCE(t->trc_reader_checked)) + trc_del_holdout(t); + else if (needreport) + show_stalled_task_trace(t, firstreport); + } + + // Re-enable CPU hotplug now that the holdout list scan has completed. + cpus_read_unlock(); + + if (needreport) { + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); + show_stalled_ipi_trace(); + } +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) +{ + bool firstreport; + struct task_struct *g, *t; + LIST_HEAD(holdouts); + long ret; + + // Remove the safety count. + smp_mb__before_atomic(); // Order vs. earlier atomics + atomic_dec(&trc_n_readers_need_end); + smp_mb__after_atomic(); // Order vs. later atomics + + // Wait for readers. + set_tasks_gp_state(rtp, RTGS_WAIT_READERS); + for (;;) { + ret = wait_event_idle_exclusive_timeout( + trc_wait, + atomic_read(&trc_n_readers_need_end) == 0, + READ_ONCE(rcu_task_stall_timeout)); + if (ret) + break; // Count reached zero. + // Stall warning time, so make a list of the offenders. + for_each_process_thread(g, t) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) + trc_add_holdout(t, &holdouts); + firstreport = true; + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) + if (READ_ONCE(t->trc_reader_special.b.need_qs)) { + show_stalled_task_trace(t, &firstreport); + trc_del_holdout(t); + } + if (firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); + show_stalled_ipi_trace(); + pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); + } + smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. +} + +/* Report any needed quiescent state for this exiting task. */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) +{ + WRITE_ONCE(t->trc_reader_checked, true); + WARN_ON_ONCE(t->trc_reader_nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); + if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + rcu_read_unlock_trace_special(t, 0); +} + +/** + * call_rcu_tasks_trace() - Queue a callback trace task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks_trace() + * assumes that the read-side critical sections end at context switch, + * cond_resched_rcu_qs(), or transition to usermode execution. As such, + * there are no read-side primitives analogous to rcu_read_lock() and + * rcu_read_unlock() because this primitive is intended to determine + * that all tasks have passed through a safe state, not so much for + * data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) +{ + call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); + +/** + * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period + * + * Control will return to the caller some time after a trace rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, + * anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_trace() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_trace(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); + synchronize_rcu_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); + +/** + * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks_trace(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks_trace(); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} +core_initcall(rcu_spawn_tasks_trace_kthread); + +static void show_rcu_tasks_trace_gp_kthread(void) +{ + char buf[64]; + + sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + data_race(n_heavy_reader_ofl_updates), + data_race(n_heavy_reader_updates), + data_race(n_heavy_reader_attempts)); + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); +} + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +static inline void show_rcu_tasks_trace_gp_kthread(void) {} +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ + +void show_rcu_tasks_gp_kthreads(void) +{ + show_rcu_tasks_classic_gp_kthread(); + show_rcu_tasks_rude_gp_kthread(); + show_rcu_tasks_trace_gp_kthread(); +} + +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void rcu_tasks_bootup_oddness(void) {} +void show_rcu_tasks_gp_kthreads(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9a49cd6065a..c716eadc7617 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -67,6 +67,19 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Data structures. */ /* @@ -75,9 +88,6 @@ */ #define RCU_DYNTICK_CTRL_MASK 0x1 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, @@ -100,7 +110,7 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = 1; +static bool use_softirq = true; module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; @@ -225,9 +235,11 @@ void rcu_softirq_qs(void) /* * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state. + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. */ -static void rcu_dynticks_eqs_enter(void) +static noinstr void rcu_dynticks_eqs_enter(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -237,8 +249,9 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - /* Better be in an extended quiescent state! */ + // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); /* Better not have special action (TLB flush) pending! */ @@ -248,9 +261,10 @@ static void rcu_dynticks_eqs_enter(void) /* * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state. + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. */ -static void rcu_dynticks_eqs_exit(void) +static noinstr void rcu_dynticks_eqs_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; @@ -261,13 +275,13 @@ static void rcu_dynticks_eqs_exit(void) * critical section. */ seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ - /* Prefer duplicate flushes to losing a flush. */ - rcu_eqs_special_exit(); } } @@ -295,7 +309,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -static bool rcu_dynticks_curr_cpu_in_eqs(void) +static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -333,6 +347,28 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) } /* + * Return true if the referenced integer is zero while the specified + * CPU remains within a single extended quiescent state. + */ +bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int snap; + + // If not quiescent, force back to earlier extended quiescent state. + snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | + RCU_DYNTICK_CTRL_CTR); + + smp_rmb(); // Order ->dynticks and *vp reads. + if (READ_ONCE(*vp)) + return false; // Non-zero, so report failure; + smp_rmb(); // Order *vp read and ->dynticks re-read. + + // If still in the same extended quiescent state, we are good! + return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); +} + +/* * Set the special (bottom) bit of the specified CPU so that it * will take special action (such as flushing its TLB) on the * next exit from an extended quiescent state. Returns true if @@ -382,16 +418,23 @@ void rcu_momentary_dyntick_idle(void) EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); /** - * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle * * If the current CPU is idle and running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. + * interrupt, or directly, from idle, return true. + * + * The caller must have at least disabled IRQs. */ static int rcu_is_cpu_rrupt_from_idle(void) { - /* Called only from within the scheduling-clock interrupt */ - lockdep_assert_in_irq(); + long nesting; + + /* + * Usually called from the tick; but also used from smp_function_call() + * for expedited grace periods. This latter can result in running from + * the idle task, instead of an actual IPI. + */ + lockdep_assert_irqs_disabled(); /* Check for counter underflows */ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, @@ -400,9 +443,15 @@ static int rcu_is_cpu_rrupt_from_idle(void) "RCU dynticks_nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); + if (nesting > 1) return false; + /* + * If we're not in an interrupt, we must be in the idle task! + */ + WARN_ON_ONCE(!nesting && !is_idle_task(current)); + /* Does CPU appear to be idle from an RCU standpoint? */ return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } @@ -562,7 +611,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter(bool user) +static noinstr void rcu_eqs_enter(bool user) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -571,19 +620,24 @@ static void rcu_eqs_enter(bool user) WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdp->dynticks_nesting == 0); if (rdp->dynticks_nesting != 1) { + // RCU will still be watching, so just do accounting and leave. rdp->dynticks_nesting--; return; } lockdep_assert_irqs_disabled(); + instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. rcu_dynticks_task_enter(); } @@ -616,23 +670,25 @@ void rcu_idle_enter(void) * If you add or remove a call to rcu_user_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_enter(void) +noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ -/* +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * * If we are returning from the outermost NMI handler that interrupted an * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit_common(), be sure to test + * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_exit_common(bool irq) +noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -649,38 +705,33 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { + instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); + instrumentation_end(); return; } + instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (irq) + if (!in_nmi()) rcu_prepare_for_idle(); + instrumentation_end(); + // RCU is watching here ... rcu_dynticks_eqs_enter(); + // ... but is no longer watching here. - if (irq) + if (!in_nmi()) rcu_dynticks_task_enter(); } /** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_exit(void) -{ - rcu_nmi_exit_common(false); -} - -/** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * * Exit from an interrupt handler, which might possibly result in entering @@ -699,12 +750,52 @@ void rcu_nmi_exit(void) * If you add or remove a call to rcu_irq_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_exit(void) +void noinstr rcu_irq_exit(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nmi_exit(); +} + +/** + * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq + * towards in kernel preemption + * + * Same as rcu_irq_exit() but has a sanity check that scheduling is safe + * from RCU point of view. Invoked from return from interrupt before kernel + * preemption. + */ +void rcu_irq_exit_preempt(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_exit_common(true); + rcu_nmi_exit(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); } +#ifdef CONFIG_PROVE_RCU +/** + * rcu_irq_exit_check_preempt - Validate that scheduling is possible + */ +void rcu_irq_exit_check_preempt(void) +{ + lockdep_assert_irqs_disabled(); + + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + "RCU dynticks_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + DYNTICK_IRQ_NONIDLE, + "Bad RCU dynticks_nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "RCU in extended quiescent state!"); +} +#endif /* #ifdef CONFIG_PROVE_RCU */ + /* * Wrapper for rcu_irq_exit() where interrupts are enabled. * @@ -728,7 +819,7 @@ void rcu_irq_exit_irqson(void) * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ -static void rcu_eqs_exit(bool user) +static void noinstr rcu_eqs_exit(bool user) { struct rcu_data *rdp; long oldval; @@ -738,17 +829,22 @@ static void rcu_eqs_exit(bool user) oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { + // RCU was already watching, so just do accounting and leave. rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. + instrumentation_begin(); rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); } /** @@ -779,14 +875,75 @@ void rcu_idle_exit(void) * If you add or remove a call to rcu_user_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_user_exit(void) +void noinstr rcu_user_exit(void) { rcu_eqs_exit(1); } + +/** + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. + * + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, include RCU CPU + * stall warnings. + * + * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU. + * + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. + */ +void __rcu_irq_enter_check_tick(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + // Enabling the tick is unsafe in NMI handlers. + if (WARN_ON_ONCE(in_nmi())) + return; + + RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); + + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; + } + + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. + // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! + WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); +} #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_nmi_enter_common - inform RCU of entry to NMI context + * rcu_nmi_enter - inform RCU of entry to NMI context * @irq: Is this call from rcu_irq_enter? * * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and @@ -795,10 +952,10 @@ void rcu_user_exit(void) * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter_common(), be sure to test + * If you add or remove a call to rcu_nmi_enter(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -static __always_inline void rcu_nmi_enter_common(bool irq) +noinstr void rcu_nmi_enter(void) { long incby = 2; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -816,45 +973,33 @@ static __always_inline void rcu_nmi_enter_common(bool irq) */ if (rcu_dynticks_curr_cpu_in_eqs()) { - if (irq) + if (!in_nmi()) rcu_dynticks_task_exit(); + // RCU is not watching here ... rcu_dynticks_eqs_exit(); + // ... but is watching here. - if (irq) + if (!in_nmi()) rcu_cleanup_after_idle(); incby = 1; - } else if (irq && tick_nohz_full_cpu(rdp->cpu) && - rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && - READ_ONCE(rdp->rcu_urgent_qs) && - !READ_ONCE(rdp->rcu_forced_tick)) { - raw_spin_lock_rcu_node(rdp->mynode); - // Recheck under lock. - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { - WRITE_ONCE(rdp->rcu_forced_tick, true); - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); - } - raw_spin_unlock_rcu_node(rdp->mynode); + } else if (!in_nmi()) { + instrumentation_begin(); + rcu_irq_enter_check_tick(); + instrumentation_end(); } + instrumentation_begin(); trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); } /** - * rcu_nmi_enter - inform RCU of entry to NMI context - */ -void rcu_nmi_enter(void) -{ - rcu_nmi_enter_common(false); -} -NOKPROBE_SYMBOL(rcu_nmi_enter); - -/** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * * Enter an interrupt handler, which might possibly result in exiting @@ -876,10 +1021,10 @@ NOKPROBE_SYMBOL(rcu_nmi_enter); * If you add or remove a call to rcu_irq_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_irq_enter(void) +noinstr void rcu_irq_enter(void) { lockdep_assert_irqs_disabled(); - rcu_nmi_enter_common(true); + rcu_nmi_enter(); } /* @@ -913,6 +1058,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } +noinstr bool __rcu_is_watching(void) +{ + return !rcu_dynticks_curr_cpu_in_eqs(); +} + /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -921,7 +1071,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. */ -bool notrace rcu_is_watching(void) +bool rcu_is_watching(void) { bool ret; @@ -973,12 +1123,12 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi() || !rcu_scheduler_fully_active) return true; - preempt_disable(); + preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ret = true; - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); @@ -1217,7 +1367,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1473,6 +1623,31 @@ static void rcu_gp_slow(int delay) schedule_timeout_uninterruptible(delay); } +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. */ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_uninterruptible(duration); + pr_alert("%s: Wait complete\n", __func__); + } +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1506,6 +1681,7 @@ static bool rcu_gp_init(void) record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1611,12 +1787,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. + if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1654,13 +1834,15 @@ static void rcu_gp_fqs(bool first_time) static void rcu_gp_fqs_loop(void) { bool first_gp_fqs; - int gf; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { if (!ret) { @@ -1673,6 +1855,7 @@ static void rcu_gp_fqs_loop(void) rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ @@ -1680,12 +1863,16 @@ static void rcu_gp_fqs_loop(void) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; + } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); @@ -1705,6 +1892,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } @@ -1781,6 +1969,7 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); @@ -1821,6 +2010,7 @@ static int __noreturn rcu_gp_kthread(void *unused) swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init()) @@ -2811,6 +3001,8 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; + // Number of objects for which GP not started + int count; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); @@ -2924,6 +3116,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } + WRITE_ONCE(krcp->count, 0); + /* * One work is per one batch, so there are two "free channels", * "bhead_free" and "head_free" the batch can handle. It can be @@ -3060,6 +3254,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } + WRITE_ONCE(krcp->count, krcp->count + 1); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !krcp->monitor_todo) { @@ -3074,6 +3270,56 @@ unlock_return: } EXPORT_SYMBOL_GPL(kfree_call_rcu); +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += READ_ONCE(krcp->count); + } + + return count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + unsigned long flags; + + for_each_online_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krcp->count; + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed; +} + +static struct shrinker kfree_rcu_shrinker = { + .count_objects = kfree_rcu_shrink_count, + .scan_objects = kfree_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; + void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -3599,6 +3845,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -3994,6 +4241,8 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } + if (register_shrinker(&kfree_rcu_shrinker)) + pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9dc2ec021da5..43991a40b084 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -359,6 +359,7 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ @@ -454,6 +455,8 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +static void rcu_dynticks_task_trace_enter(void); +static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1a617b9dffb0..72952edad1e4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. */ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -639,6 +639,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) */ static void rcu_exp_handler(void *unused) { + int depth = rcu_preempt_depth(); unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -649,7 +650,7 @@ static void rcu_exp_handler(void *unused) * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!rcu_preempt_depth()) { + if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); @@ -673,7 +674,7 @@ static void rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (rcu_preempt_depth() > 0) { + if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->exp_deferred_qs = true; @@ -683,30 +684,8 @@ static void rcu_exp_handler(void *unused) return; } - /* - * The final and least likely case is where the interrupted - * code was just about to or just finished exiting the RCU-preempt - * read-side critical section, and no, we can't tell which. - * So either way, set ->deferred_qs to flag later code that - * a quiescent state is required. - * - * If the CPU is fully enabled (or if some buggy RCU-preempt - * read-side critical section is being used from idle), just - * invoke rcu_preempt_deferred_qs() to immediately report the - * quiescent state. We cannot use rcu_read_unlock_special() - * because we are in an interrupt handler, which will cause that - * function to take an early exit without doing anything. - * - * Otherwise, force a context switch after the CPU enables everything. - */ - rdp->exp_deferred_qs = true; - if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { - rcu_preempt_deferred_qs(t); - } else { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + // Finally, negative nesting depth should not happen. + WARN_ON_ONCE(1); } /* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ @@ -721,17 +700,20 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; - if (!rnp->exp_tasks) + if (!READ_ONCE(rnp->exp_tasks)) return 0; + raw_spin_lock_irqsave_rcu_node(rnp, flags); t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 097635c41135..352223664ebd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -331,6 +331,7 @@ void rcu_note_context_switch(bool preempt) rcu_qs(); if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -345,9 +346,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return READ_ONCE(rnp->gp_tasks) != NULL; } -/* Bias and limit values for ->rcu_read_lock_nesting. */ -#define RCU_NEST_BIAS INT_MAX -#define RCU_NEST_NMAX (-INT_MAX / 2) +/* limit value for ->rcu_read_lock_nesting. */ #define RCU_NEST_PMAX (INT_MAX / 2) static void rcu_preempt_read_enter(void) @@ -355,9 +354,9 @@ static void rcu_preempt_read_enter(void) current->rcu_read_lock_nesting++; } -static void rcu_preempt_read_exit(void) +static int rcu_preempt_read_exit(void) { - current->rcu_read_lock_nesting--; + return --current->rcu_read_lock_nesting; } static void rcu_preempt_depth_set(int val) @@ -390,21 +389,15 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (rcu_preempt_depth() != 1) { - rcu_preempt_read_exit(); - } else { + if (rcu_preempt_read_exit() == 0) { barrier(); /* critical section before exit code. */ - rcu_preempt_depth_set(-RCU_NEST_BIAS); - barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - rcu_preempt_depth_set(0); } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { int rrln = rcu_preempt_depth(); - WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -500,12 +493,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -556,7 +549,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && - rcu_preempt_depth() <= 0; + rcu_preempt_depth() == 0; } /* @@ -569,16 +562,11 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; - bool couldrecurse = rcu_preempt_depth() >= 0; if (!rcu_preempt_need_deferred_qs(t)) return; - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS); local_irq_save(flags); rcu_preempt_deferred_qs_irqrestore(t, flags); - if (couldrecurse) - rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS); } /* @@ -615,19 +603,18 @@ static void rcu_read_unlock_special(struct task_struct *t) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - tick_nohz_full_cpu(rdp->cpu); + exp = (t->rcu_blocked_node && + READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)); // Need to defer quiescent state until everything is enabled. - if (irqs_were_disabled && use_softirq && - (in_interrupt() || - (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { - // Using softirq, safe to awaken, and we get - // no help from enabling irqs, unlike bh/preempt. + if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + // Using softirq, safe to awaken, and either the + // wakeup is free or there is an expedited GP. raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting or NO_HZ_FULL, slow is OK. + // Also if no expediting, slow is OK. + // Plus nohz_full CPUs eventually get tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && @@ -640,7 +627,6 @@ static void rcu_read_unlock_special(struct task_struct *t) irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } - t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } @@ -699,7 +685,7 @@ static void rcu_flavor_sched_clock_irq(int user) } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!rcu_preempt_depth()) { + } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { rcu_qs(); /* Report immediate QS. */ return; } @@ -760,8 +746,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -854,8 +840,7 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_tasks_qs(current, preempt); out: trace_rcu_utilization(TPS("End context switch")); } @@ -1036,7 +1021,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); @@ -1079,9 +1065,9 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, READ_ONCE(rnp->boost_kthread_status)); @@ -2536,7 +2522,7 @@ static bool rcu_nohz_full_cpu(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; @@ -2553,7 +2539,7 @@ static void rcu_bind_gp_kthread(void) } /* Record the current task on dyntick-idle entry. */ -static void rcu_dynticks_task_enter(void) +static void noinstr rcu_dynticks_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); @@ -2561,9 +2547,27 @@ static void rcu_dynticks_task_enter(void) } /* Record no current task on dyntick-idle exit. */ -static void rcu_dynticks_task_exit(void) +static void noinstr rcu_dynticks_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ +static void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_RCU_TRACE + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +} diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 119ed6afd20f..ae76bd329582 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } @@ -192,14 +224,40 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +// Communicate task state back to the RCU CPU stall warning request. +struct rcu_stall_chk_rdr { + int nesting; + union rcu_special rs; + bool on_blkd_list; +}; + +/* + * Report out the state of a not-running task that is stalling the + * current RCU grace period. + */ +static bool check_slow_task(struct task_struct *t, void *arg) +{ + struct rcu_node *rnp; + struct rcu_stall_chk_rdr *rscrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + rscrp->nesting = t->rcu_read_lock_nesting; + rscrp->rs = t->rcu_read_unlock_special; + rnp = t->rcu_blocked_node; + rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); + return true; +} + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static int rcu_print_task_stall(struct rcu_node *rnp) { - struct task_struct *t; int ndetected = 0; + struct rcu_stall_chk_rdr rscr; + struct task_struct *t; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -208,7 +266,15 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); + if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + pr_cont(" P%d", t->pid); + else + pr_cont(" P%d/%d:%c%c%c%c", + t->pid, rscr.nesting, + ".b"[rscr.rs.b.blocked], + ".q"[rscr.rs.b.need_qs], + ".e"[rscr.rs.b.exp_hint], + ".l"[rscr.on_blkd_list]); ndetected++; } pr_cont("\n"); @@ -299,6 +365,16 @@ static const char *gp_state_getname(short gs) return gp_state_names[gs]; } +/* Is the RCU grace-period kthread being starved of CPU time? */ +static bool rcu_is_gp_kthread_starving(unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -313,6 +389,7 @@ static const char *gp_state_getname(short gs) static void print_cpu_stall_info(int cpu) { unsigned long delta; + bool falsepositive; char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; @@ -333,7 +410,9 @@ static void print_cpu_stall_info(int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + falsepositive = rcu_is_gp_kthread_starving(NULL) && + rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -345,8 +424,9 @@ static void print_cpu_stall_info(int cpu) rcu_dynticks_snap(rdp) & 0xfff, rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); + data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz, + falsepositive ? " (false positive?)" : ""); } /* Complain about starvation of grace-period kthread. */ @@ -355,15 +435,15 @@ static void rcu_check_gp_kthread_starvation(void) struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { + if (rcu_is_gp_kthread_starving(&j)) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - READ_ONCE(rcu_state.gp_flags), + data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); wake_up_process(gpk); @@ -371,7 +451,7 @@ static void rcu_check_gp_kthread_starvation(void) } } -static void print_other_cpu_stall(unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -408,7 +488,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + smp_processor_id(), (long)(jiffies - gps), (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { rcu_dump_cpu_stacks(); @@ -421,13 +501,11 @@ static void print_other_cpu_stall(unsigned long gp_seq) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); + gpa = data_race(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), + data_race(jiffies_till_next_fqs), rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ @@ -442,7 +520,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(void) +static void print_cpu_stall(unsigned long gps) { int cpu; unsigned long flags; @@ -467,7 +545,7 @@ static void print_cpu_stall(void) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, + jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); rcu_check_gp_kthread_starvation(); @@ -546,7 +624,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); + print_cpu_stall(gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); @@ -555,7 +633,7 @@ static void check_cpu_stall(struct rcu_data *rdp) cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); + print_other_cpu_stall(gs2, gps); if (rcu_cpu_stall_ftrace_dump) rcu_ftrace_dump(DUMP_ALL); } @@ -581,23 +659,23 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - READ_ONCE(rcu_state.gp_activity); - jr = j - READ_ONCE(rcu_state.gp_req_activity); - jw = j - READ_ONCE(rcu_state.gp_wake_time); + ja = j - data_race(rcu_state.gp_activity); + jr = j - data_race(rcu_state.gp_req_activity); + jw = j - data_race(rcu_state.gp_wake_time); pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, t ? t->state : 0x1ffffL, - ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), - (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rcu_get_root()->gp_seq_needed), - READ_ONCE(rcu_state.gp_flags)); + ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), + (long)data_race(rcu_state.gp_seq), + (long)data_race(rcu_get_root()->gp_seq_needed), + data_race(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", - rnp->grplo, rnp->grphi, (long)READ_ONCE(rnp->gp_seq), - (long)READ_ONCE(rnp->gp_seq_needed)); + rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq), + (long)data_race(rnp->gp_seq_needed)); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -607,7 +685,7 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)READ_ONCE(rdp->gp_seq_needed)); + cpu, (long)data_race(rdp->gp_seq_needed)); } } for_each_possible_cpu(cpu) { @@ -615,7 +693,7 @@ void show_rcu_gp_kthreads(void) if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } - /* sched_show_task(rcu_state.gp_kthread); */ + show_rcu_tasks_gp_kthreads(); } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 28a8bdc5072f..84843adfd939 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -41,6 +41,7 @@ #include <linux/sched/isolation.h> #include <linux/kprobes.h> #include <linux/slab.h> +#include <linux/irq_work.h> #define CREATE_TRACE_POINTS @@ -51,6 +52,19 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); @@ -63,12 +77,12 @@ module_param(rcu_normal_after_boot, int, 0); * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? * @ret: Best guess answer if lockdep cannot be relied on * - * Returns true if lockdep must be ignored, in which case *ret contains + * Returns true if lockdep must be ignored, in which case ``*ret`` contains * the best guess described below. Otherwise returns false, in which - * case *ret tells the caller nothing and the caller should instead + * case ``*ret`` tells the caller nothing and the caller should instead * consult lockdep. * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -82,7 +96,7 @@ module_param(rcu_normal_after_boot, int, 0); * * Note that if the CPU is in the idle loop from an RCU point of view (ie: * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) - * then rcu_read_lock_held() sets *ret to false even if the CPU did an + * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, * so such a CPU is effectively never in an RCU read-side critical section @@ -98,15 +112,15 @@ module_param(rcu_normal_after_boot, int, 0); static bool rcu_read_lock_held_common(bool *ret) { if (!debug_lockdep_rcu_enabled()) { - *ret = 1; + *ret = true; return true; } if (!rcu_is_watching()) { - *ret = 0; + *ret = false; return true; } if (!rcu_lockdep_current_cpu_online()) { - *ret = 0; + *ret = false; return true; } return false; @@ -270,13 +284,12 @@ struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); EXPORT_SYMBOL_GPL(rcu_callback_map); -int notrace debug_lockdep_rcu_enabled(void) +noinstr int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); -NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? @@ -501,370 +514,6 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); -#ifdef CONFIG_TASKS_RCU - -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ - -/* Global list of callbacks and associated lock. */ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); - -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); - -/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) -static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; -module_param(rcu_task_stall_timeout, int, 0644); - -static struct task_struct *rcu_tasks_kthread_ptr; - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) -{ - unsigned long flags; - bool needwake; - - rhp->next = NULL; - rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - WRITE_ONCE(*rcu_tasks_cbs_tail, rhp); - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks); - -/** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); -} - -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) -{ - unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; - struct rcu_head *list; - struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); - int fract; - - /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); - - /* - * Each pass through the following loop makes one check for - * newly arrived callbacks, and, if there are some, waits for - * one RCU-tasks grace period and then invokes the callbacks. - * This loop is terminated by the system going down. ;-) - */ - for (;;) { - - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - READ_ONCE(rcu_tasks_cbs_head)); - if (!rcu_tasks_cbs_head) { - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(HZ/10); - } - continue; - } - - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. - * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); - - /* Invoke the callbacks. */ - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); - } -} - -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) -{ - struct task_struct *t; - - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; - smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); - return 0; -} -core_initcall(rcu_spawn_tasks_kthread); - -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) -{ - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); - preempt_enable(); -} - -#endif /* #ifdef CONFIG_TASKS_RCU */ - -#ifndef CONFIG_TINY_RCU - -/* - * Print any non-default Tasks RCU settings. - */ -static void __init rcu_tasks_bootup_oddness(void) -{ -#ifdef CONFIG_TASKS_RCU - if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) - pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); -#endif /* #ifdef CONFIG_TASKS_RCU */ -} - -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_PROVE_RCU /* @@ -935,6 +584,8 @@ late_initcall(rcu_verify_early_boot_tests); void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#include "tasks.h" + #ifndef CONFIG_TINY_RCU /* diff --git a/kernel/reboot.c b/kernel/reboot.c index c4d472b7f1b4..491f1347bf43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -250,7 +250,7 @@ void kernel_restart(char *cmd) pr_emerg("Restarting system\n"); else pr_emerg("Restarting system with command '%s'\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -274,7 +274,7 @@ void kernel_halt(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("System halted\n"); - kmsg_dump(KMSG_DUMP_HALT); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); @@ -292,7 +292,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..90c7a002436d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) @@ -1177,10 +1177,9 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, - .release = relay_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .release = relay_pipe_buf_release, + .try_steal = generic_pipe_buf_try_steal, + .get = generic_pipe_buf_get, }; static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e89a042a48f..8298b2c240ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -11,6 +11,7 @@ #include <linux/nospec.h> #include <linux/kcov.h> +#include <linux/scs.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -20,6 +21,7 @@ #include "../smpboot.h" #include "pelt.h" +#include "smp.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -219,6 +221,13 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } +static inline void +rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) +{ + csd->flags = 0; + csd->func = func; + csd->info = rq; +} #ifdef CONFIG_SCHED_HRTICK /* @@ -314,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } @@ -632,29 +639,23 @@ void wake_up_nohz_cpu(int cpu) wake_up_idle_cpu(cpu); } -static inline bool got_nohz_idle_kick(void) +static void nohz_csd_func(void *info) { - int cpu = smp_processor_id(); - - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle_cpu(cpu) && !need_resched()) - return true; + struct rq *rq = info; + int cpu = cpu_of(rq); + unsigned int flags; /* - * We can't run Idle Load Balance on this CPU for this time so we - * cancel it and clear NOHZ_BALANCE_KICK + * Release the rq::nohz_csd. */ - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); - return false; -} - -#else /* CONFIG_NO_HZ_COMMON */ + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + WARN_ON(!(flags & NOHZ_KICK_MASK)); -static inline bool got_nohz_idle_kick(void) -{ - return false; + rq->idle_balance = idle_cpu(cpu); + if (rq->idle_balance && !need_resched()) { + rq->nohz_idle_balance = flags; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } } #endif /* CONFIG_NO_HZ_COMMON */ @@ -1538,7 +1539,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - sched_ttwu_pending(); + flush_smp_call_function_from_idle(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -2272,16 +2273,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -void sched_ttwu_pending(void) +void sched_ttwu_pending(void *arg) { + struct llist_node *llist = arg; struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p, *t; struct rq_flags rf; if (!llist) return; + /* + * rq::ttwu_pending racy indication of out-standing wakeups. + * Races such that false-negatives are possible, since they + * are shorter lived that false-positives would be. + */ + WRITE_ONCE(rq->ttwu_pending, 0); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -2291,56 +2299,30 @@ void sched_ttwu_pending(void) rq_unlock_irqrestore(rq, &rf); } -void scheduler_ipi(void) +void send_call_function_single_ipi(int cpu) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); + struct rq *rq = cpu_rq(cpu); - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); + if (!set_nr_if_polling(rq->idle)) + arch_send_call_function_single_ipi(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +/* + * Queue a task on the target CPUs wake_list and wake the CPU via IPI if + * necessary. The wakee CPU on receipt of the IPI will queue the task + * via sched_ttwu_wakeup() for activation so the wakee incurs the cost + * of the wakeup instead of the waker. + */ +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); - else - trace_sched_wake_idle_without_ipi(cpu); - } + WRITE_ONCE(rq->ttwu_pending, 1); + __smp_call_single_queue(cpu, &p->wake_entry); } void wake_up_if_idle(int cpu) @@ -2371,6 +2353,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } + +static inline bool ttwu_queue_cond(int cpu, int wake_flags) +{ + /* + * If the CPU does not share cache, then queue the task on the + * remote rqs wakelist to avoid accessing remote data. + */ + if (!cpus_share_cache(smp_processor_id(), cpu)) + return true; + + /* + * If the task is descheduling and the only running task on the + * CPU then use the wakelist to offload the task activation to + * the soon-to-be-idle CPU as the current CPU is likely busy. + * nr_running is checked to avoid unnecessary task stacking. + */ + if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + return true; + + return false; +} + +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ + __ttwu_queue_wakelist(p, cpu, wake_flags); + return true; + } + + return false; +} #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2379,11 +2393,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq_flags rf; #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - ttwu_queue_remote(p, cpu, wake_flags); + if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; - } #endif rq_lock(rq, &rf); @@ -2560,12 +2571,22 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto unlock; + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + #ifdef CONFIG_SMP + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2589,6 +2610,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + */ + if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) + goto unlock; + + /* + * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * * Pairs with the smp_store_release() in finish_task(). @@ -2598,28 +2629,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } - -#else /* CONFIG_SMP */ - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2634,6 +2649,52 @@ out: } /** + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked. + * @func: Function to invoke. + * @arg: Argument to function. + * + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. + */ +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +{ + bool ret = false; + struct rq_flags rf; + struct rq *rq; + + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq(&p->pi_lock); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); + } + } + raw_spin_unlock_irq(&p->pi_lock); + return ret; +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2701,6 +2762,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); +#ifdef CONFIG_SMP + p->wake_entry_type = CSD_TYPE_TTWU; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -3876,6 +3940,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP @@ -3898,6 +3965,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); +} + /* * Pick up the highest-prio task: */ @@ -3931,22 +4020,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: -#ifdef CONFIG_SMP - /* - * We must do the balancing pass before put_next_task(), such - * that when we release the rq->lock the task is in the same - * state as before we took rq->lock. - * - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -#endif - - put_prev_task(rq, prev); + put_prev_task_balance(rq, prev, rf); for_each_class(class) { p = class->pick_next_task(rq); @@ -4636,7 +4710,7 @@ int idle_cpu(int cpu) return 0; #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif @@ -6039,6 +6113,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; + scs_task_reset(idle); kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP @@ -6189,13 +6264,14 @@ void idle_task_exit(void) struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); + BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { switch_mm(mm, &init_mm, current); - current->active_mm = &init_mm; finish_arch_post_lock_switch(); } - mmdrop(mm); + + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } /* @@ -6485,7 +6561,6 @@ int sched_cpu_dying(unsigned int cpu) struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ - sched_ttwu_pending(); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); @@ -6588,6 +6663,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6640,7 +6717,6 @@ void __init sched_init(void) init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6662,7 +6738,6 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6690,6 +6765,8 @@ void __init sched_init(void) #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); @@ -7384,6 +7461,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -7412,6 +7491,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota != RUNTIME_INF && quota > max_cfs_runtime) + return -EINVAL; + + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9fbb10383434..941c28cf9738 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -5,6 +5,7 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include <asm/irq_regs.h> #include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ @@ -339,7 +340,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = task_pt_regs(tsk); + struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); if (regs && user_mode(regs)) index = CPUACCT_STAT_USER; @@ -347,7 +348,7 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + __this_cpu_add(ca->cpuusage->usages[index], cputime); rcu_read_unlock(); } @@ -363,7 +364,7 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) - this_cpu_ptr(ca->cpustat)->cpustat[index] += val; + __this_cpu_add(ca->cpustat->cpustat[index], val); rcu_read_unlock(); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 239970b991c0..36c54265bb2b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -258,7 +258,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ @@ -437,7 +437,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), @@ -464,10 +464,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID tree-key switches prio" " wait-time sum-exec sum-sleep\n"); SEQ_printf(m, "-------------------------------------------------------" - "----------------------------------------------------\n"); + "------------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -638,7 +638,6 @@ do { \ P(nr_running); P(nr_switches); - P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 52c82b2c94dc..35f4cc024dcf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -191,7 +191,7 @@ static void update_sysctl(void) #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } @@ -1093,7 +1093,7 @@ struct numa_group { * more by CPU use than by memory faults. */ unsigned long *faults_cpu; - unsigned long faults[0]; + unsigned long faults[]; }; /* @@ -3440,52 +3440,46 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + se->avg.util_sum = se->avg.util_avg * divider; /* Update parent cfs_rq utilization */ add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* Nothing to update */ if (!delta) return; - /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. - */ - /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; + se->avg.runnable_sum = se->avg.runnable_avg * divider; /* Update parent cfs_rq runnable */ add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; } static inline void @@ -3495,19 +3489,26 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq unsigned long load_avg; u64 load_sum = 0; s64 delta_sum; + u32 divider; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + if (runnable_sum >= 0) { /* * Add runnable; clip at LOAD_AVG_MAX. Reflects that until * the CPU is saturated running == runnable. */ runnable_sum += se->avg.load_sum; - runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + runnable_sum = min_t(long, runnable_sum, divider); } else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by @@ -3532,7 +3533,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, LOAD_AVG_MAX); + load_avg = div_s64(load_sum, divider); delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; @@ -3696,6 +3697,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; /* @@ -3872,6 +3877,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -4053,7 +4060,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -4587,16 +4594,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) } /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4608,13 +4615,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; } +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ @@ -4703,13 +4722,33 @@ static int tg_throttle_down(struct task_group *tg, void *data) return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; - bool empty; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4743,29 +4782,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (!se) sub_nr_running(rq, task_delta); - cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. - */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq_clock(rq); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4932,14 +4955,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u /* * This check is repeated as we release cfs_b->lock while we unthrottle. */ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - cfs_b->distribute_running = 1; + while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); } @@ -5053,10 +5074,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->slack_started = false; - if (cfs_b->distribute_running) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); @@ -5066,9 +5083,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) @@ -5077,7 +5091,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } @@ -5138,8 +5151,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -5169,6 +5181,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -5198,8 +5212,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; @@ -5220,7 +5232,6 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; cfs_b->slack_started = false; } @@ -5505,28 +5516,27 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) list_add_leaf_cfs_rq(cfs_rq); } -enqueue_throttle: - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. - * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (flags & ENQUEUE_WAKEUP) - update_overutilized_status(rq); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); - } + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); +enqueue_throttle: if (cfs_bandwidth_used()) { /* * When bandwidth control is enabled; the cfs_rq_throttled() @@ -5736,7 +5746,7 @@ static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); @@ -5845,8 +5855,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag); +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5929,7 +5938,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -6670,9 +6679,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -8583,7 +8589,7 @@ static int idle_cpu_without(int cpu, struct task_struct *p) */ #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif @@ -8701,8 +8707,7 @@ static bool update_pick_idlest(struct sched_group *idlest, * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -9433,7 +9438,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; /* * Ensure the balancing environment is consistent; can happen @@ -9454,18 +9459,12 @@ static int should_we_balance(struct lb_env *env) if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* @@ -9818,9 +9817,8 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -9909,9 +9907,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -10028,17 +10023,20 @@ static void kick_ilb(unsigned int flags) if (ilb_cpu >= nr_cpu_ids) return; + /* + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets + * the first flag owns it; cleared by nohz_csd_func(). + */ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* @@ -10376,20 +10374,14 @@ abort: */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - int this_cpu = this_rq->cpu; - unsigned int flags; + unsigned int flags = this_rq->nohz_idle_balance; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + if (!flags) return false; - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } + this_rq->nohz_idle_balance = 0; - /* could be _relaxed() */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - if (!(flags & NOHZ_KICK_MASK)) + if (idle != CPU_IDLE) return false; _nohz_idle_balance(this_rq, flags, idle); @@ -10449,7 +10441,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -10500,9 +10492,6 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 t0, domain_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b743bf38f08f..05deb81bb3e3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -289,7 +289,11 @@ static void do_idle(void) */ smp_mb__after_atomic(); - sched_ttwu_pending(); + /* + * RCU relies on this call to be done outside of an RCU read-side + * critical section. + */ + flush_smp_call_function_from_idle(); schedule_idle(); if (unlikely(klp_patch_pending(current))) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b647d04d9c8b..b4b1ff96642f 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -237,6 +237,30 @@ ___update_load_sum(u64 now, struct sched_avg *sa, return 1; } +/* + * When syncing *_avg with *_sum, we must take into account the current + * position in the PELT segment otherwise the remaining part of the segment + * will be considered as idle time whereas it's not yet elapsed and this will + * generate unwanted oscillation in the range [1002..1024[. + * + * The max value of *_sum varies with the position in the time segment and is + * equals to : + * + * LOAD_AVG_MAX*y + sa->period_contrib + * + * which can be simplified into: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 + * + * The same care must be taken when a sched entity is added, updated or + * removed from a cfs_rq and we need to update sched_avg. Scheduler entities + * and the cfs rq, to which they are attached, have the same position in the + * time segment because they use the same clock. This means that we can use + * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity + * if it's more convenient. + */ static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 45da29de3ecc..f395ddb75f38 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,8 @@ int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +/* More than 4 hours if BW_SHIFT equals 20. */ +static const u64 max_rt_runtime = MAX_BW; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg, if (rt_period == 0) return -EINVAL; + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); err = __rt_schedulable(tg, rt_period, rt_runtime); if (err) @@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void) return -EINVAL; if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || + ((u64)sysctl_sched_rt_runtime * + NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; return 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index db3a57675ccf..1d4e94c1e5fe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -349,7 +349,6 @@ struct cfs_bandwidth { u8 idle; u8 period_active; - u8 distribute_running; u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; @@ -890,12 +889,15 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; + call_single_data_t nohz_csd; #endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; - atomic_t nohz_flags; + atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned long nr_load_updates; +#ifdef CONFIG_SMP + unsigned int ttwu_pending; +#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -951,6 +953,7 @@ struct rq { struct callback_head *balance_callback; + unsigned char nohz_idle_balance; unsigned char idle_balance; unsigned long misfit_task_load; @@ -979,7 +982,7 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1020,10 +1023,6 @@ struct rq { unsigned int ttwu_local; #endif -#ifdef CONFIG_SMP - struct llist_head wake_list; -#endif - #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; @@ -1367,8 +1366,6 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -extern void sched_ttwu_pending(void); - #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -1461,7 +1458,7 @@ struct sched_group { * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) */ - unsigned long cpumask[0]; + unsigned long cpumask[]; }; static inline struct cpumask *sched_group_span(struct sched_group *sg) @@ -1504,15 +1501,11 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); - -#else - -static inline void sched_ttwu_pending(void) { } +extern void flush_smp_call_function_from_idle(void); -static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } - -#endif /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ +static inline void flush_smp_call_function_from_idle(void) { } +#endif #include "stats.h" #include "autogroup.h" @@ -1688,7 +1681,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) */ #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x4 /* Internal use, task got migrated */ +#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ +#define WF_ON_RQ 0x08 /* Wakee is on_rq */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1918,6 +1912,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 +#define MAX_BW_BITS (64 - BW_SHIFT) +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h new file mode 100644 index 000000000000..9620e323162c --- /dev/null +++ b/kernel/sched/smp.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Scheduler internal SMP callback types and methods between the scheduler + * and other internal parts of the core kernel: + */ + +extern void sched_ttwu_pending(void *arg); + +extern void send_call_function_single_ipi(int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index fa64b2ee9fe6..ba81187bb7af 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -33,14 +33,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); - - if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); - return -1; - } - printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); @@ -151,8 +143,7 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | + if (sd->flags & (SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | @@ -183,15 +174,14 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + pflags &= ~(SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -1351,8 +1341,7 @@ sd_init(struct sched_domain_topology_level *tl, .cache_nice_tries = 0, - .flags = 1*SD_LOAD_BALANCE - | 1*SD_BALANCE_NEWIDLE + .flags = 1*SD_BALANCE_NEWIDLE | 1*SD_BALANCE_EXEC | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE diff --git a/kernel/scs.c b/kernel/scs.c new file mode 100644 index 000000000000..222a7a9ad543 --- /dev/null +++ b/kernel/scs.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shadow Call Stack support. + * + * Copyright (C) 2019 Google LLC + */ + +#include <linux/kasan.h> +#include <linux/mm.h> +#include <linux/scs.h> +#include <linux/slab.h> +#include <linux/vmstat.h> + +static struct kmem_cache *scs_cache; + +static void __scs_account(void *s, int account) +{ + struct page *scs_page = virt_to_page(s); + + mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB, + account * (SCS_SIZE / SZ_1K)); +} + +static void *scs_alloc(int node) +{ + void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + + if (!s) + return NULL; + + *__scs_magic(s) = SCS_END_MAGIC; + + /* + * Poison the allocation to catch unintentional accesses to + * the shadow stack when KASAN is enabled. + */ + kasan_poison_object_data(scs_cache, s); + __scs_account(s, 1); + return s; +} + +static void scs_free(void *s) +{ + __scs_account(s, -1); + kasan_unpoison_object_data(scs_cache, s); + kmem_cache_free(scs_cache, s); +} + +void __init scs_init(void) +{ + scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); +} + +int scs_prepare(struct task_struct *tsk, int node) +{ + void *s = scs_alloc(node); + + if (!s) + return -ENOMEM; + + task_scs(tsk) = task_scs_sp(tsk) = s; + return 0; +} + +static void scs_check_usage(struct task_struct *tsk) +{ + static unsigned long highest; + + unsigned long *p, prev, curr = highest, used = 0; + + if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE)) + return; + + for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) { + if (!READ_ONCE_NOCHECK(*p)) + break; + used++; + } + + while (used > curr) { + prev = cmpxchg_relaxed(&highest, curr, used); + + if (prev == curr) { + pr_info("%s (%d): highest shadow stack usage: %lu bytes\n", + tsk->comm, task_pid_nr(tsk), used); + break; + } + + curr = prev; + } +} + +void scs_release(struct task_struct *tsk) +{ + void *s = task_scs(tsk); + + if (!s) + return; + + WARN(task_scs_end_corrupted(tsk), + "corrupted shadow stack detected when freeing task\n"); + scs_check_usage(tsk); + scs_free(s); +} diff --git a/kernel/signal.c b/kernel/signal.c index 284fc1600063..5ca48cc5da76 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,94 +3235,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: compat siginfo destination + * @from: kernel siginfo source + * + * Note: This function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_pkey = from->si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_status = from->si_status; -#ifdef CONFIG_X86_X32_ABI - if (x32_ABI) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } else -#endif - { - new.si_utime = from->si_utime; - new.si_stime = from->si_stime; - } + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; break; case SIL_RT: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_int = from->si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - new.si_call_addr = ptr_to_compat(from->si_call_addr); - new.si_syscall = from->si_syscall; - new.si_arch = from->si_arch; + to->si_call_addr = ptr_to_compat(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; - return 0; } diff --git a/kernel/smp.c b/kernel/smp.c index 786092aabdcd..472c2b274c65 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -22,11 +22,9 @@ #include <linux/hypervisor.h> #include "smpboot.h" +#include "sched/smp.h" -enum { - CSD_FLAG_LOCK = 0x01, - CSD_FLAG_SYNCHRONOUS = 0x02, -}; +#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; @@ -84,6 +82,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * still pending. */ flush_smp_call_function_queue(false); + irq_work_run(); return 0; } @@ -134,15 +133,33 @@ static __always_inline void csd_unlock(call_single_data_t *csd) static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); +void __smp_call_single_queue(int cpu, struct llist_node *node) +{ + /* + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. + * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... + */ + if (llist_add(node, &per_cpu(call_single_queue, cpu))) + send_call_function_single_ipi(cpu); +} + /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd, - smp_call_func_t func, void *info) +static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { + smp_call_func_t func = csd->func; + void *info = csd->info; unsigned long flags; /* @@ -156,28 +173,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd, return 0; } - if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } - csd->func = func; - csd->info = info; - - /* - * The list addition should be visible before sending the IPI - * handler locks the list to pull the entry off it because of - * normal cache coherency rules implied by spinlocks. - * - * If IPIs can go out of order to the cache coherency protocol - * in an architecture, sufficient synchronisation should be added - * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really - * equipped to do the right thing... - */ - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - arch_send_call_function_single_ipi(cpu); + __smp_call_single_queue(cpu, &csd->llist); return 0; } @@ -209,9 +210,9 @@ void generic_smp_call_function_single_interrupt(void) */ static void flush_smp_call_function_queue(bool warn_cpu_offline) { - struct llist_head *head; - struct llist_node *entry; call_single_data_t *csd, *csd_next; + struct llist_node *entry, *prev; + struct llist_head *head; static bool warned; lockdep_assert_irqs_disabled(); @@ -230,32 +231,99 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ - llist_for_each_entry(csd, entry, llist) - pr_warn("IPI callback %pS sent to offline CPU\n", - csd->func); + llist_for_each_entry(csd, entry, llist) { + switch (CSD_TYPE(csd)) { + case CSD_TYPE_ASYNC: + case CSD_TYPE_SYNC: + case CSD_TYPE_IRQ_WORK: + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + break; + + case CSD_TYPE_TTWU: + pr_warn("IPI task-wakeup sent to offline CPU\n"); + break; + + default: + pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", + CSD_TYPE(csd)); + break; + } + } } + /* + * First; run all SYNC callbacks, people are waiting for us. + */ + prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, llist) { - smp_call_func_t func = csd->func; - void *info = csd->info; - /* Do we wait until *after* callback? */ - if (csd->flags & CSD_FLAG_SYNCHRONOUS) { + if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } + func(info); csd_unlock(csd); } else { - csd_unlock(csd); - func(info); + prev = &csd->llist; } } + if (!entry) + return; + /* - * Handle irq works queued remotely by irq_work_queue_on(). - * Smp functions above are typically synchronous so they - * better run first since some other CPUs may be busy waiting - * for them. + * Second; run all !SYNC callbacks. */ - irq_work_run(); + prev = NULL; + llist_for_each_entry_safe(csd, csd_next, entry, llist) { + int type = CSD_TYPE(csd); + + if (type != CSD_TYPE_TTWU) { + if (prev) { + prev->next = &csd_next->llist; + } else { + entry = &csd_next->llist; + } + + if (type == CSD_TYPE_ASYNC) { + smp_call_func_t func = csd->func; + void *info = csd->info; + + csd_unlock(csd); + func(info); + } else if (type == CSD_TYPE_IRQ_WORK) { + irq_work_single(csd); + } + + } else { + prev = &csd->llist; + } + } + + /* + * Third; only CSD_TYPE_TTWU is left, issue those. + */ + if (entry) + sched_ttwu_pending(entry); +} + +void flush_smp_call_function_from_idle(void) +{ + unsigned long flags; + + if (llist_empty(this_cpu_ptr(&call_single_queue))) + return; + + local_irq_save(flags); + flush_smp_call_function_queue(true); + local_irq_restore(flags); } /* @@ -271,7 +339,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }; int this_cpu; int err; @@ -305,7 +373,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd_lock(csd); } - err = generic_exec_single(cpu, csd, func, info); + csd->func = func; + csd->info = info; + + err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); @@ -351,7 +422,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) csd->flags = CSD_FLAG_LOCK; smp_wmb(); - err = generic_exec_single(cpu, csd, csd->func, csd->info); + err = generic_exec_single(cpu, csd); out: preempt_enable(); @@ -466,7 +537,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_FLAG_SYNCHRONOUS; + csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) @@ -598,6 +669,24 @@ void __init smp_init(void) { int num_nodes, num_cpus; + /* + * Ensure struct irq_work layout matches so that + * flush_smp_call_function_queue() can do horrible things. + */ + BUILD_BUG_ON(offsetof(struct irq_work, llnode) != + offsetof(struct __call_single_data, llist)); + BUILD_BUG_ON(offsetof(struct irq_work, func) != + offsetof(struct __call_single_data, func)); + BUILD_BUG_ON(offsetof(struct irq_work, flags) != + offsetof(struct __call_single_data, flags)); + + /* + * Assert the CSD_TYPE_TTWU layout is similar enough + * for task_struct to be on the @call_single_queue. + */ + BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != + offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); + idle_threads_init(); cpuhp_threads_init(); @@ -620,7 +709,7 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). */ -void on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..891667a49bb7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) @@ -2634,6 +2634,7 @@ struct compat_sysinfo { COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) { struct sysinfo s; + struct compat_sysinfo s_32; do_sysinfo(&s); @@ -2658,23 +2659,23 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(info, sizeof(struct compat_sysinfo)) || - __put_user(s.uptime, &info->uptime) || - __put_user(s.loads[0], &info->loads[0]) || - __put_user(s.loads[1], &info->loads[1]) || - __put_user(s.loads[2], &info->loads[2]) || - __put_user(s.totalram, &info->totalram) || - __put_user(s.freeram, &info->freeram) || - __put_user(s.sharedram, &info->sharedram) || - __put_user(s.bufferram, &info->bufferram) || - __put_user(s.totalswap, &info->totalswap) || - __put_user(s.freeswap, &info->freeswap) || - __put_user(s.procs, &info->procs) || - __put_user(s.totalhigh, &info->totalhigh) || - __put_user(s.freehigh, &info->freehigh) || - __put_user(s.mem_unit, &info->mem_unit)) + memset(&s_32, 0, sizeof(s_32)); + s_32.uptime = s.uptime; + s_32.loads[0] = s.loads[0]; + s_32.loads[1] = s.loads[1]; + s_32.loads[2] = s.loads[2]; + s_32.totalram = s.totalram; + s_32.freeram = s.freeram; + s_32.sharedram = s.sharedram; + s_32.bufferram = s.bufferram; + s_32.totalswap = s.totalswap; + s_32.freeswap = s.freeswap; + s_32.procs = s.procs; + s_32.totalhigh = s.totalhigh; + s_32.freehigh = s.freehigh; + s_32.mem_unit = s.mem_unit; + if (copy_to_user(info, &s_32, sizeof(s_32))) return -EFAULT; - return 0; } #endif /* CONFIG_COMPAT */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 17c7633d90fc..7efe0d499c1d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2333,30 +2333,32 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) + +#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ + defined(CONFIG_DEBUG_STACKOVERFLOW) { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#if defined(CONFIG_X86) { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_DEBUG_STACKOVERFLOW { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#endif { .procname = "bootloader_type", .data = &bootloader_type, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 53bce347cd50..5d9fc22d836a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +static int timens_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); int err; @@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; timens_set_vvar_page(current, ns); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 743647005f64..24876faac753 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool -config HAVE_FTRACE_NMI_ENTER - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_FUNCTION_TRACER bool help @@ -72,11 +67,6 @@ config RING_BUFFER select TRACE_CLOCK select IRQ_WORK -config FTRACE_NMI_ENTER - bool - depends on HAVE_FTRACE_NMI_ENTER - default y - config EVENT_TRACING select CONTEXT_SWITCH_TRACER select GLOB @@ -158,6 +148,7 @@ config FUNCTION_TRACER select CONTEXT_SWITCH_TRACER select GLOB select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ca39dc3230cb..ea47f2084087 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -170,10 +170,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c41186417d93..3744372a24e2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1729,7 +1729,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bd030b1b9514..b5765aeea698 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -160,17 +160,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_rcu(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ @@ -256,7 +245,7 @@ static void update_ftrace_function(void) * Make sure all CPUs see this. Yes this is slow, but static * tracing is slow and nasty to have enabled. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* Now all cpus are using the list ops. */ function_trace_op = set_function_trace_op; /* Make sure the function_trace_op is visible on all CPUs */ @@ -2932,7 +2921,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); /* * When the kernel is preeptive, tasks can be preempted @@ -5888,7 +5877,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - schedule_on_each_cpu(ftrace_sync); + synchronize_rcu_tasks_rude(); free_ftrace_hash(old_hash); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index db0ad96dfd50..3ab27022c20f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6305,13 +6305,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { @@ -6373,7 +6366,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .ops = &tracing_pipe_buf_ops, + .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; ssize_t ret; @@ -7582,9 +7575,7 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -8527,18 +8518,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! - */ - vmalloc_sync_mappings(); - return 0; } diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 4d8e99fdbbbe..fb0691b8a88d 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -19,6 +19,24 @@ /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); +/* + * Like trace_hardirqs_on() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_on_prepare(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } +} +EXPORT_SYMBOL(trace_hardirqs_on_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); + void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { @@ -28,11 +46,31 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); NOKPROBE_SYMBOL(trace_hardirqs_on); +/* + * Like trace_hardirqs_off() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_off_prepare(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + } + +} +EXPORT_SYMBOL(trace_hardirqs_off_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_off_prepare); + void trace_hardirqs_off(void) { if (!this_cpu_read(tracing_irq_cpu)) { @@ -56,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } + lockdep_hardirqs_on_prepare(CALLER_ADDR0); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8eadadc478f9..87804e0371fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns) put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; @@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - cred = prepare_creds(); + cred = nsset_cred(nsset); if (!cred) - return -ENOMEM; + return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); - return commit_creds(cred); + return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) diff --git a/kernel/utsname.c b/kernel/utsname.c index f0e491193009..e488d0e2ab45 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns) put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int utsns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); |