aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/backtracetest.c11
-rw-r--r--kernel/bpf/cpumap.c13
-rw-r--r--kernel/bpf/inode.c32
-rw-r--r--kernel/bpf/verifier.c81
-rw-r--r--kernel/cpu.c35
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/events/core.c89
-rw-r--r--kernel/events/ring_buffer.c40
-rw-r--r--kernel/iomem.c4
-rw-r--r--kernel/irq/chip.c4
-rw-r--r--kernel/irq/irqdesc.c1
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/latencytop.c29
-rw-r--r--kernel/livepatch/transition.c22
-rw-r--r--kernel/locking/lockdep.c107
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcuperf.c5
-rw-r--r--kernel/rcu/rcutorture.c21
-rw-r--r--kernel/rcu/srcutiny.c9
-rw-r--r--kernel/rcu/srcutree.c32
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c508
-rw-r--r--kernel/rcu/tree.h14
-rw-r--r--kernel/rcu/tree_exp.h36
-rw-r--r--kernel/rcu/tree_plugin.h257
-rw-r--r--kernel/rcu/tree_stall.h709
-rw-r--r--kernel/rcu/update.c59
-rw-r--r--kernel/resource.c11
-rw-r--r--kernel/rseq.c9
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c1
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/fair.c35
-rw-r--r--kernel/seccomp.c19
-rw-r--r--kernel/signal.c15
-rw-r--r--kernel/stacktrace.c333
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/sched_clock.c4
-rw-r--r--kernel/time/tick-common.c2
-rw-r--r--kernel/time/timekeeping.h7
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/ftrace.c6
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c144
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_branch.c4
-rw-r--r--kernel/trace/trace_events_hist.c14
-rw-r--r--kernel/trace/trace_stack.c85
-rw-r--r--kernel/trace/trace_syscalls.c9
-rw-r--r--kernel/watchdog.c6
-rw-r--r--kernel/watchdog_hld.c3
55 files changed, 1634 insertions, 1253 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..62471e75a2b0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 1323360d90e3..a563c8fdad0d 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -48,19 +48,14 @@ static void backtrace_test_irq(void)
#ifdef CONFIG_STACKTRACE
static void backtrace_test_saved(void)
{
- struct stack_trace trace;
unsigned long entries[8];
+ unsigned int nr_entries;
pr_info("Testing a saved backtrace.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
- trace.nr_entries = 0;
- trace.max_entries = ARRAY_SIZE(entries);
- trace.entries = entries;
- trace.skip = 0;
-
- save_stack_trace(&trace);
- print_stack_trace(&trace, 0);
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ stack_trace_print(entries, nr_entries, 0);
}
#else
static void backtrace_test_saved(void)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..3c18260403dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work)
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
struct xdp_frame *xdpf)
{
+ unsigned int hard_start_headroom;
unsigned int frame_size;
void *pkt_data_start;
struct sk_buff *skb;
+ /* Part of headroom was reserved to xdpf */
+ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
+
/* build_skb need to place skb_shared_info after SKB end, and
* also want to know the memory "truesize". Thus, need to
* know the memory frame size backing xdp_buff.
@@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) +
+ frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- pkt_data_start = xdpf->data - xdpf->headroom;
+ pkt_data_start = xdpf->data - hard_start_headroom;
skb = build_skb(pkt_data_start, frame_size);
if (!skb)
return NULL;
- skb_reserve(skb, xdpf->headroom);
+ skb_reserve(skb, hard_start_headroom);
__skb_put(skb, xdpf->len);
if (xdpf->metasize)
skb_metadata_set(skb, xdpf->metasize);
@@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* - RX ring dev queue index (skb_record_rx_queue)
*/
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
return skb;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 2ada5e21dfa6..4a8f390a2b82 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
}
EXPORT_SYMBOL(bpf_prog_get_type_path);
-static void bpf_evict_inode(struct inode *inode)
-{
- enum bpf_type type;
-
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
-
- if (S_ISLNK(inode->i_mode))
- kfree(inode->i_link);
- if (!bpf_inode_type(inode, &type))
- bpf_any_put(inode->i_private, type);
-}
-
/*
* Display the mount options in /proc/mounts.
*/
@@ -579,11 +566,28 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static void bpf_destroy_inode_deferred(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ enum bpf_type type;
+
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+ free_inode_nonrcu(inode);
+}
+
+static void bpf_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred);
+}
+
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
- .evict_inode = bpf_evict_inode,
+ .destroy_inode = bpf_destroy_inode,
};
enum {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fd502c1f71eb..09d5d972c9ff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1897,8 +1897,9 @@ continue_func:
}
frame++;
if (frame >= MAX_CALL_FRAMES) {
- WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
- return -EFAULT;
+ verbose(env, "the call stack of %d frames is too deep !\n",
+ frame);
+ return -E2BIG;
}
goto process_func;
}
@@ -4137,15 +4138,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static void __find_good_pkt_pointers(struct bpf_func_state *state,
+ struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type, u16 new_range)
+{
+ struct bpf_reg_state *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ reg = &state->regs[i];
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }
+
+ bpf_for_each_spilled_reg(i, state, reg) {
+ if (!reg)
+ continue;
+ if (reg->type == type && reg->id == dst_reg->id)
+ reg->range = max(reg->range, new_range);
+ }
+}
+
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs, *reg;
u16 new_range;
- int i, j;
+ int i;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -4210,20 +4231,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == type && regs[i].id == dst_reg->id)
- /* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, new_range);
-
- for (j = 0; j <= vstate->curframe; j++) {
- state = vstate->frame[j];
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
- }
+ for (i = 0; i <= vstate->curframe; i++)
+ __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
+ new_range);
}
/* compute branch direction of the expression "if (reg opcode val) goto target;"
@@ -4697,6 +4707,22 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
}
+static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
+ bool is_null)
+{
+ struct bpf_reg_state *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
+
+ bpf_for_each_spilled_reg(i, state, reg) {
+ if (!reg)
+ continue;
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }
+}
+
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -4704,10 +4730,10 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs;
u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i, j;
+ int i;
if (ref_obj_id && ref_obj_id == id && is_null)
/* regs[regno] is in the " == NULL" branch.
@@ -4716,17 +4742,8 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
*/
WARN_ON_ONCE(release_reference_state(state, id));
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &regs[i], id, is_null);
-
- for (j = 0; j <= vstate->curframe; j++) {
- state = vstate->frame[j];
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
- }
+ for (i = 0; i <= vstate->curframe; i++)
+ __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 025f419d16f6..43e741e88691 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -564,6 +564,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
+static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
+{
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ /*
+ * When CPU hotplug is disabled, then taking the CPU down is not
+ * possible because takedown_cpu() and the architecture and
+ * subsystem specific mechanisms are not available. So the CPU
+ * which would be completely unplugged again needs to stay around
+ * in the current state.
+ */
+ return st->state <= CPUHP_BRINGUP_CPU;
+}
+
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
@@ -574,8 +588,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
- st->target = prev_state;
- undo_cpu_up(cpu, st);
+ if (can_rollback_cpu(st)) {
+ st->target = prev_state;
+ undo_cpu_up(cpu, st);
+ }
break;
}
}
@@ -2288,3 +2304,18 @@ void __init boot_cpu_hotplug_init(void)
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
+
+enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ if (!strcmp(arg, "off"))
+ cpu_mitigations = CPU_MITIGATIONS_OFF;
+ else if (!strcmp(arg, "auto"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO;
+ else if (!strcmp(arg, "auto,nosmt"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
+
+ return 0;
+}
+early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 45d51e8e26f6..badd77670d00 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -89,8 +89,8 @@ struct dma_debug_entry {
int sg_mapped_ents;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
- struct stack_trace stacktrace;
- unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
+ unsigned int stack_len;
+ unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
#endif
};
@@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
#ifdef CONFIG_STACKTRACE
if (entry) {
pr_warning("Mapped at:\n");
- print_stack_trace(&entry->stacktrace, 0);
+ stack_trace_print(entry->stack_entries, entry->stack_len, 0);
}
#endif
}
@@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)
spin_unlock_irqrestore(&free_entries_lock, flags);
#ifdef CONFIG_STACKTRACE
- entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
- entry->stacktrace.entries = entry->st_entries;
- entry->stacktrace.skip = 2;
- save_stack_trace(&entry->stacktrace);
+ entry->stack_len = stack_trace_save(entry->stack_entries,
+ ARRAY_SIZE(entry->stack_entries),
+ 1);
#endif
-
return entry;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 72d06e302e99..dc7dead2d2cc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2009,8 +2009,8 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (event->pending_disable) {
- event->pending_disable = 0;
+ if (READ_ONCE(event->pending_disable) >= 0) {
+ WRITE_ONCE(event->pending_disable, -1);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2198,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- event->pending_disable = 1;
+ WRITE_ONCE(event->pending_disable, smp_processor_id());
+ /* can fail, see perf_pending_event_disable() */
irq_work_queue(&event->pending);
}
@@ -5810,10 +5811,45 @@ void perf_event_wakeup(struct perf_event *event)
}
}
+static void perf_pending_event_disable(struct perf_event *event)
+{
+ int cpu = READ_ONCE(event->pending_disable);
+
+ if (cpu < 0)
+ return;
+
+ if (cpu == smp_processor_id()) {
+ WRITE_ONCE(event->pending_disable, -1);
+ perf_event_disable_local(event);
+ return;
+ }
+
+ /*
+ * CPU-A CPU-B
+ *
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-A;
+ * irq_work_queue();
+ *
+ * sched-out
+ * @pending_disable = -1;
+ *
+ * sched-in
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-B;
+ * irq_work_queue(); // FAILS
+ *
+ * irq_work_run()
+ * perf_pending_event()
+ *
+ * But the event runs on CPU-B and wants disabling there.
+ */
+ irq_work_queue_on(&event->pending, cpu);
+}
+
static void perf_pending_event(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry,
- struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending);
int rctx;
rctx = perf_swevent_get_recursion_context();
@@ -5822,10 +5858,7 @@ static void perf_pending_event(struct irq_work *entry)
* and we won't recurse 'further'.
*/
- if (event->pending_disable) {
- event->pending_disable = 0;
- perf_event_disable_local(event);
- }
+ perf_pending_event_disable(event);
if (event->pending_wakeup) {
event->pending_wakeup = 0;
@@ -9044,26 +9077,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (task == TASK_TOMBSTONE)
return;
- if (!ifh->nr_file_filters)
- return;
-
- mm = get_task_mm(event->ctx->task);
- if (!mm)
- goto restart;
+ if (ifh->nr_file_filters) {
+ mm = get_task_mm(event->ctx->task);
+ if (!mm)
+ goto restart;
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
+ }
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- event->addr_filter_ranges[count].start = 0;
- event->addr_filter_ranges[count].size = 0;
+ if (filter->path.dentry) {
+ /*
+ * Adjust base offset if the filter is associated to a
+ * binary that needs to be mapped:
+ */
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
- /*
- * Adjust base offset if the filter is associated to a binary
- * that needs to be mapped:
- */
- if (filter->path.dentry)
perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
+ } else {
+ event->addr_filter_ranges[count].start = filter->offset;
+ event->addr_filter_ranges[count].size = filter->size;
+ }
count++;
}
@@ -9071,9 +9107,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
event->addr_filters_gen++;
raw_spin_unlock_irqrestore(&ifh->lock, flags);
- up_read(&mm->mmap_sem);
+ if (ifh->nr_file_filters) {
+ up_read(&mm->mmap_sem);
- mmput(mm);
+ mmput(mm);
+ }
restart:
perf_event_stop(event, 1);
@@ -10236,6 +10274,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
+ event->pending_disable = -1;
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a4047321d7d8..674b35383491 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -392,7 +392,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = 1;
+ event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
local_set(&rb->aux_nest, 0);
goto err_put;
@@ -455,24 +455,21 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
}
- if (size || handle->aux_flags) {
- /*
- * Only send RECORD_AUX if we have something useful to communicate
- *
- * Note: the OVERWRITE records by themselves are not considered
- * useful, as they don't communicate any *new* information,
- * aside from the short-lived offset, that becomes history at
- * the next event sched-in and therefore isn't useful.
- * The userspace that needs to copy out AUX data in overwrite
- * mode should know to use user_page::aux_head for the actual
- * offset. So, from now on we don't output AUX records that
- * have *only* OVERWRITE flag set.
- */
-
- if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)
- perf_event_aux_event(handle->event, aux_head, size,
- handle->aux_flags);
- }
+ /*
+ * Only send RECORD_AUX if we have something useful to communicate
+ *
+ * Note: the OVERWRITE records by themselves are not considered
+ * useful, as they don't communicate any *new* information,
+ * aside from the short-lived offset, that becomes history at
+ * the next event sched-in and therefore isn't useful.
+ * The userspace that needs to copy out AUX data in overwrite
+ * mode should know to use user_page::aux_head for the actual
+ * offset. So, from now on we don't output AUX records that
+ * have *only* OVERWRITE flag set.
+ */
+ if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
+ perf_event_aux_event(handle->event, aux_head, size,
+ handle->aux_flags);
rb->user_page->aux_head = rb->aux_head;
if (rb_need_aux_wakeup(rb))
@@ -480,7 +477,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = 1;
+ handle->event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
}
@@ -613,8 +610,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
* PMU requests more than one contiguous chunks of memory
* for SW double buffering
*/
- if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
- !overwrite) {
+ if (!overwrite) {
if (!max_order)
return -EINVAL;
diff --git a/kernel/iomem.c b/kernel/iomem.c
index f7525e14ebc6..93c264444510 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size,
*
* MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
- * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
* memremap() will bypass establishing a new mapping and instead return
* a pointer into the direct map.
*
@@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
/* Try all mapping types requested until one returns non-NULL */
if (flags & MEMREMAP_WB) {
/*
- * MEMREMAP_WB is special in that it can be satisifed
+ * MEMREMAP_WB is special in that it can be satisfied
* from the direct map. Some archs depend on the
* capability of memremap() to autodetect cases where
* the requested range is potentially in System RAM.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3faef4a77f71..51128bea3846 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1449,6 +1449,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
{
data = data->parent_data;
+
+ if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
if (data->chip->irq_set_wake)
return data->chip->irq_set_wake(data, on);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 13539e12cd80..9f8a709337cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -558,6 +558,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ mutex_init(&desc[i].request_mutex);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c83e54727131..b1ea30a5540e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -709,7 +709,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
static int reuse_unused_kprobe(struct kprobe *ap)
{
struct optimized_kprobe *op;
- int ret;
/*
* Unused kprobe MUST be on the way of delayed unoptimizing (means
@@ -720,9 +719,8 @@ static int reuse_unused_kprobe(struct kprobe *ap)
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
/* Optimize it again (remove from op->list) */
- ret = kprobe_optready(ap);
- if (ret)
- return ret;
+ if (!kprobe_optready(ap))
+ return -EINVAL;
optimize_kprobe(ap);
return 0;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 96b4179cee6a..99a5b5f46dc5 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk,
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry marks end of backtrace: */
+ if (!record)
break;
}
if (same) {
@@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
-/*
- * Iterator to store a backtrace into a latency record entry
- */
-static inline void store_stacktrace(struct task_struct *tsk,
- struct latency_record *lat)
-{
- struct stack_trace trace;
-
- memset(&trace, 0, sizeof(trace));
- trace.max_entries = LT_BACKTRACEDEPTH;
- trace.entries = &lat->backtrace[0];
- save_stack_trace_tsk(tsk, &trace);
-}
-
/**
* __account_scheduler_latency - record an occurred latency
* @tsk - the task struct of the task hitting the latency
@@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
lat.count = 1;
lat.time = usecs;
lat.max = usecs;
- store_stacktrace(tsk, &lat);
+
+ stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);
raw_spin_lock_irqsave(&latency_lock, flags);
@@ -210,8 +197,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry is end of backtrace */
+ if (!record)
break;
}
if (same) {
@@ -252,10 +239,10 @@ static int lstats_show(struct seq_file *m, void *v)
lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long bt = lr->backtrace[q];
+
if (!bt)
break;
- if (bt == ULONG_MAX)
- break;
+
seq_printf(m, " %ps", (void *)bt);
}
seq_puts(m, "\n");
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 9c89ae8b337a..c53370d596be 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task)
* Determine whether the given stack trace includes any references to a
* to-be-patched or to-be-unpatched function.
*/
-static int klp_check_stack_func(struct klp_func *func,
- struct stack_trace *trace)
+static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
+ unsigned int nr_entries)
{
unsigned long func_addr, func_size, address;
struct klp_ops *ops;
int i;
- for (i = 0; i < trace->nr_entries; i++) {
- address = trace->entries[i];
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (klp_target_state == KLP_UNPATCHED) {
/*
@@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func,
static int klp_check_stack(struct task_struct *task, char *err_buf)
{
static unsigned long entries[MAX_STACK_ENTRIES];
- struct stack_trace trace;
struct klp_object *obj;
struct klp_func *func;
- int ret;
+ int ret, nr_entries;
- trace.skip = 0;
- trace.nr_entries = 0;
- trace.max_entries = MAX_STACK_ENTRIES;
- trace.entries = entries;
- ret = save_stack_trace_tsk_reliable(task, &trace);
+ ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
WARN_ON_ONCE(ret == -ENOSYS);
- if (ret) {
+ if (ret < 0) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d has an unreliable stack\n",
__func__, task->comm, task->pid);
return ret;
}
+ nr_entries = ret;
klp_for_each_object(klp_transition_patch, obj) {
if (!obj->patched)
continue;
klp_for_each_func(obj, func) {
- ret = klp_check_stack_func(func, &trace);
+ ret = klp_check_stack_func(func, entries, nr_entries);
if (ret) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d is sleeping on function %s\n",
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 34cdcbedda49..91c6b89f04df 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -434,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
-static int save_trace(struct stack_trace *trace)
+static int save_trace(struct lock_trace *trace)
{
- trace->nr_entries = 0;
- trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->entries = stack_trace + nr_stack_trace_entries;
-
- trace->skip = 3;
-
- save_stack_trace(trace);
-
- /*
- * Some daft arches put -1 at the end to indicate its a full trace.
- *
- * <rant> this is buggy anyway, since it takes a whole extra entry so a
- * complete trace that maxes out the entries provided will be reported
- * as incomplete, friggin useless </rant>
- */
- if (trace->nr_entries != 0 &&
- trace->entries[trace->nr_entries-1] == ULONG_MAX)
- trace->nr_entries--;
-
- trace->max_entries = trace->nr_entries;
+ unsigned long *entries = stack_trace + nr_stack_trace_entries;
+ unsigned int max_entries;
+ trace->offset = nr_stack_trace_entries;
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->nr_entries = stack_trace_save(entries, max_entries, 3);
nr_stack_trace_entries += trace->nr_entries;
if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
@@ -1207,7 +1192,7 @@ static struct lock_list *alloc_list_entry(void)
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
- struct stack_trace *trace)
+ struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -1426,6 +1411,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
* checking.
*/
+static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+{
+ unsigned long *entries = stack_trace + trace->offset;
+
+ stack_trace_print(entries, trace->nr_entries, spaces);
+}
+
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
@@ -1438,8 +1430,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
- print_stack_trace(&target->trace, 6);
-
+ print_lock_trace(&target->trace, 6);
return 0;
}
@@ -1533,10 +1524,9 @@ static inline int class_equal(struct lock_list *entry, void *data)
}
static noinline int print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt,
- struct stack_trace *trace)
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1752,7 +1742,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
len += printk("%*s %s", depth, "", usage_str[bit]);
len += printk(KERN_CONT " at:\n");
- print_stack_trace(class->usage_traces + bit, len);
+ print_lock_trace(class->usage_traces + bit, len);
}
}
printk("%*s }\n", depth, "");
@@ -1777,7 +1767,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_stack_trace(&entry->trace, 2);
+ print_lock_trace(&entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1890,14 +1880,14 @@ print_bad_irq_dependency(struct task_struct *curr,
print_lock_name(backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
- print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -2170,8 +2160,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, struct stack_trace *trace,
- int (*save)(struct stack_trace *trace))
+ struct held_lock *next, int distance, struct lock_trace *trace)
{
struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
@@ -2209,15 +2198,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
if (unlikely(!ret)) {
- if (!trace->entries) {
+ if (!trace->nr_entries) {
/*
- * If @save fails here, the printing might trigger
- * a WARN but because of the !nr_entries it should
- * not do bad things.
+ * If save_trace fails here, the printing might
+ * trigger a WARN but because of the !nr_entries it
+ * should not do bad things.
*/
- save(trace);
+ save_trace(trace);
}
- return print_circular_bug(&this, target_entry, next, prev, trace);
+ return print_circular_bug(&this, target_entry, next, prev);
}
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
@@ -2265,7 +2254,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return print_bfs_bug(ret);
- if (!trace->entries && !save(trace))
+ if (!trace->nr_entries && !save_trace(trace))
return 0;
/*
@@ -2297,14 +2286,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_trace trace = { .nr_entries = 0 };
int depth = curr->lockdep_depth;
struct held_lock *hlock;
- struct stack_trace trace = {
- .nr_entries = 0,
- .max_entries = 0,
- .entries = NULL,
- .skip = 0,
- };
/*
* Debugging checks.
@@ -2330,7 +2314,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
* added:
*/
if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+ int ret = check_prev_add(curr, hlock, next, distance,
+ &trace);
if (!ret)
return 0;
@@ -2731,6 +2716,10 @@ static inline int validate_chain(struct task_struct *curr,
{
return 1;
}
+
+static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+{
+}
#endif
/*
@@ -2827,7 +2816,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
pr_warn("\nother info that might help us debug this:\n");
@@ -4689,8 +4678,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
/* closed head */
pf = delayed_free.pf + (delayed_free.index ^ 1);
@@ -4702,8 +4691,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
*/
call_rcu_zapped(delayed_free.pf + delayed_free.index);
- graph_unlock();
-out_irq:
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
raw_local_irq_restore(flags);
}
@@ -4744,21 +4733,17 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
{
struct pending_free *pf;
unsigned long flags;
- int locked;
init_data_structures_once();
raw_local_irq_save(flags);
- locked = graph_lock();
- if (!locked)
- goto out_irq;
-
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
pf = get_pending_free();
__lockdep_free_key_range(pf, start, size);
call_rcu_zapped(pf);
-
- graph_unlock();
-out_irq:
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
raw_local_irq_restore(flags);
/*
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ad40a2617063..80a463d31a8d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -829,7 +829,9 @@ static void lock_torture_cleanup(void)
"End of test: SUCCESS");
kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
kfree(cxt.lrsa);
+ cxt.lrsa = NULL;
end:
torture_cleanup_end();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
/*
* Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index acee72c0b24b..4b58c907b4b7 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_suppress;
+extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
#define rcu_ftrace_dump_stall_suppress() \
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index c29761152874..7a6890b23c5f 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -494,6 +494,10 @@ rcu_perf_cleanup(void)
if (torture_cleanup_begin())
return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
@@ -614,6 +618,7 @@ rcu_perf_init(void)
pr_cont("\n");
WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));
firsterr = -EINVAL;
+ cur_ops = NULL;
goto unwind;
}
if (cur_ops->init)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f14d1b18a74f..efaa5b3f4d3f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -299,7 +299,6 @@ struct rcu_torture_ops {
int irq_capable;
int can_boost;
int extendables;
- int ext_irq_conflict;
const char *name;
};
@@ -592,12 +591,7 @@ static void srcu_torture_init(void)
static void srcu_torture_cleanup(void)
{
- static DEFINE_TORTURE_RANDOM(rand);
-
- if (torture_random(&rand) & 0x800)
- cleanup_srcu_struct(&srcu_ctld);
- else
- cleanup_srcu_struct_quiesced(&srcu_ctld);
+ cleanup_srcu_struct(&srcu_ctld);
srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
}
@@ -1160,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
unsigned long randmask2 = randmask1 >> 3;
WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
- /* Most of the time lots of bits, half the time only one bit. */
+ /* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
@@ -1170,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
(!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- !(mask & cur_ops->ext_irq_conflict) &&
- (oldmask & cur_ops->ext_irq_conflict))
- mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */
return mask ?: RCUTORTURE_RDR_RCU;
}
@@ -1848,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
rcu_torture_fwd_cb_hist();
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2));
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
pr_info("%s: Freed %lu RCU callbacks.\n",
@@ -2094,6 +2084,10 @@ rcu_torture_cleanup(void)
cur_ops->cb_barrier();
return;
}
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
@@ -2267,6 +2261,7 @@ rcu_torture_init(void)
pr_cont("\n");
WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
firsterr = -EINVAL;
+ cur_ops = NULL;
goto unwind;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 5d4a39a6505a..44d6606b8325 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* Must invoke this after you are finished using a given srcu_struct that
* was initialized via init_srcu_struct(), else you leak memory.
*/
-void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
+void cleanup_srcu_struct(struct srcu_struct *ssp)
{
WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
- if (quiesced)
- WARN_ON(work_pending(&ssp->srcu_work));
- else
- flush_work(&ssp->srcu_work);
+ flush_work(&ssp->srcu_work);
WARN_ON(ssp->srcu_gp_running);
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
}
-EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
* Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a60b8ba9e1ac..9b761e546de8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
return SRCU_INTERVAL;
}
-/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
-void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @ssp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
@@ -369,24 +375,14 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- if (quiesced) {
- if (WARN_ON(delayed_work_pending(&ssp->work)))
- return; /* Just leak it! */
- } else {
- flush_delayed_work(&ssp->work);
- }
+ flush_delayed_work(&ssp->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- if (quiesced) {
- if (WARN_ON(timer_pending(&sdp->delay_work)))
- return; /* Just leak it! */
- if (WARN_ON(work_pending(&sdp->work)))
- return; /* Just leak it! */
- } else {
- del_timer_sync(&sdp->delay_work);
- flush_work(&sdp->work);
- }
+ del_timer_sync(&sdp->delay_work);
+ flush_work(&sdp->work);
+ if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
+ return; /* Forgot srcu_barrier(), so just leak it! */
}
if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(ssp))) {
@@ -397,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
free_percpu(ssp->sda);
ssp->sda = NULL;
}
-EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
* Counts the new reader in the appropriate per-CPU element of the
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 911bd9076d43..477b4eb44af5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -52,7 +52,7 @@ void rcu_qs(void)
local_irq_save(flags);
if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
- raise_softirq(RCU_SOFTIRQ);
+ raise_softirq_irqoff(RCU_SOFTIRQ);
}
local_irq_restore(flags);
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index acd6ccf56faf..ec77ec336f58 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,11 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
-/* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
-/* Commandeer a sysrq key to dump RCU's tree. */
-static bool sysrq_rcu;
-module_param(sysrq_rcu, bool, 0444);
/*
* The rcu_scheduler_active variable is initialized to the value
@@ -149,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
-module_param(kthread_prio, int, 0644);
+module_param(kthread_prio, int, 0444);
/* Delay in jiffies for grace-period initialization delays, debug only. */
@@ -406,7 +401,7 @@ static bool rcu_kick_kthreads;
*/
static ulong jiffies_till_sched_qs = ULONG_MAX;
module_param(jiffies_till_sched_qs, ulong, 0444);
-static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */
+static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
/*
@@ -424,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void)
WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
return;
}
+ /* Otherwise, set to third fqs scan, but bound below on large system. */
j = READ_ONCE(jiffies_till_first_fqs) +
2 * READ_ONCE(jiffies_till_next_fqs);
if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
@@ -513,74 +509,6 @@ static const char *gp_state_getname(short gs)
}
/*
- * Show the state of the grace-period kthreads.
- */
-void show_rcu_gp_kthreads(void)
-{
- int cpu;
- unsigned long j;
- unsigned long ja;
- unsigned long jr;
- unsigned long jw;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- j = jiffies;
- ja = j - READ_ONCE(rcu_state.gp_activity);
- jr = j - READ_ONCE(rcu_state.gp_req_activity);
- jw = j - READ_ONCE(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
- rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state,
- rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
- ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
- (long)READ_ONCE(rcu_state.gp_seq),
- (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
- READ_ONCE(rcu_state.gp_flags));
- rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
- continue;
- pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
- (long)rnp->gp_seq_needed);
- if (!rcu_is_leaf_node(rnp))
- continue;
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->gpwrap ||
- ULONG_CMP_GE(rcu_state.gp_seq,
- rdp->gp_seq_needed))
- continue;
- pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)rdp->gp_seq_needed);
- }
- }
- /* sched_show_task(rcu_state.gp_kthread); */
-}
-EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
-
-/* Dump grace-period-request information due to commandeered sysrq. */
-static void sysrq_show_rcu(int key)
-{
- show_rcu_gp_kthreads();
-}
-
-static struct sysrq_key_op sysrq_rcudump_op = {
- .handler = sysrq_show_rcu,
- .help_msg = "show-rcu(y)",
- .action_msg = "Show RCU tree",
- .enable_mask = SYSRQ_ENABLE_DUMP,
-};
-
-static int __init rcu_sysrq_init(void)
-{
- if (sysrq_rcu)
- return register_sysrq_key('y', &sysrq_rcudump_op);
- return 0;
-}
-early_initcall(rcu_sysrq_init);
-
-/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -1034,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
}
/*
- * Handler for the irq_work request posted when a grace period has
- * gone on for too long, but not yet long enough for an RCU CPU
- * stall warning. Set state appropriately, but just complain if
- * there is unexpected state on entry.
- */
-static void rcu_iw_handler(struct irq_work *iwp)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- rdp = container_of(iwp, struct rcu_data, rcu_iw);
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp);
- if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
- rdp->rcu_iw_gp_seq = rnp->gp_seq;
- rdp->rcu_iw_pending = false;
- }
- raw_spin_unlock_rcu_node(rnp);
-}
-
-/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1167,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 0;
}
-static void record_gp_stall_check_time(void)
-{
- unsigned long j = jiffies;
- unsigned long j1;
-
- rcu_state.gp_start = j;
- j1 = rcu_jiffies_till_stall_check();
- /* Record ->gp_start before ->jiffies_stall. */
- smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
- rcu_state.jiffies_resched = j + j1 / 2;
- rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
-}
-
-/*
- * Complain about starvation of grace-period kthread.
- */
-static void rcu_check_gp_kthread_starvation(void)
-{
- struct task_struct *gpk = rcu_state.gp_kthread;
- unsigned long j;
-
- j = jiffies - READ_ONCE(rcu_state.gp_activity);
- if (j > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
- rcu_state.name, j,
- (long)rcu_seq_current(&rcu_state.gp_seq),
- READ_ONCE(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
- if (gpk) {
- pr_err("RCU grace-period kthread stack dump:\n");
- sched_show_task(gpk);
- wake_up_process(gpk);
- }
- }
-}
-
-/*
- * Dump stacks of all tasks running on stalled CPUs. First try using
- * NMIs, but fall back to manual remote stack tracing on architectures
- * that don't support NMI-based stack dumps. The NMI-triggered stack
- * traces are more accurate because they are printed by the target CPU.
- */
-static void rcu_dump_cpu_stacks(void)
-{
- int cpu;
- unsigned long flags;
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-}
-
-/*
- * If too much time has passed in the current grace period, and if
- * so configured, go kick the relevant kthreads.
- */
-static void rcu_stall_kick_kthreads(void)
-{
- unsigned long j;
-
- if (!rcu_kick_kthreads)
- return;
- j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
- if (time_after(jiffies, j) && rcu_state.gp_kthread &&
- (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
- WARN_ONCE(1, "Kicking %s grace-period kthread\n",
- rcu_state.name);
- rcu_ftrace_dump(DUMP_ALL);
- wake_up_process(rcu_state.gp_kthread);
- WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
- }
-}
-
-static void panic_on_rcu_stall(void)
-{
- if (sysctl_panic_on_rcu_stall)
- panic("RCU Stall\n");
-}
-
-static void print_other_cpu_stall(unsigned long gp_seq)
-{
- int cpu;
- unsigned long flags;
- unsigned long gpa;
- unsigned long j;
- int ndetected = 0;
- struct rcu_node *rnp = rcu_get_root();
- long totqlen = 0;
-
- /* Kick and suppress, if so configured. */
- rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
- return;
-
- /*
- * OK, time to rat on our buddy...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name);
- print_cpu_stall_info_begin();
- rcu_for_each_leaf_node(rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ndetected += rcu_print_task_stall(rnp);
- if (rnp->qsmask != 0) {
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
- print_cpu_stall_info(cpu);
- ndetected++;
- }
- }
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
- smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
- if (ndetected) {
- rcu_dump_cpu_stacks();
-
- /* Complain about tasks blocking the grace period. */
- rcu_print_detail_task_stall();
- } else {
- if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
- pr_err("INFO: Stall ended before state dump start\n");
- } else {
- j = jiffies;
- gpa = READ_ONCE(rcu_state.gp_activity);
- pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
- rcu_state.name, j - gpa, j, gpa,
- READ_ONCE(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
- /* In this case, the current CPU might be at fault. */
- sched_show_task(current);
- }
- }
- /* Rewrite if needed in case of slow consoles. */
- if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
- WRITE_ONCE(rcu_state.jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
-
- rcu_check_gp_kthread_starvation();
-
- panic_on_rcu_stall();
-
- rcu_force_quiescent_state(); /* Kick them all. */
-}
-
-static void print_cpu_stall(void)
-{
- int cpu;
- unsigned long flags;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp = rcu_get_root();
- long totqlen = 0;
-
- /* Kick and suppress, if so configured. */
- rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
- return;
-
- /*
- * OK, time to rat on ourselves...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s self-detected stall on CPU", rcu_state.name);
- print_cpu_stall_info_begin();
- raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
- print_cpu_stall_info(smp_processor_id());
- raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
- jiffies - rcu_state.gp_start,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
-
- rcu_check_gp_kthread_starvation();
-
- rcu_dump_cpu_stacks();
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- /* Rewrite if needed in case of slow consoles. */
- if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
- WRITE_ONCE(rcu_state.jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- panic_on_rcu_stall();
-
- /*
- * Attempt to revive the RCU machinery by forcing a context switch.
- *
- * A context switch would normally allow the RCU state machine to make
- * progress and it could be we're stuck in kernel space without context
- * switches for an entirely unreasonable amount of time.
- */
- set_tsk_need_resched(current);
- set_preempt_need_resched();
-}
-
-static void check_cpu_stall(struct rcu_data *rdp)
-{
- unsigned long gs1;
- unsigned long gs2;
- unsigned long gps;
- unsigned long j;
- unsigned long jn;
- unsigned long js;
- struct rcu_node *rnp;
-
- if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
- !rcu_gp_in_progress())
- return;
- rcu_stall_kick_kthreads();
- j = jiffies;
-
- /*
- * Lots of memory barriers to reject false positives.
- *
- * The idea is to pick up rcu_state.gp_seq, then
- * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
- * another copy of rcu_state.gp_seq. These values are updated in
- * the opposite order with memory barriers (or equivalent) during
- * grace-period initialization and cleanup. Now, a false positive
- * can occur if we get an new value of rcu_state.gp_start and a old
- * value of rcu_state.jiffies_stall. But given the memory barriers,
- * the only way that this can happen is if one grace period ends
- * and another starts between these two fetches. This is detected
- * by comparing the second fetch of rcu_state.gp_seq with the
- * previous fetch from rcu_state.gp_seq.
- *
- * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
- * and rcu_state.gp_start suffice to forestall false positives.
- */
- gs1 = READ_ONCE(rcu_state.gp_seq);
- smp_rmb(); /* Pick up ->gp_seq first... */
- js = READ_ONCE(rcu_state.jiffies_stall);
- smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = READ_ONCE(rcu_state.gp_start);
- smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
- gs2 = READ_ONCE(rcu_state.gp_seq);
- if (gs1 != gs2 ||
- ULONG_CMP_LT(j, js) ||
- ULONG_CMP_GE(gps, js))
- return; /* No stall or GP completed since entering function. */
- rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
- if (rcu_gp_in_progress() &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall();
-
- } else if (rcu_gp_in_progress() &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2);
- }
-}
-
-/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
- *
- * The caller must disable hard irqs.
- */
-void rcu_cpu_stall_reset(void)
-{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
-}
-
/* Trace-event wrapper function for trace_rcu_future_grace_period. */
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long gp_seq_req, const char *s)
@@ -1585,7 +1203,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
static void rcu_gp_kthread_wake(void)
{
if ((current == rcu_state.gp_kthread &&
- !in_interrupt() && !in_serving_softirq()) ||
+ !in_irq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) ||
!rcu_state.gp_kthread)
return;
@@ -2295,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
+ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
- rdp->core_needs_qs = false;
-
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
@@ -2548,11 +2165,11 @@ void rcu_sched_clock_irq(int user)
}
/*
- * Scan the leaf rcu_node structures, processing dyntick state for any that
- * have not yet encountered a quiescent state, using the function specified.
- * Also initiate boosting for any threads blocked on the root rcu_node.
- *
- * The caller must have suppressed start of new grace periods.
+ * Scan the leaf rcu_node structures. For each structure on which all
+ * CPUs have reported a quiescent state and on which there are tasks
+ * blocking the current grace period, initiate RCU priority boosting.
+ * Otherwise, invoke the specified function to check dyntick state for
+ * each CPU that has not yet reported a quiescent state.
*/
static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
@@ -2635,101 +2252,6 @@ void rcu_force_quiescent_state(void)
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-/*
- * This function checks for grace-period requests that fail to motivate
- * RCU to come out of its idle mode.
- */
-void
-rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
- const unsigned long gpssdelay)
-{
- unsigned long flags;
- unsigned long j;
- struct rcu_node *rnp_root = rcu_get_root();
- static atomic_t warned = ATOMIC_INIT(0);
-
- if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
- return;
- j = jiffies; /* Expensive access, and in common case don't get here. */
- if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
- time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
- atomic_read(&warned))
- return;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- j = jiffies;
- if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
- time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
- time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
- atomic_read(&warned)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- /* Hold onto the leaf lock to make others see warned==1. */
-
- if (rnp_root != rnp)
- raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
- j = jiffies;
- if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
- time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
- time_before(j, rcu_state.gp_activity + gpssdelay) ||
- atomic_xchg(&warned, 1)) {
- raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- WARN_ON(1);
- if (rnp_root != rnp)
- raw_spin_unlock_rcu_node(rnp_root);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- show_rcu_gp_kthreads();
-}
-
-/*
- * Do a forward-progress check for rcutorture. This is normally invoked
- * due to an OOM event. The argument "j" gives the time period during
- * which rcutorture would like progress to have been made.
- */
-void rcu_fwd_progress_check(unsigned long j)
-{
- unsigned long cbs;
- int cpu;
- unsigned long max_cbs = 0;
- int max_cpu = -1;
- struct rcu_data *rdp;
-
- if (rcu_gp_in_progress()) {
- pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
- show_rcu_gp_kthreads();
- } else {
- pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
- preempt_disable();
- rdp = this_cpu_ptr(&rcu_data);
- rcu_check_gp_start_stall(rdp->mynode, rdp, j);
- preempt_enable();
- }
- for_each_possible_cpu(cpu) {
- cbs = rcu_get_n_cbs_cpu(cpu);
- if (!cbs)
- continue;
- if (max_cpu < 0)
- pr_info("%s: callbacks", __func__);
- pr_cont(" %d: %lu", cpu, cbs);
- if (cbs <= max_cbs)
- continue;
- max_cbs = cbs;
- max_cpu = cpu;
- }
- if (max_cpu >= 0)
- pr_cont("\n");
-}
-EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
-
/* Perform RCU core processing work for the current CPU. */
static __latent_entropy void rcu_core(struct softirq_action *unused)
{
@@ -3559,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_expedite_gp();
+ rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
break;
default:
break;
@@ -3742,8 +3262,7 @@ static void __init rcu_init_geometry(void)
jiffies_till_first_fqs = d;
if (jiffies_till_next_fqs == ULONG_MAX)
jiffies_till_next_fqs = d;
- if (jiffies_till_sched_qs == ULONG_MAX)
- adjust_jiffies_till_sched_qs();
+ adjust_jiffies_till_sched_qs();
/* If the compile-time values are accurate, just leave. */
if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
@@ -3858,5 +3377,6 @@ void __init rcu_init(void)
srcu_init();
}
+#include "tree_stall.h"
#include "tree_exp.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bb4f995f2d3f..e253d11af3c4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
int rcu_dynticks_snap(struct rcu_data *rdp);
-/* Forward declarations for rcutree_plugin.h */
+/* Forward declarations for tree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_print_detail_task_stall(void);
-static int rcu_print_task_stall(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
@@ -418,9 +416,6 @@ static void rcu_prepare_for_idle(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
-static void print_cpu_stall_info_begin(void);
-static void print_cpu_stall_info(int cpu);
-static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static bool rcu_nocb_cpu_needs_barrier(int cpu);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
@@ -445,3 +440,10 @@ static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
+
+/* Forward declarations for tree_stall.h */
+static void record_gp_stall_check_time(void);
+static void rcu_iw_handler(struct irq_work *iwp);
+static void check_cpu_stall(struct rcu_data *rdp);
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4c2a0189e748..9c990df880d1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -10,6 +10,7 @@
#include <linux/lockdep.h>
static void rcu_exp_handler(void *unused);
+static int rcu_print_task_exp_stall(struct rcu_node *rnp);
/*
* Record the start of an expedited grace period.
@@ -633,7 +634,7 @@ static void rcu_exp_handler(void *unused)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
rdp->deferred_qs = true;
- WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);
+ t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
@@ -648,7 +649,7 @@ static void rcu_exp_handler(void *unused)
*
* If the CPU is fully enabled (or if some buggy RCU-preempt
* read-side critical section is being used from idle), just
- * invoke rcu_preempt_defer_qs() to immediately report the
+ * invoke rcu_preempt_deferred_qs() to immediately report the
* quiescent state. We cannot use rcu_read_unlock_special()
* because we are in an interrupt handler, which will cause that
* function to take an early exit without doing anything.
@@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu)
{
}
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each that is blocking the current
+ * expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rnp->exp_tasks)
+ return 0;
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ return ndetected;
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu)
WARN_ON_ONCE(ret);
}
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections that are
+ * blocking the current expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/**
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 97dba50f6fb2..1102765f91fd 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -285,7 +285,7 @@ static void rcu_qs(void)
TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
- current->rcu_read_unlock_special.b.need_qs = false;
+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
}
}
@@ -643,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period on the specified rcu_node structure.
- */
-static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
-{
- unsigned long flags;
- struct task_struct *t;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (!rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- t = list_entry(rnp->gp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- /*
- * We could be printing a lot while holding a spinlock.
- * Avoid triggering hard lockup.
- */
- touch_nmi_watchdog();
- sched_show_task(t);
- }
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-}
-
-/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period.
- */
-static void rcu_print_detail_task_stall(void)
-{
- struct rcu_node *rnp = rcu_get_root();
-
- rcu_print_detail_task_stall_rnp(rnp);
- rcu_for_each_leaf_node(rnp)
- rcu_print_detail_task_stall_rnp(rnp);
-}
-
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
- pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
- rnp->level, rnp->grplo, rnp->grphi);
-}
-
-static void rcu_print_task_stall_end(void)
-{
- pr_cont("\n");
-}
-
-/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- struct task_struct *t;
- int ndetected = 0;
-
- if (!rcu_preempt_blocked_readers_cgp(rnp))
- return 0;
- rcu_print_task_stall_begin(rnp);
- t = list_entry(rnp->gp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" P%d", t->pid);
- ndetected++;
- }
- rcu_print_task_stall_end();
- return ndetected;
-}
-
-/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each that is blocking the current
- * expedited grace period.
- */
-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
-{
- struct task_struct *t;
- int ndetected = 0;
-
- if (!rnp->exp_tasks)
- return 0;
- t = list_entry(rnp->exp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" P%d", t->pid);
- ndetected++;
- }
- return ndetected;
-}
-
-/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
@@ -804,19 +710,25 @@ static void rcu_flavor_sched_clock_irq(int user)
/*
* Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
+ * critical section, clean up if so. No need to issue warnings, as
+ * debug_check_no_locks_held() already does this if lockdep is enabled.
+ * Besides, if this function does anything other than just immediately
+ * return, there was a bug of some sort. Spewing warnings from this
+ * function is like as not to simply obscure important prior warnings.
*/
void exit_rcu(void)
{
struct task_struct *t = current;
- if (likely(list_empty(&current->rcu_node_entry)))
+ if (unlikely(!list_empty(&current->rcu_node_entry))) {
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
+ } else if (unlikely(t->rcu_read_lock_nesting)) {
+ t->rcu_read_lock_nesting = 1;
+ } else {
return;
- t->rcu_read_lock_nesting = 1;
- barrier();
- t->rcu_read_unlock_special.b.blocked = true;
+ }
__rcu_read_unlock();
rcu_preempt_deferred_qs(current);
}
@@ -980,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
static void rcu_preempt_deferred_qs(struct task_struct *t) { }
/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static void rcu_print_detail_task_stall(void)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections that are
- * blocking the current expedited grace period.
- */
-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
-{
- return 0;
-}
-
-/*
* Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
@@ -1185,8 +1070,6 @@ static int rcu_boost_kthread(void *arg)
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
- struct task_struct *t;
-
raw_lockdep_assert_held_rcu_node(rnp);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1200,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (rnp->exp_tasks == NULL)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- t = rnp->boost_kthread_task;
- if (t)
- rcu_wake_cond(t, rnp->boost_kthread_status);
+ rcu_wake_cond(rnp->boost_kthread_task,
+ rnp->boost_kthread_status);
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -1649,98 +1531,6 @@ static void rcu_cleanup_after_idle(void)
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- ".l"[rdp->all_lazy],
- ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
- ".D"[!rdp->tick_nohz_enabled_snap]);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-/* Initiate the stall-info list. */
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont("\n");
-}
-
-/*
- * Print out diagnostic information for the specified stalled CPU.
- *
- * If the specified CPU is aware of the current RCU grace period, then
- * print the number of scheduling clock interrupts the CPU has taken
- * during the time that it has been aware. Otherwise, print the number
- * of RCU grace periods that this CPU is ignorant of, for example, "1"
- * if the CPU was aware of the previous grace period.
- *
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
- */
-static void print_cpu_stall_info(int cpu)
-{
- unsigned long delta;
- char fast_no_hz[72];
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- char *ticks_title;
- unsigned long ticks_value;
-
- /*
- * We could be printing a lot while holding a spinlock. Avoid
- * triggering hard lockup.
- */
- touch_nmi_watchdog();
-
- ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
- if (ticks_value) {
- ticks_title = "GPs behind";
- } else {
- ticks_title = "ticks this GP";
- ticks_value = rdp->ticks_this_gp;
- }
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
- cpu,
- "O."[!!cpu_online(cpu)],
- "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
- "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
- !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
- rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
- "!."[!delta],
- ticks_value, ticks_title,
- rcu_dynticks_snap(rdp) & 0xfff,
- rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
- rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
- READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz);
-}
-
-/* Terminate the stall-info list. */
-static void print_cpu_stall_info_end(void)
-{
- pr_err("\t");
-}
-
-/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
- rdp->ticks_this_gp = 0;
- rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
- WRITE_ONCE(rdp->last_fqs_resched, jiffies);
-}
-
#ifdef CONFIG_RCU_NOCB_CPU
/*
@@ -1766,11 +1556,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
*/
-/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
+ * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
+ * given, a warning is emitted and all CPUs are offloaded.
+ */
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- cpulist_parse(str, rcu_nocb_mask);
+ if (!strcasecmp(str, "all"))
+ cpumask_setall(rcu_nocb_mask);
+ else
+ if (cpulist_parse(str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
new file mode 100644
index 000000000000..f65a73a97323
--- /dev/null
+++ b/kernel/rcu/tree_stall.h
@@ -0,0 +1,709 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RCU CPU stall warnings for normal RCU grace periods
+ *
+ * Copyright IBM Corporation, 2019
+ *
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Controlling CPU stall warnings, including delay calculation.
+
+/* panic() on RCU Stall sysctl. */
+int sysctl_panic_on_rcu_stall __read_mostly;
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+/* Limit-check stall timeouts specified at boottime and runtime. */
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 3);
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 300);
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
+
+/* Don't do RCU CPU stall warnings during long sysrq printouts. */
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
+/* Don't print RCU CPU stall warnings during a kernel panic. */
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
+static void panic_on_rcu_stall(void)
+{
+ if (sysctl_panic_on_rcu_stall)
+ panic("RCU Stall\n");
+}
+
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Interaction with RCU grace periods
+
+/* Start of new grace period, so record stall time (and forcing times). */
+static void record_gp_stall_check_time(void)
+{
+ unsigned long j = jiffies;
+ unsigned long j1;
+
+ rcu_state.gp_start = j;
+ j1 = rcu_jiffies_till_stall_check();
+ /* Record ->gp_start before ->jiffies_stall. */
+ smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
+ rcu_state.jiffies_resched = j + j1 / 2;
+ rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
+}
+
+/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+ rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
+ WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+}
+
+/*
+ * If too much time has passed in the current grace period, and if
+ * so configured, go kick the relevant kthreads.
+ */
+static void rcu_stall_kick_kthreads(void)
+{
+ unsigned long j;
+
+ if (!rcu_kick_kthreads)
+ return;
+ j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
+ if (time_after(jiffies, j) && rcu_state.gp_kthread &&
+ (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
+ WARN_ONCE(1, "Kicking %s grace-period kthread\n",
+ rcu_state.name);
+ rcu_ftrace_dump(DUMP_ALL);
+ wake_up_process(rcu_state.gp_kthread);
+ WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
+ }
+}
+
+/*
+ * Handler for the irq_work request posted about halfway into the RCU CPU
+ * stall timeout, and used to detect excessive irq disabling. Set state
+ * appropriately, but just complain if there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gp_seq = rnp->gp_seq;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Printing RCU CPU stall warnings
+
+#ifdef CONFIG_PREEMPT
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period on the specified rcu_node structure.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return 0;
+ pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
+ rnp->level, rnp->grplo, rnp->grphi);
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ pr_cont("\n");
+ return ndetected;
+}
+
+#else /* #ifdef CONFIG_PREEMPT */
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+#endif /* #else #ifdef CONFIG_PREEMPT */
+
+/*
+ * Dump stacks of all tasks running on stalled CPUs. First try using
+ * NMIs, but fall back to manual remote stack tracing on architectures
+ * that don't support NMI-based stack dumps. The NMI-triggered stack
+ * traces are more accurate because they are printed by the target CPU.
+ */
+static void rcu_dump_cpu_stacks(void)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
+#ifdef CONFIG_RCU_FAST_NO_HZ
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+
+ sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
+ rdp->last_accelerate & 0xffff, jiffies & 0xffff,
+ ".l"[rdp->all_lazy],
+ ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
+ ".D"[!!rdp->tick_nohz_enabled_snap]);
+}
+
+#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ *cp = '\0';
+}
+
+#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+/*
+ * Print out diagnostic information for the specified stalled CPU.
+ *
+ * If the specified CPU is aware of the current RCU grace period, then
+ * print the number of scheduling clock interrupts the CPU has taken
+ * during the time that it has been aware. Otherwise, print the number
+ * of RCU grace periods that this CPU is ignorant of, for example, "1"
+ * if the CPU was aware of the previous grace period.
+ *
+ * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ */
+static void print_cpu_stall_info(int cpu)
+{
+ unsigned long delta;
+ char fast_no_hz[72];
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ char *ticks_title;
+ unsigned long ticks_value;
+
+ /*
+ * We could be printing a lot while holding a spinlock. Avoid
+ * triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+
+ ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
+ if (ticks_value) {
+ ticks_title = "GPs behind";
+ } else {
+ ticks_title = "ticks this GP";
+ ticks_value = rdp->ticks_this_gp;
+ }
+ print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
+ delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
+ cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
+ "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
+ ticks_value, ticks_title,
+ rcu_dynticks_snap(rdp) & 0xfff,
+ rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
+ fast_no_hz);
+}
+
+/* Complain about starvation of grace-period kthread. */
+static void rcu_check_gp_kthread_starvation(void)
+{
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ unsigned long j;
+
+ j = jiffies - READ_ONCE(rcu_state.gp_activity);
+ if (j > 2 * HZ) {
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+ rcu_state.name, j,
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ READ_ONCE(rcu_state.gp_flags),
+ gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
+ gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
+ if (gpk) {
+ pr_err("RCU grace-period kthread stack dump:\n");
+ sched_show_task(gpk);
+ wake_up_process(gpk);
+ }
+ }
+}
+
+static void print_other_cpu_stall(unsigned long gp_seq)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
+ int ndetected = 0;
+ struct rcu_node *rnp;
+ long totqlen = 0;
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_cpu_stall_suppress)
+ return;
+
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ ndetected += rcu_print_task_stall(rnp);
+ if (rnp->qsmask != 0) {
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ print_cpu_stall_info(cpu);
+ ndetected++;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ if (ndetected) {
+ rcu_dump_cpu_stacks();
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_for_each_leaf_node(rnp)
+ rcu_print_detail_task_stall_rnp(rnp);
+ } else {
+ if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = READ_ONCE(rcu_state.gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
+ rcu_state.name, j - gpa, j, gpa,
+ READ_ONCE(jiffies_till_next_fqs),
+ rcu_get_root()->qsmask);
+ /* In this case, the current CPU might be at fault. */
+ sched_show_task(current);
+ }
+ }
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+
+ rcu_check_gp_kthread_starvation();
+
+ panic_on_rcu_stall();
+
+ rcu_force_quiescent_state(); /* Kick them all. */
+}
+
+static void print_cpu_stall(void)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rcu_get_root();
+ long totqlen = 0;
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_cpu_stall_suppress)
+ return;
+
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
+ print_cpu_stall_info(smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ jiffies - rcu_state.gp_start,
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+
+ rcu_check_gp_kthread_starvation();
+
+ rcu_dump_cpu_stacks();
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ panic_on_rcu_stall();
+
+ /*
+ * Attempt to revive the RCU machinery by forcing a context switch.
+ *
+ * A context switch would normally allow the RCU state machine to make
+ * progress and it could be we're stuck in kernel space without context
+ * switches for an entirely unreasonable amount of time.
+ */
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
+static void check_cpu_stall(struct rcu_data *rdp)
+{
+ unsigned long gs1;
+ unsigned long gs2;
+ unsigned long gps;
+ unsigned long j;
+ unsigned long jn;
+ unsigned long js;
+ struct rcu_node *rnp;
+
+ if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
+ !rcu_gp_in_progress())
+ return;
+ rcu_stall_kick_kthreads();
+ j = jiffies;
+
+ /*
+ * Lots of memory barriers to reject false positives.
+ *
+ * The idea is to pick up rcu_state.gp_seq, then
+ * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
+ * another copy of rcu_state.gp_seq. These values are updated in
+ * the opposite order with memory barriers (or equivalent) during
+ * grace-period initialization and cleanup. Now, a false positive
+ * can occur if we get an new value of rcu_state.gp_start and a old
+ * value of rcu_state.jiffies_stall. But given the memory barriers,
+ * the only way that this can happen is if one grace period ends
+ * and another starts between these two fetches. This is detected
+ * by comparing the second fetch of rcu_state.gp_seq with the
+ * previous fetch from rcu_state.gp_seq.
+ *
+ * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
+ * and rcu_state.gp_start suffice to forestall false positives.
+ */
+ gs1 = READ_ONCE(rcu_state.gp_seq);
+ smp_rmb(); /* Pick up ->gp_seq first... */
+ js = READ_ONCE(rcu_state.jiffies_stall);
+ smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+ gps = READ_ONCE(rcu_state.gp_start);
+ smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
+ gs2 = READ_ONCE(rcu_state.gp_seq);
+ if (gs1 != gs2 ||
+ ULONG_CMP_LT(j, js) ||
+ ULONG_CMP_GE(gps, js))
+ return; /* No stall or GP completed since entering function. */
+ rnp = rdp->mynode;
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ if (rcu_gp_in_progress() &&
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall();
+
+ } else if (rcu_gp_in_progress() &&
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
+ cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU forward-progress mechanisms, including of callback invocation.
+
+
+/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ int cpu;
+ unsigned long j;
+ unsigned long ja;
+ unsigned long jr;
+ unsigned long jw;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ j = jiffies;
+ ja = j - READ_ONCE(rcu_state.gp_activity);
+ jr = j - READ_ONCE(rcu_state.gp_req_activity);
+ jw = j - READ_ONCE(rcu_state.gp_wake_time);
+ pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ rcu_state.name, gp_state_getname(rcu_state.gp_state),
+ rcu_state.gp_state,
+ rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
+ ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
+ (long)READ_ONCE(rcu_state.gp_seq),
+ (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
+ READ_ONCE(rcu_state.gp_flags));
+ rcu_for_each_node_breadth_first(rnp) {
+ if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
+ continue;
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
+ rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
+ (long)rnp->gp_seq_needed);
+ if (!rcu_is_leaf_node(rnp))
+ continue;
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->gpwrap ||
+ ULONG_CMP_GE(rcu_state.gp_seq,
+ rdp->gp_seq_needed))
+ continue;
+ pr_info("\tcpu %d ->gp_seq_needed %ld\n",
+ cpu, (long)rdp->gp_seq_needed);
+ }
+ }
+ /* sched_show_task(rcu_state.gp_kthread); */
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
+ * This function checks for grace-period requests that fail to motivate
+ * RCU to come out of its idle mode.
+ */
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay)
+{
+ unsigned long flags;
+ unsigned long j;
+ struct rcu_node *rnp_root = rcu_get_root();
+ static atomic_t warned = ATOMIC_INIT(0);
+
+ if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
+ return;
+ j = jiffies; /* Expensive access, and in common case don't get here. */
+ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned))
+ return;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
+ time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ /* Hold onto the leaf lock to make others see warned==1. */
+
+ if (rnp_root != rnp)
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
+ time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
+ time_before(j, rcu_state.gp_activity + gpssdelay) ||
+ atomic_xchg(&warned, 1)) {
+ raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ WARN_ON(1);
+ if (rnp_root != rnp)
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ show_rcu_gp_kthreads();
+}
+
+/*
+ * Do a forward-progress check for rcutorture. This is normally invoked
+ * due to an OOM event. The argument "j" gives the time period during
+ * which rcutorture would like progress to have been made.
+ */
+void rcu_fwd_progress_check(unsigned long j)
+{
+ unsigned long cbs;
+ int cpu;
+ unsigned long max_cbs = 0;
+ int max_cpu = -1;
+ struct rcu_data *rdp;
+
+ if (rcu_gp_in_progress()) {
+ pr_info("%s: GP age %lu jiffies\n",
+ __func__, jiffies - rcu_state.gp_start);
+ show_rcu_gp_kthreads();
+ } else {
+ pr_info("%s: Last GP end %lu jiffies ago\n",
+ __func__, jiffies - rcu_state.gp_end);
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_data);
+ rcu_check_gp_start_stall(rdp->mynode, rdp, j);
+ preempt_enable();
+ }
+ for_each_possible_cpu(cpu) {
+ cbs = rcu_get_n_cbs_cpu(cpu);
+ if (!cbs)
+ continue;
+ if (max_cpu < 0)
+ pr_info("%s: callbacks", __func__);
+ pr_cont(" %d: %lu", cpu, cbs);
+ if (cbs <= max_cbs)
+ continue;
+ max_cbs = cbs;
+ max_cpu = cpu;
+ }
+ if (max_cpu >= 0)
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
+
+/* Commandeer a sysrq key to dump RCU's tree. */
+static bool sysrq_rcu;
+module_param(sysrq_rcu, bool, 0444);
+
+/* Dump grace-period-request information due to commandeered sysrq. */
+static void sysrq_show_rcu(int key)
+{
+ show_rcu_gp_kthreads();
+}
+
+static struct sysrq_key_op sysrq_rcudump_op = {
+ .handler = sysrq_show_rcu,
+ .help_msg = "show-rcu(y)",
+ .action_msg = "Show RCU tree",
+ .enable_mask = SYSRQ_ENABLE_DUMP,
+};
+
+static int __init rcu_sysrq_init(void)
+{
+ if (sysrq_rcu)
+ return register_sysrq_key('y', &sysrq_rcudump_op);
+ return 0;
+}
+early_initcall(rcu_sysrq_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index cbaa976c5945..c3bf44ba42e5 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#endif
#ifdef CONFIG_RCU_STALL_COMMON
-
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
-#endif
-
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
-static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
-
module_param(rcu_cpu_stall_suppress, int, 0644);
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
-
-int rcu_jiffies_till_stall_check(void)
-{
- int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
-
- /*
- * Limit check must be consistent with the Kconfig limits
- * for CONFIG_RCU_CPU_STALL_TIMEOUT.
- */
- if (till_stall_check < 3) {
- WRITE_ONCE(rcu_cpu_stall_timeout, 3);
- till_stall_check = 3;
- } else if (till_stall_check > 300) {
- WRITE_ONCE(rcu_cpu_stall_timeout, 300);
- till_stall_check = 300;
- }
- return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
-}
-EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
-
-void rcu_sysrq_start(void)
-{
- if (!rcu_cpu_stall_suppress)
- rcu_cpu_stall_suppress = 2;
-}
-
-void rcu_sysrq_end(void)
-{
- if (rcu_cpu_stall_suppress == 2)
- rcu_cpu_stall_suppress = 0;
-}
-
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static int __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
- return 0;
-}
-early_initcall(check_cpu_stall_init);
-
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/resource.c b/kernel/resource.c
index 92190f62ebc5..8c15f846e8ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram);
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
unsigned long desc)
{
- resource_size_t end = start + size - 1;
+ struct resource res;
int type = 0; int other = 0;
struct resource *p;
+ res.start = start;
+ res.end = start + size - 1;
+
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
bool is_type = (((p->flags & flags) == flags) &&
((desc == IORES_DESC_NONE) ||
(desc == p->desc)));
- if (start >= p->start && start <= p->end)
- is_type ? type++ : other++;
- if (end >= p->start && end <= p->end)
- is_type ? type++ : other++;
- if (p->start >= start && p->end <= end)
+ if (resource_overlaps(p, &res))
is_type ? type++ : other++;
}
read_unlock(&resource_lock);
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 25e9a7b60eba..9424ee90589e 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)
* - signal delivery,
* and return to user-space.
*
- * This is how we can ensure that the entire rseq critical section,
- * consisting of both the C part and the assembly instruction sequence,
+ * This is how we can ensure that the entire rseq critical section
* will issue the commit instruction only if executed atomically with
* respect to other threads scheduled on the same CPU, and with respect
* to signal handlers.
@@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (current->rseq_len != rseq_len)
+ if (rseq_len != sizeof(*rseq))
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (ret)
return ret;
current->rseq = NULL;
- current->rseq_len = 0;
current->rseq_sig = 0;
return 0;
}
@@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || current->rseq_len != rseq_len)
+ if (current->rseq != rseq || rseq_len != sizeof(*rseq))
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
- current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4778c48a7fda..ade3f2287d1f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1151,7 +1151,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
return 0;
} else if (task_on_rq_queued(p)) {
/*
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 5c41ea367422..3638d2377e3c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -771,6 +771,7 @@ out:
return 0;
fail:
+ kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
sugov_tunables_free(tunables);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
- WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
* If the "0-lag time" already passed, decrease the active
* utilization now, instead of starting a timer
*/
- if (zerolag_time < 0) {
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdab7eb6f351..35f3ea375084 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+ /* Avoid time going backwards, prevent potential divide error: */
+ if (unlikely((s64)*period < 0))
+ *period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
@@ -4885,6 +4889,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -4892,6 +4898,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;
raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
@@ -4899,6 +4906,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ new = (old * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota = div64_u64(cfs_b->quota, old);
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
@@ -7784,10 +7813,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -7797,7 +7826,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 54a0347ca812..3582eeb59893 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -149,7 +149,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
sd->nr = syscall_get_nr(task, regs);
sd->arch = syscall_get_arch();
- syscall_get_arguments(task, regs, 0, 6, args);
+ syscall_get_arguments(task, regs, args);
sd->args[0] = args[0];
sd->args[1] = args[1];
sd->args[2] = args[2];
@@ -502,7 +502,10 @@ out:
*
* Caller must be holding current->sighand->siglock lock.
*
- * Returns 0 on success, -ve on error.
+ * Returns 0 on success, -ve on error, or
+ * - in TSYNC mode: the pid of a thread which was either not in the correct
+ * seccomp mode or did not have an ancestral seccomp filter
+ * - in NEW_LISTENER mode: the fd of the new listener
*/
static long seccomp_attach_filter(unsigned int flags,
struct seccomp_filter *filter)
@@ -1258,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags,
if (flags & ~SECCOMP_FILTER_FLAG_MASK)
return -EINVAL;
+ /*
+ * In the successful case, NEW_LISTENER returns the new listener fd.
+ * But in the failure case, TSYNC returns the thread that died. If you
+ * combine these two flags, there's no way to tell whether something
+ * succeeded or failed. So, let's disallow this combination.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
+ (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
@@ -1304,7 +1317,7 @@ out:
mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
- if (ret < 0) {
+ if (ret) {
listener_f->private_data = NULL;
fput(listener_f);
put_unused_fd(listener);
diff --git a/kernel/signal.c b/kernel/signal.c
index b7953934aa99..227ba170298e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3581,7 +3581,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (flags)
return -EINVAL;
- f = fdget_raw(pidfd);
+ f = fdget(pidfd);
if (!f.file)
return -EBADF;
@@ -3605,16 +3605,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (unlikely(sig != kinfo.si_signo))
goto err;
+ /* Only allow sending arbitrary signals to yourself. */
+ ret = -EPERM;
if ((task_pid(current) != pid) &&
- (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) {
- /* Only allow sending arbitrary signals to yourself. */
- ret = -EPERM;
- if (kinfo.si_code != SI_USER)
- goto err;
-
- /* Turn this into a regular kill signal. */
- prepare_kill_siginfo(sig, &kinfo);
- }
+ (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
+ goto err;
} else {
prepare_kill_siginfo(sig, &kinfo);
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index f8edee9c792d..27bafc1e271e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -5,41 +5,56 @@
*
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
+#include <linux/sched/task_stack.h>
+#include <linux/sched/debug.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
-void print_stack_trace(struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_print - Print the entries in the stack trace
+ * @entries: Pointer to storage array
+ * @nr_entries: Number of entries in the storage array
+ * @spaces: Number of leading spaces to print
+ */
+void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
+ int spaces)
{
- int i;
+ unsigned int i;
- if (WARN_ON(!trace->entries))
+ if (WARN_ON(!entries))
return;
- for (i = 0; i < trace->nr_entries; i++)
- printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]);
+ for (i = 0; i < nr_entries; i++)
+ printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
}
-EXPORT_SYMBOL_GPL(print_stack_trace);
+EXPORT_SYMBOL_GPL(stack_trace_print);
-int snprint_stack_trace(char *buf, size_t size,
- struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_snprint - Print the entries in the stack trace into a buffer
+ * @buf: Pointer to the print buffer
+ * @size: Size of the print buffer
+ * @entries: Pointer to storage array
+ * @nr_entries: Number of entries in the storage array
+ * @spaces: Number of leading spaces to print
+ *
+ * Return: Number of bytes printed.
+ */
+int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+ unsigned int nr_entries, int spaces)
{
- int i;
- int generated;
- int total = 0;
+ unsigned int generated, i, total = 0;
- if (WARN_ON(!trace->entries))
+ if (WARN_ON(!entries))
return 0;
- for (i = 0; i < trace->nr_entries; i++) {
+ for (i = 0; i < nr_entries && size; i++) {
generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
- (void *)trace->entries[i]);
+ (void *)entries[i]);
total += generated;
-
- /* Assume that generated isn't a negative number */
if (generated >= size) {
buf += size;
size = 0;
@@ -51,7 +66,176 @@ int snprint_stack_trace(char *buf, size_t size,
return total;
}
-EXPORT_SYMBOL_GPL(snprint_stack_trace);
+EXPORT_SYMBOL_GPL(stack_trace_snprint);
+
+#ifdef CONFIG_ARCH_STACKWALK
+
+struct stacktrace_cookie {
+ unsigned long *store;
+ unsigned int size;
+ unsigned int skip;
+ unsigned int len;
+};
+
+static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
+ bool reliable)
+{
+ struct stacktrace_cookie *c = cookie;
+
+ if (c->len >= c->size)
+ return false;
+
+ if (c->skip > 0) {
+ c->skip--;
+ return true;
+ }
+ c->store[c->len++] = addr;
+ return c->len < c->size;
+}
+
+static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr,
+ bool reliable)
+{
+ if (in_sched_functions(addr))
+ return true;
+ return stack_trace_consume_entry(cookie, addr, reliable);
+}
+
+/**
+ * stack_trace_save - Save a stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr + 1,
+ };
+
+ arch_stack_walk(consume_entry, &c, current, NULL);
+ return c.len;
+}
+EXPORT_SYMBOL_GPL(stack_trace_save);
+
+/**
+ * stack_trace_save_tsk - Save a task stack trace into a storage array
+ * @task: The task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr + 1,
+ };
+
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ arch_stack_walk(consume_entry, &c, tsk, NULL);
+ put_task_stack(tsk);
+ return c.len;
+}
+
+/**
+ * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
+ * @regs: Pointer to pt_regs to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr,
+ };
+
+ arch_stack_walk(consume_entry, &c, current, regs);
+ return c.len;
+}
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+/**
+ * stack_trace_save_tsk_reliable - Save task stack with verification
+ * @tsk: Pointer to the task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: An error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is
+ * reliable and returns the number of entries stored.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
+ unsigned int size)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ };
+ int ret;
+
+ /*
+ * If the task doesn't have a stack (e.g., a zombie), the stack is
+ * "reliably" empty.
+ */
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
+ put_task_stack(tsk);
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+/**
+ * stack_trace_save_user - Save a user space stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ };
+
+ /* Trace user stack if not a kernel thread */
+ if (!current->mm)
+ return 0;
+
+ arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
+ return c.len;
+}
+#endif
+
+#else /* CONFIG_ARCH_STACKWALK */
/*
* Architectures that do not implement save_stack_trace_*()
@@ -77,3 +261,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
return -ENOSYS;
}
+
+/**
+ * stack_trace_save - Save a stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr + 1,
+ };
+
+ save_stack_trace(&trace);
+ return trace.nr_entries;
+}
+EXPORT_SYMBOL_GPL(stack_trace_save);
+
+/**
+ * stack_trace_save_tsk - Save a task stack trace into a storage array
+ * @task: The task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_tsk(struct task_struct *task,
+ unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr + 1,
+ };
+
+ save_stack_trace_tsk(task, &trace);
+ return trace.nr_entries;
+}
+
+/**
+ * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
+ * @regs: Pointer to pt_regs to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr,
+ };
+
+ save_stack_trace_regs(regs, &trace);
+ return trace.nr_entries;
+}
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+/**
+ * stack_trace_save_tsk_reliable - Save task stack with verification
+ * @tsk: Pointer to the task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: An error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is
+ * reliable and returns the number of entries stored.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
+ unsigned int size)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ };
+ int ret = save_stack_trace_tsk_reliable(tsk, &trace);
+
+ return ret ? ret : trace.nr_entries;
+}
+#endif
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+/**
+ * stack_trace_save_user - Save a user space stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ };
+
+ save_stack_trace_user(&trace);
+ return trace.nr_entries;
+}
+#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
+
+#endif /* !CONFIG_ARCH_STACKWALK */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5da394d1ca3..c9ec050bcf46 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -128,6 +128,7 @@ static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
+static unsigned long zero_ul;
static unsigned long one_ul = 1;
static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
@@ -1750,7 +1751,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &zero,
+ .extra1 = &zero_ul,
.extra2 = &long_max,
},
{
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2c97e8c2d29f..0519a8805aab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return ktime_sub(now, alarm->node.expires);
+ return ktime_sub(alarm->node.expires, now);
}
/**
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 094b82ca95e5..930113b9799a 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -272,7 +272,7 @@ static u64 notrace suspended_sched_clock_read(void)
return cd.read_data[seq & 1].epoch_cyc;
}
-static int sched_clock_suspend(void)
+int sched_clock_suspend(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -283,7 +283,7 @@ static int sched_clock_suspend(void)
return 0;
}
-static void sched_clock_resume(void)
+void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 529143b4c8d2..df401463a191 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -487,6 +487,7 @@ void tick_freeze(void)
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
system_state = SYSTEM_SUSPEND;
+ sched_clock_suspend();
timekeeping_suspend();
} else {
tick_suspend_local();
@@ -510,6 +511,7 @@ void tick_unfreeze(void)
if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
+ sched_clock_resume();
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 7a9b4eb7a1d5..141ab3ab0354 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void);
extern void timekeeping_warp_clock(void);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
+#ifdef CONFIG_GENERIC_SCHED_CLOCK
+extern int sched_clock_suspend(void);
+extern void sched_clock_resume(void);
+#else
+static inline int sched_clock_suspend(void) { return 0; }
+static inline void sched_clock_resume(void) { }
+#endif
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
diff --git a/kernel/torture.c b/kernel/torture.c
index 8faa1a9aaeb9..17b2be9bde12 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
return false;
+ if (num_online_cpus() <= 1)
+ return false; /* Can't offline the last CPU. */
if (verbose > 1)
pr_alert("%s" TORTURE_FLAG
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 26c8ca9bd06b..b920358dd8f7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,6 +33,7 @@
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
#include <trace/events/sched.h>
@@ -6246,7 +6247,7 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static inline void
+static nokprobe_inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
@@ -6306,11 +6307,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
{
__ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
+NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
+NOKPROBE_SYMBOL(ftrace_ops_no_ops);
#endif
/*
@@ -6337,6 +6340,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
trace_clear_recursion(bit);
}
+NOKPROBE_SYMBOL(ftrace_ops_assist_func);
/**
* ftrace_ops_get_func - get the function a trampoline should call
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41b6f96e5366..4ee8d8aa3d0f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
preempt_disable_notrace();
time = rb_time_stamp(buffer);
- preempt_enable_no_resched_notrace();
+ preempt_enable_notrace();
return time;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21153e64bf1c..ec439999f387 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
+static void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
* not modified.
*/
pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
- if (!pid_list)
+ if (!pid_list) {
+ trace_parser_put(&parser);
return -ENOMEM;
+ }
pid_list->pid_max = READ_ONCE(pid_max);
@@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
if (!pid_list->pids) {
+ trace_parser_put(&parser);
kfree(pid_list);
return -ENOMEM;
}
@@ -2749,12 +2754,21 @@ trace_function(struct trace_array *tr,
#ifdef CONFIG_STACKTRACE
-#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
+/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
+#define FTRACE_KSTACK_NESTING 4
+
+#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
+
struct ftrace_stack {
- unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
+ unsigned long calls[FTRACE_KSTACK_ENTRIES];
};
-static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
+
+struct ftrace_stacks {
+ struct ftrace_stack stacks[FTRACE_KSTACK_NESTING];
+};
+
+static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct ring_buffer *buffer,
@@ -2763,13 +2777,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
{
struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
+ unsigned int size, nr_entries;
+ struct ftrace_stack *fstack;
struct stack_entry *entry;
- struct stack_trace trace;
- int use_stack;
- int size = FTRACE_STACK_ENTRIES;
-
- trace.nr_entries = 0;
- trace.skip = skip;
+ int stackidx;
/*
* Add one, for this function and the call to save_stack_trace()
@@ -2777,7 +2788,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
#ifndef CONFIG_UNWINDER_ORC
if (!regs)
- trace.skip++;
+ skip++;
#endif
/*
@@ -2788,53 +2799,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
preempt_disable_notrace();
- use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
+ stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
+
+ /* This should never happen. If it does, yell once and skip */
+ if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+ goto out;
+
/*
- * We don't need any atomic variables, just a barrier.
- * If an interrupt comes in, we don't care, because it would
- * have exited and put the counter back to what we want.
- * We just need a barrier to keep gcc from moving things
- * around.
+ * The above __this_cpu_inc_return() is 'atomic' cpu local. An
+ * interrupt will either see the value pre increment or post
+ * increment. If the interrupt happens pre increment it will have
+ * restored the counter when it returns. We just need a barrier to
+ * keep gcc from moving things around.
*/
barrier();
- if (use_stack == 1) {
- trace.entries = this_cpu_ptr(ftrace_stack.calls);
- trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
- if (regs)
- save_stack_trace_regs(regs, &trace);
- else
- save_stack_trace(&trace);
-
- if (trace.nr_entries > size)
- size = trace.nr_entries;
- } else
- /* From now on, use_stack is a boolean */
- use_stack = 0;
+ fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx;
+ size = ARRAY_SIZE(fstack->calls);
- size *= sizeof(unsigned long);
+ if (regs) {
+ nr_entries = stack_trace_save_regs(regs, fstack->calls,
+ size, skip);
+ } else {
+ nr_entries = stack_trace_save(fstack->calls, size, skip);
+ }
+ size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
sizeof(*entry) + size, flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
- memset(&entry->caller, 0, size);
-
- if (use_stack)
- memcpy(&entry->caller, trace.entries,
- trace.nr_entries * sizeof(unsigned long));
- else {
- trace.max_entries = FTRACE_STACK_ENTRIES;
- trace.entries = entry->caller;
- if (regs)
- save_stack_trace_regs(regs, &trace);
- else
- save_stack_trace(&trace);
- }
-
- entry->size = trace.nr_entries;
+ memcpy(&entry->caller, fstack->calls, size);
+ entry->size = nr_entries;
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -2904,15 +2902,15 @@ void trace_dump_stack(int skip)
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
static DEFINE_PER_CPU(int, user_stack_count);
-void
+static void
ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
- struct stack_trace trace;
if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
@@ -2943,12 +2941,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
entry->tgid = current->tgid;
memset(&entry->caller, 0, sizeof(entry->caller));
- trace.nr_entries = 0;
- trace.max_entries = FTRACE_STACK_ENTRIES;
- trace.skip = 0;
- trace.entries = entry->caller;
-
- save_stack_trace_user(&trace);
+ stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -2957,13 +2950,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
out:
preempt_enable();
}
-
-#ifdef UNUSED
-static void __trace_userstack(struct trace_array *tr, unsigned long flags)
+#else /* CONFIG_USER_STACKTRACE_SUPPORT */
+static void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc)
{
- ftrace_trace_userstack(tr, flags, preempt_count());
}
-#endif /* UNUSED */
+#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* CONFIG_STACKTRACE */
@@ -7025,35 +7017,43 @@ struct buffer_ref {
struct ring_buffer *buffer;
void *page;
int cpu;
- int ref;
+ refcount_t refcount;
};
+static void buffer_ref_release(struct buffer_ref *ref)
+{
+ if (!refcount_dec_and_test(&ref->refcount))
+ return;
+ ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
+ kfree(ref);
+}
+
static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
- if (--ref->ref)
- return;
-
- ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
- kfree(ref);
+ buffer_ref_release(ref);
buf->private = 0;
}
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
- ref->ref++;
+ if (refcount_read(&ref->refcount) > INT_MAX/2)
+ return false;
+
+ refcount_inc(&ref->refcount);
+ return true;
}
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
+ .steal = generic_pipe_buf_nosteal,
.get = buffer_pipe_buf_get,
};
@@ -7066,11 +7066,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
struct buffer_ref *ref =
(struct buffer_ref *)spd->partial[i].private;
- if (--ref->ref)
- return;
-
- ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
- kfree(ref);
+ buffer_ref_release(ref);
spd->partial[i].private = 0;
}
@@ -7125,7 +7121,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- ref->ref = 1;
+ refcount_set(&ref->refcount, 1);
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (IS_ERR(ref->page)) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d80cee49e0eb..639047b259d7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef CONFIG_STACKTRACE
-void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
- int pc);
-
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
- unsigned long flags, int pc)
-{
-}
-
static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
int skip, int pc)
{
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4ad967453b6f..3ea65cdff30d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
void ftrace_likely_update(struct ftrace_likely_data *f, int val,
int expect, int is_constant)
{
+ unsigned long flags = user_access_save();
+
/* A constant is always correct */
if (is_constant) {
f->constant++;
@@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
f->data.correct++;
else
f->data.incorrect++;
+
+ user_access_restore(flags);
}
EXPORT_SYMBOL(ftrace_likely_update);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 795aa2038377..a1d20421f4b0 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5186,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
u64 var_ref_vals[TRACING_MAP_VARS_MAX];
char compound_key[HIST_KEY_SIZE_MAX];
struct tracing_map_elt *elt = NULL;
- struct stack_trace stacktrace;
struct hist_field *key_field;
u64 field_contents;
void *key = NULL;
@@ -5198,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
- stacktrace.max_entries = HIST_STACKTRACE_DEPTH;
- stacktrace.entries = entries;
- stacktrace.nr_entries = 0;
- stacktrace.skip = HIST_STACKTRACE_SKIP;
-
- memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
- save_stack_trace(&stacktrace);
-
+ memset(entries, 0, HIST_STACKTRACE_SIZE);
+ stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
key = entries;
} else {
field_contents = key_field->fn(key_field, elt, rbe, rec);
@@ -5246,7 +5240,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
unsigned int i;
for (i = 0; i < max_entries; i++) {
- if (stacktrace_entries[i] == ULONG_MAX)
+ if (!stacktrace_entries[i])
return;
seq_printf(m, "%*c", 1 + spaces, ' ');
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index eec648a0d673..5d16f73898db 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,44 +18,32 @@
#include "trace.h"
-static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
- { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
-unsigned stack_trace_index[STACK_TRACE_ENTRIES];
+#define STACK_TRACE_ENTRIES 500
-/*
- * Reserve one entry for the passed in ip. This will allow
- * us to remove most or all of the stack size overhead
- * added by the stack tracer itself.
- */
-struct stack_trace stack_trace_max = {
- .max_entries = STACK_TRACE_ENTRIES - 1,
- .entries = &stack_dump_trace[0],
-};
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
+static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
-unsigned long stack_trace_max_size;
-arch_spinlock_t stack_trace_max_lock =
+static unsigned int stack_trace_nr_entries;
+static unsigned long stack_trace_max_size;
+static arch_spinlock_t stack_trace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
-static int last_stack_tracer_enabled;
-void stack_trace_print(void)
+static void print_max_stack(void)
{
long i;
int size;
pr_emerg(" Depth Size Location (%d entries)\n"
" ----- ---- --------\n",
- stack_trace_max.nr_entries);
+ stack_trace_nr_entries);
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
- if (stack_dump_trace[i] == ULONG_MAX)
- break;
- if (i+1 == stack_trace_max.nr_entries ||
- stack_dump_trace[i+1] == ULONG_MAX)
+ for (i = 0; i < stack_trace_nr_entries; i++) {
+ if (i + 1 == stack_trace_nr_entries)
size = stack_trace_index[i];
else
size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -65,16 +53,7 @@ void stack_trace_print(void)
}
}
-/*
- * When arch-specific code overrides this function, the following
- * data should be filled up, assuming stack_trace_max_lock is held to
- * prevent concurrent updates.
- * stack_trace_index[]
- * stack_trace_max
- * stack_trace_max_size
- */
-void __weak
-check_stack(unsigned long ip, unsigned long *stack)
+static void check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
@@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack)
stack_trace_max_size = this_size;
- stack_trace_max.nr_entries = 0;
- stack_trace_max.skip = 0;
-
- save_stack_trace(&stack_trace_max);
+ stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
+ ARRAY_SIZE(stack_dump_trace) - 1,
+ 0);
/* Skip over the overhead of the stack tracer itself */
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
+ for (i = 0; i < stack_trace_nr_entries; i++) {
if (stack_dump_trace[i] == ip)
break;
}
@@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack)
* Some archs may not have the passed in ip in the dump.
* If that happens, we need to show everything.
*/
- if (i == stack_trace_max.nr_entries)
+ if (i == stack_trace_nr_entries)
i = 0;
/*
@@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack)
* loop will only happen once. This code only takes place
* on a new max, so it is far from a fast path.
*/
- while (i < stack_trace_max.nr_entries) {
+ while (i < stack_trace_nr_entries) {
int found = 0;
stack_trace_index[x] = this_size;
p = start;
- for (; p < top && i < stack_trace_max.nr_entries; p++) {
- if (stack_dump_trace[i] == ULONG_MAX)
- break;
+ for (; p < top && i < stack_trace_nr_entries; p++) {
/*
* The READ_ONCE_NOCHECK is used to let KASAN know that
* this is not a stack-out-of-bounds error.
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
- stack_trace_max.nr_entries = x;
- for (; x < i; x++)
- stack_dump_trace[x] = ULONG_MAX;
+ stack_trace_nr_entries = x;
if (task_stack_end_corrupted(current)) {
- stack_trace_print();
+ print_max_stack();
BUG();
}
@@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n >= stack_trace_nr_entries)
return NULL;
m->private = (void *)n;
@@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- stack_trace_max.nr_entries);
+ stack_trace_nr_entries);
if (!stack_tracer_enabled && !stack_trace_max_size)
print_disabled(m);
@@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v)
i = *(long *)v;
- if (i >= stack_trace_max.nr_entries ||
- stack_dump_trace[i] == ULONG_MAX)
+ if (i >= stack_trace_nr_entries)
return 0;
- if (i+1 == stack_trace_max.nr_entries ||
- stack_dump_trace[i+1] == ULONG_MAX)
+ if (i + 1 == stack_trace_nr_entries)
size = stack_trace_index[i];
else
size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ int was_enabled;
int ret;
mutex_lock(&stack_sysctl_mutex);
+ was_enabled = !!stack_tracer_enabled;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret || !write ||
- (last_stack_tracer_enabled == !!stack_tracer_enabled))
+ if (ret || !write || (was_enabled == !!stack_tracer_enabled))
goto out;
- last_stack_tracer_enabled = !!stack_tracer_enabled;
-
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
-
out:
mutex_unlock(&stack_sysctl_mutex);
return ret;
@@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str)
strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
stack_tracer_enabled = 1;
- last_stack_tracer_enabled = 1;
return 1;
}
__setup("stacktrace", enable_stacktrace);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f93a56d2db27..fa8fbff736d6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct ring_buffer_event *event;
struct ring_buffer *buffer;
unsigned long irq_flags;
+ unsigned long args[6];
int pc;
int syscall_nr;
int size;
@@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry = ring_buffer_event_data(event);
entry->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+ syscall_get_arguments(current, regs, args);
+ memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
irq_flags, pc);
@@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ unsigned long args[6];
bool valid_prog_array;
int syscall_nr;
int rctx;
@@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
rec->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args,
- (unsigned long *)&rec->args);
+ syscall_get_arguments(current, regs, args);
+ memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
!perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 403c9bd90413..6a5787233113 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -554,13 +554,15 @@ static void softlockup_start_all(void)
int lockup_detector_online_cpu(unsigned int cpu)
{
- watchdog_enable(cpu);
+ if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
+ watchdog_enable(cpu);
return 0;
}
int lockup_detector_offline_cpu(unsigned int cpu)
{
- watchdog_disable(cpu);
+ if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
+ watchdog_disable(cpu);
return 0;
}
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71381168dede..247bf0b1582c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
+ this_cpu);
print_modules();
print_irqtrace_events(current);
if (regs)