aboutsummaryrefslogtreecommitdiffstats
path: root/arch/sparc/kernel/nmi.c
blob: 3370945569162dd43ef1252f47c83cf6ad7d87d6 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/* Pseudo NMI support on sparc64 systems.
 *
 * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
 *
 * The NMI watchdog support and infrastructure is based almost
 * entirely upon the x86 NMI support code.
 */
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/kernel_stat.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/delay.h>
#include <linux/smp.h>

#include <asm/perf_event.h>
#include <asm/ptrace.h>
#include <asm/pcr.h>

#include "kstack.h"

/* We don't have a real NMI on sparc64, but we can fake one
 * up using profiling counter overflow interrupts and interrupt
 * levels.
 *
 * The profile overflow interrupts at level 15, so we use
 * level 14 as our IRQ off level.
 */

/* Set by the "nmi_watchdog=panic" boot option; handed to die_nmi() as
 * its do_panic argument when perfctr_irq() decides a CPU is stuck.
 */
static int panic_on_timeout;

/* nmi_active:
 * >0: the NMI watchdog is active, but can be disabled
 * <0: the NMI watchdog has not been set up, and cannot be enabled
 *  0: the NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
EXPORT_SYMBOL(nmi_active);

/* Rate handed to pcr_ops->nmi_picl_value() whenever the counter is
 * armed; also scales the lockup threshold in perfctr_irq().
 */
static unsigned int nmi_hz = HZ;
/* Non-zero while the watchdog is considered enabled on that CPU. */
static DEFINE_PER_CPU(short, wd_enabled);
/* Boot-time self-test flag: nmi_cpu_busy() spins until it is non-zero. */
static int endflag __initdata;

/* irq0_irqs value last recorded by perfctr_irq() on this CPU. */
static DEFINE_PER_CPU(unsigned int, last_irq_sum);
/* Consecutive perfctr_irq() calls on this CPU that saw no progress. */
static DEFINE_PER_CPU(long, alert_counter);
/* Set to 1 by touch_nmi_watchdog(); consumed and cleared by perfctr_irq(). */
static DEFINE_PER_CPU(int, nmi_touch);

/* Tell the watchdog that the system is still making progress.
 *
 * When nmi_active is non-zero, the nmi_touch flag of every present CPU
 * is set to 1 (flags that already read 1 are not rewritten).  The
 * softlockup watchdog is touched unconditionally.
 */
void touch_nmi_watchdog(void)
{
	int cpu;

	if (atomic_read(&nmi_active) != 0) {
		for_each_present_cpu(cpu) {
			/* Skip the store when the flag is already set. */
			if (per_cpu(nmi_touch, cpu) == 1)
				continue;
			per_cpu(nmi_touch, cpu) = 1;
		}
	}

	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

/* React to a lockup detected by the watchdog.
 *
 * @str:      message passed on to the DIE_NMIWATCHDOG notifier chain.
 * @regs:     register state at the time of the NMI.
 * @do_panic: non-zero to panic even when panic_on_oops is clear.
 *
 * If a notifier answers NOTIFY_STOP nothing further happens.  Otherwise
 * the kernel panics when @do_panic or panic_on_oops is set, and emits a
 * WARN when neither is.
 */
static void die_nmi(const char *str, struct pt_regs *regs, int do_panic)
{
	int cpu = smp_processor_id();
	int verdict;

	verdict = notify_die(DIE_NMIWATCHDOG, str, regs, 0,
			     pt_regs_trap_type(regs), SIGINT);
	if (verdict == NOTIFY_STOP)
		return;

	if (!do_panic && !panic_on_oops) {
		WARN(1, "Watchdog detected hard LOCKUP on cpu %d", cpu);
		return;
	}

	panic("Watchdog detected hard LOCKUP on cpu %d", cpu);
}

/* Handler for the profiling counter overflow interrupt, which this file
 * uses as a pseudo NMI.
 *
 * @irq:  interrupt number; its softint bit is cleared on entry.
 * @regs: register state at the time of the interrupt.
 *
 * Each invocation bumps this CPU's __nmi_count, runs the DIE_NMI
 * notifier chain and then checks for progress: a NOTIFY_STOP answer, a
 * pending nmi_touch, or a change in irq0_irqs since the last recorded
 * value all reset alert_counter.  Without progress the counter is
 * incremented, and die_nmi() is called when it reaches 30 * nmi_hz.
 * The counter is re-armed before returning if this CPU still has
 * wd_enabled set.
 */
notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
{
	unsigned int irq_sum, progress = 0;
	void *saved_sp;

	clear_softint(1 << irq);

	local_cpu_data().__nmi_count++;

	nmi_enter();

	saved_sp = set_hardirq_stack();

	/* Unless a notifier claimed the event, stop the counter while
	 * the watchdog bookkeeping below runs.
	 */
	if (notify_die(DIE_NMI, "nmi", regs, 0,
		       pt_regs_trap_type(regs), SIGINT) != NOTIFY_STOP)
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	else
		progress = 1;

	irq_sum = local_cpu_data().irq0_irqs;
	if (__this_cpu_read(nmi_touch)) {
		/* Consume the touch left by touch_nmi_watchdog(). */
		__this_cpu_write(nmi_touch, 0);
		progress = 1;
	}

	if (progress || __this_cpu_read(last_irq_sum) != irq_sum) {
		__this_cpu_write(last_irq_sum, irq_sum);
		__this_cpu_write(alert_counter, 0);
	} else {
		__this_cpu_inc(alert_counter);
		if (__this_cpu_read(alert_counter) == 30 * nmi_hz)
			die_nmi("BUG: NMI Watchdog detected LOCKUP",
				regs, panic_on_timeout);
	}

	if (__this_cpu_read(wd_enabled)) {
		pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
	}

	restore_hardirq_stack(saved_sp);

	nmi_exit();
}

/* Return the number of pseudo NMIs that perfctr_irq() has counted in
 * @cpu's cpu_data so far.
 */
static inline unsigned int get_nmi_count(int cpu)
{
	unsigned int count = cpu_data(cpu).__nmi_count;

	return count;
}

/* Cross-call target used by the boot-time self-test.
 *
 * Calls local_irq_enable_in_hardirq() and then spins, issuing a memory
 * barrier on each pass, until check_nmi_watchdog() sets endflag.
 * @data is not used.
 */
static __init void nmi_cpu_busy(void *data)
{
	local_irq_enable_in_hardirq();

	while (!endflag)
		mb();
}

/* Report that @cpu took too few NMIs during the boot-time self-test and
 * take it out of the watchdog accounting.
 *
 * @cpu:            CPU whose NMI count did not advance enough.
 * @prev_nmi_count: per-CPU NMI counts sampled before the test delay,
 *                  indexed by CPU number; only read here.
 *
 * The counts are unsigned (see get_nmi_count()), so the snapshot is
 * taken as a const unsigned int pointer, matching the array that
 * check_nmi_watchdog() allocates, and both values are printed with %u.
 *
 * Clears wd_enabled for @cpu and drops nmi_active by one.
 */
static void report_broken_nmi(int cpu, const unsigned int *prev_nmi_count)
{
	/* Terminate the "Testing NMI watchdog ... " line. */
	printk(KERN_CONT "\n");

	printk(KERN_WARNING
		"WARNING: CPU#%d: NMI appears to be stuck (%u->%u)!\n",
			cpu, prev_nmi_count[cpu], get_nmi_count(cpu));

	printk(KERN_WARNING
		"Please report this to bugzilla.kernel.org,\n");
	printk(KERN_WARNING
		"and attach the output of the 'dmesg' command.\n");

	per_cpu(wd_enabled, cpu) = 0;
	atomic_dec(&nmi_active);
}

/* Disable the NMI watchdog on the calling CPU.
 *
 * Written to be run on each CPU (this file invokes it through
 * on_each_cpu()); @unused is ignored.
 *
 * The counter is always switched off in the PCR, but wd_enabled and
 * nmi_active are only updated when this CPU is still marked enabled.
 * report_broken_nmi() may already have cleared wd_enabled and dropped
 * nmi_active on this CPU's behalf; decrementing a second time would
 * leave nmi_active out of step with the number of enabled CPUs.
 */
void stop_nmi_watchdog(void *unused)
{
	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);

	if (!__get_cpu_var(wd_enabled))
		return;

	__get_cpu_var(wd_enabled) = 0;
	atomic_dec(&nmi_active);
}

/* Boot-time self-test of the pseudo NMI watchdog.
 *
 * Returns 0 right away when nmi_active is zero.  Otherwise the other
 * CPUs are sent into nmi_cpu_busy(), every possible CPU's NMI count is
 * sampled, and after a delay of 20 ticks each online CPU with
 * wd_enabled set must have counted more than 5 further NMIs; CPUs that
 * have not are passed to report_broken_nmi().
 *
 * Returns 0 on success (nmi_hz is then lowered to 1), -ENOMEM if the
 * sample array cannot be allocated, or -ENODEV if nmi_active has
 * dropped to zero by the end of the test (it is then set to -1).  On
 * either error stop_nmi_watchdog() is run on every CPU.
 */
static int __init check_nmi_watchdog(void)
{
	unsigned int *baseline;
	int cpu, err;

	if (atomic_read(&nmi_active) == 0)
		return 0;

	baseline = kmalloc(nr_cpu_ids * sizeof(unsigned int), GFP_KERNEL);
	if (baseline == NULL) {
		err = -ENOMEM;
		goto error;
	}

	printk(KERN_INFO "Testing NMI watchdog ... ");

	smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);

	for_each_possible_cpu(cpu)
		baseline[cpu] = get_nmi_count(cpu);
	local_irq_enable();
	mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */

	for_each_online_cpu(cpu) {
		if (per_cpu(wd_enabled, cpu) &&
		    get_nmi_count(cpu) - baseline[cpu] <= 5)
			report_broken_nmi(cpu, baseline);
	}

	/* Release the CPUs spinning in nmi_cpu_busy(). */
	endflag = 1;

	/* The samples are not needed on either path from here on. */
	kfree(baseline);

	if (atomic_read(&nmi_active) == 0) {
		atomic_set(&nmi_active, -1);
		err = -ENODEV;
		goto error;
	}

	printk("OK.\n");

	nmi_hz = 1;

	return 0;

error:
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	return err;
}

/* Enable the NMI watchdog on the calling CPU.
 *
 * Written to be run on each CPU (this file invokes it through
 * on_each_cpu()); @unused is ignored.
 *
 * nmi_active is bumped only on the disabled -> enabled transition so
 * that it keeps matching the number of CPUs with wd_enabled set, even
 * if this is called again for a CPU that is already enabled.  The
 * counter itself is reprogrammed on every call.
 */
void start_nmi_watchdog(void *unused)
{
	if (!__get_cpu_var(wd_enabled)) {
		__get_cpu_var(wd_enabled) = 1;
		atomic_inc(&nmi_active);
	}

	/* Load a fresh count with the counter disabled, then enable it. */
	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
}

/* Per-CPU half of nmi_adjust_hz(): if the watchdog is enabled on the
 * calling CPU, reload the counter from the current nmi_hz with the
 * counter disabled and then enable it again.  @unused is ignored.
 */
static void nmi_adjust_hz_one(void *unused)
{
	if (__get_cpu_var(wd_enabled)) {
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
		pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
	}
}

/* Change the watchdog rate to @new_hz.
 *
 * Stores the new value in nmi_hz first, then runs nmi_adjust_hz_one()
 * on every CPU (waiting for completion) so that each enabled counter
 * is reloaded from it.
 */
void nmi_adjust_hz(unsigned int new_hz)
{
	nmi_hz = new_hz;

	on_each_cpu(nmi_adjust_hz_one, NULL, 1);
}
EXPORT_SYMBOL_GPL(nmi_adjust_hz);

/* Reboot notifier callback: run stop_nmi_watchdog() on every CPU and
 * wait for it to finish.  @nb, @cmd and @p are not used; always
 * returns 0.
 */
static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
{
	int ret = 0;

	on_each_cpu(stop_nmi_watchdog, NULL, 1);

	return ret;
}

/* Registered by nmi_init() via register_reboot_notifier() so that
 * nmi_shutdown() runs when the reboot notifier chain is called.
 */
static struct notifier_block nmi_reboot_notifier = {
	.notifier_call = nmi_shutdown,
};

/* Bring up the pseudo NMI watchdog at boot.
 *
 * Starts the watchdog on every CPU, runs the self-test and, if that
 * passes, registers the reboot notifier.  Should the registration
 * fail, the watchdog is stopped on every CPU again and nmi_active is
 * set to -1.
 *
 * Returns 0 on success, otherwise the error from check_nmi_watchdog()
 * or register_reboot_notifier().
 */
int __init nmi_init(void)
{
	int err;

	on_each_cpu(start_nmi_watchdog, NULL, 1);

	err = check_nmi_watchdog();
	if (err)
		return err;

	err = register_reboot_notifier(&nmi_reboot_notifier);
	if (!err)
		return 0;

	/* No reboot hook available: back the watchdog out completely. */
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	atomic_set(&nmi_active, -1);

	return err;
}

/* Parse the "nmi_watchdog=" boot parameter.
 *
 * @str: text following "nmi_watchdog=" on the command line.
 *
 * A value starting with "panic" sets panic_on_timeout, which makes a
 * detected lockup panic the machine instead of only warning.
 *
 * Returns 1 to tell the __setup() machinery that the option has been
 * consumed.  Returning 0 would mark it as unhandled, so the kernel
 * would report it as an unknown parameter and hand it on to init.
 */
static int __init setup_nmi_watchdog(char *str)
{
	if (!strncmp(str, "panic", 5))
		panic_on_timeout = 1;

	return 1;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);