From 67701ae9767534534d3710664037dfde2cc04935 Mon Sep 17 00:00:00 2001 From: Jack F Vogel Date: Sun, 1 May 2005 08:58:48 -0700 Subject: [PATCH] check nmi watchdog is broken A bug against an xSeries system showed up recently noting that the check_nmi_watchdog() test was failing. I have been investigating it and discovered in both i386 and x86_64 the recent change to the routine to use the cpu_callin_map has uncovered a problem. Prior to that change, on an SMP box, the test was trivally passing because all cpu's were found to not yet be online, but now with the callin_map they are discovered, it goes on to test the counter and they have not yet begun to increment, so it announces a CPU is stuck and bails out. On all the systems I have access to test, the announcement of failure is also bougs... by the time you can login and check /proc/interrupts, the NMI count is happily incrementing on all CPUs. Its just that the test is being done too early. I have tried moving the call to the test around a bit, and it was always too early. I finally hit on this proposed solution, it delays the routine via a late_initcall(), seems like the right solution to me. Signed-off-by: Adrian Bunk Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/apic.c | 2 -- arch/i386/kernel/io_apic.c | 2 -- arch/i386/kernel/nmi.c | 11 +++++++---- arch/i386/kernel/smpboot.c | 3 --- 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index e3879f7625c2..d509836b70c3 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -1265,8 +1265,6 @@ int __init APIC_init_uniprocessor (void) setup_local_APIC(); - if (nmi_watchdog == NMI_LOCAL_APIC) - check_nmi_watchdog(); #ifdef CONFIG_X86_IO_APIC if (smp_found_config) if (!skip_ioapic_setup && nr_ioapics) diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 5e0d55be5435..7a324e8b86f9 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -2175,7 +2175,6 @@ static inline void check_timer(void) disable_8259A_irq(0); setup_nmi(); enable_8259A_irq(0); - check_nmi_watchdog(); } return; } @@ -2198,7 +2197,6 @@ static inline void check_timer(void) add_pin_to_irq(0, 0, pin2); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - check_nmi_watchdog(); } return; } diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 2f89d000f954..2c0ee9c2d020 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -102,20 +102,21 @@ int nmi_active; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) -int __init check_nmi_watchdog (void) +static int __init check_nmi_watchdog(void) { unsigned int prev_nmi_count[NR_CPUS]; int cpu; - printk(KERN_INFO "testing NMI watchdog ... "); + if (nmi_watchdog == NMI_NONE) + return 0; + + printk(KERN_INFO "Testing NMI watchdog ... "); for (cpu = 0; cpu < NR_CPUS; cpu++) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); mdelay((10*1000)/nmi_hz); // wait 10 ticks - /* FIXME: Only boot CPU is online at this stage. Check CPUs - as they come up. */ for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_SMP /* Check cpu_callin_map here because that is set @@ -139,6 +140,8 @@ int __init check_nmi_watchdog (void) return 0; } +/* This needs to happen later in boot so counters are working */ +late_initcall(check_nmi_watchdog); static int __init setup_nmi_watchdog(char *str) { diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index fd36d2f65f88..cbea7ac582e5 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1089,9 +1089,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus) } } - if (nmi_watchdog == NMI_LOCAL_APIC) - check_nmi_watchdog(); - smpboot_setup_io_apic(); setup_boot_APIC_clock(); -- cgit v1.2.3-59-g8ed1b