aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86/kernel/cpu/mce/intel.c
diff options
context:
space:
mode:
authorTony Luck <tony.luck@intel.com>2023-11-15 11:54:48 -0800
committerBorislav Petkov (AMD) <bp@alien8.de>2023-12-15 13:44:12 +0100
commit3ed57b41a4125609e9fd03e32228aec61d95fe1f (patch)
treecbd8d4c52f587ba80c8b1f63489603b68e7fe7c6 /arch/x86/kernel/cpu/mce/intel.c
parentDocumentation: Begin a RAS section (diff)
downloadwireguard-linux-3ed57b41a4125609e9fd03e32228aec61d95fe1f.tar.xz
wireguard-linux-3ed57b41a4125609e9fd03e32228aec61d95fe1f.zip
x86/mce: Remove old CMCI storm mitigation code
When a "storm" of corrected machine check interrupts (CMCI) is detected this code mitigates by disabling CMCI interrupt signalling from all of the banks owned by the CPU that saw the storm. There are problems with this approach: 1) It is very coarse grained. In all likelihood only one of the banks was generating the interrupts, but CMCI is disabled for all. This means Linux may delay seeing and processing errors logged from other banks. 2) Although CMCI stands for Corrected Machine Check Interrupt, it is also used to signal when an uncorrected error is logged. This is a problem because these errors should be handled in a timely manner. Delete all this code in preparation for a finer grained solution. Signed-off-by: Tony Luck <tony.luck@intel.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com> Tested-by: Yazen Ghannam <yazen.ghannam@amd.com> Link: https://lore.kernel.org/r/20231115195450.12963-2-tony.luck@intel.com
Diffstat (limited to 'arch/x86/kernel/cpu/mce/intel.c')
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c145
1 files changed, 0 insertions, 145 deletions
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 52bce533ddcc..fc4ffc434023 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -42,15 +42,6 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
- * CMCI storm detection backoff counter
- *
- * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
-/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -64,21 +55,6 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
static DEFINE_SPINLOCK(cmci_poll_lock);
#define CMCI_THRESHOLD 1
-#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (HZ)
-#define CMCI_STORM_THRESHOLD 15
-
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-
-enum {
- CMCI_STORM_NONE,
- CMCI_STORM_ACTIVE,
- CMCI_STORM_SUBSIDED,
-};
-
-static atomic_t cmci_storm_on_cpus;
static int cmci_supported(int *banks)
{
@@ -134,124 +110,6 @@ static bool lmce_supported(void)
return tmp & FEAT_CTL_LMCE_ENABLED;
}
-bool mce_intel_cmci_poll(void)
-{
- if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return false;
-
- /*
- * Reset the counter if we've logged an error in the last poll
- * during the storm.
- */
- if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
- else
- this_cpu_dec(cmci_backoff_cnt);
-
- return true;
-}
-
-void mce_intel_hcpu_update(unsigned long cpu)
-{
- if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
- atomic_dec(&cmci_storm_on_cpus);
-
- per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
-}
-
-static void cmci_toggle_interrupt_mode(bool on)
-{
- unsigned long flags, *owned;
- int bank;
- u64 val;
-
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
-
- if (on)
- val |= MCI_CTL2_CMCI_EN;
- else
- val &= ~MCI_CTL2_CMCI_EN;
-
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
- if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
- (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
- mce_notify_irq();
- return CMCI_STORM_INTERVAL;
- }
-
- switch (__this_cpu_read(cmci_storm_state)) {
- case CMCI_STORM_ACTIVE:
-
- /*
- * We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the timer
- * interval is back to our poll interval.
- */
- __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- if (!atomic_sub_return(1, &cmci_storm_on_cpus))
- pr_notice("CMCI storm subsided: switching to interrupt mode\n");
-
- fallthrough;
-
- case CMCI_STORM_SUBSIDED:
- /*
- * We wait for all CPUs to go back to SUBSIDED state. When that
- * happens we switch back to interrupt mode.
- */
- if (!atomic_read(&cmci_storm_on_cpus)) {
- __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_toggle_interrupt_mode(true);
- cmci_recheck();
- }
- return CMCI_POLL_INTERVAL;
- default:
-
- /* We have shiny weather. Let the poll do whatever it thinks. */
- return interval;
- }
-}
-
-static bool cmci_storm_detect(void)
-{
- unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
- unsigned long ts = __this_cpu_read(cmci_time_stamp);
- unsigned long now = jiffies;
- int r;
-
- if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
- return true;
-
- if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
- cnt++;
- } else {
- cnt = 1;
- __this_cpu_write(cmci_time_stamp, now);
- }
- __this_cpu_write(cmci_storm_cnt, cnt);
-
- if (cnt <= CMCI_STORM_THRESHOLD)
- return false;
-
- cmci_toggle_interrupt_mode(false);
- __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
- r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_STORM_INTERVAL);
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
-
- if (r == 1)
- pr_notice("CMCI storm detected: switching to poll mode\n");
- return true;
-}
-
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
@@ -260,9 +118,6 @@ static bool cmci_storm_detect(void)
*/
static void intel_threshold_interrupt(void)
{
- if (cmci_storm_detect())
- return;
-
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}