x86/mce: Prevent duplicate error records

A legitimate use case of the MCA infrastructure is to have the firmware log all uncorrectable errors and also have the OS see all correctable errors. The uncorrectable, UCNA errors are usually configured to be reported through an SMI. CMCI, which is the correctable error reporting interrupt, uses SMI too, and having both enabled leads to unnecessary overhead. So what ends up happening is, people disable CMCI in the wild and leave on only the UCNA SMI. When CMCI is disabled, the MCA infrastructure resorts to polling the MCA banks. If an MCA MSR is shared between the logical threads, one error ends up getting logged multiple times as the polling runs on every logical thread. Therefore, introduce locking on the Intel side of the polling routine to prevent such duplicate error records from appearing. Based on a patch by Aristeu Rozanski <aris@ruivo.org>. Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Tested-by: Tony Luck <tony.luck@intel.com> Acked-by: Aristeu Rozanski <aris@ruivo.org> Link: https://lore.kernel.org/r/20230515143225.GC4090740@cathedrallabs.org
author: Borislav Petkov (AMD) <bp@alien8.de> 2023-07-19 14:19:50 +0200
committer: Borislav Petkov (AMD) <bp@alien8.de> 2023-07-21 18:55:46 +0200
commit: c3629dd7e67d6ec5705d33b0de0d142c972fe573 (patch)
tree: 2e344dd47063840314a002e9ddb08d5cd316e283 /arch/x86/kernel/cpu/mce/intel.c
parent: Linux 6.5-rc2 (diff)
download: wireguard-linux-c3629dd7e67d6ec5705d33b0de0d142c972fe573.tar.xz
wireguard-linux-c3629dd7e67d6ec5705d33b0de0d142c972fe573.zip
1 files changed, 18 insertions, 1 deletions
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 95275a5e57e0..f5323551c1a9 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt);
  */
 static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
+/*
+ * On systems that do support CMCI but it's disabled, polling for MCEs can
+ * cause the same event to be reported multiple times because IA32_MCi_STATUS
+ * is shared by the same package.
+ */
+static DEFINE_SPINLOCK(cmci_poll_lock);
+
 #define CMCI_THRESHOLD		1
 #define CMCI_POLL_INTERVAL	(30 * HZ)
 #define CMCI_STORM_INTERVAL	(HZ)
@@ -426,12 +433,22 @@ void cmci_disable_bank(int bank)
 	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
 }
 
+/* Bank polling function when CMCI is disabled. */
+static void cmci_mc_poll_banks(void)
+{
+	spin_lock(&cmci_poll_lock);
+	machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+	spin_unlock(&cmci_poll_lock);
+}
+
 void intel_init_cmci(void)
 {
 	int banks;
 
-	if (!cmci_supported(&banks))
+	if (!cmci_supported(&banks)) {
+		mc_poll_banks = cmci_mc_poll_banks;
 		return;
+	}
 
 	mce_threshold_vector = intel_threshold_interrupt;
 	cmci_discover(banks);
author	Borislav Petkov (AMD) <bp@alien8.de>	2023-07-19 14:19:50 +0200
committer	Borislav Petkov (AMD) <bp@alien8.de>	2023-07-21 18:55:46 +0200
commit	c3629dd7e67d6ec5705d33b0de0d142c972fe573 (patch)
tree	2e344dd47063840314a002e9ddb08d5cd316e283 /arch/x86/kernel/cpu/mce/intel.c
parent	Linux 6.5-rc2 (diff)
download	wireguard-linux-c3629dd7e67d6ec5705d33b0de0d142c972fe573.tar.xz wireguard-linux-c3629dd7e67d6ec5705d33b0de0d142c972fe573.zip