From b62928ff55290e47a7d546d427bf7b4ccf5ea944 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 10 Jan 2019 17:36:45 +0200 Subject: x86/MCE: Switch to use the new generic UUID API Switch the code to use the new, generic helpers. Signed-off-by: Andy Shevchenko Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/20190110153645.40649-1-andriy.shevchenko@linux.intel.com --- arch/x86/kernel/cpu/mce/apei.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 1d9b3ce662a0..c038e5c00a59 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -64,11 +64,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); #define CPER_CREATOR_MCE \ - UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ - 0x64, 0x90, 0xb8, 0x9d) + GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) #define CPER_SECTION_TYPE_MCE \ - UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ - 0x04, 0x4a, 0x38, 0xfc) + GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) /* * CPER specification (in UEFI specification 2.3 appendix N) requires @@ -135,7 +135,7 @@ retry: goto out; /* try to skip other type records in storage */ else if (rc != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE)) goto retry; memcpy(m, &rcd.mce, sizeof(*m)); rc = sizeof(*m); -- cgit v1.2.3-59-g8ed1b From c95b323dcd3598dd7ef5005d6723c1ba3b801093 Mon Sep 17 00:00:00 2001 From: Shirish S Date: Thu, 10 Jan 2019 07:54:40 +0000 Subject: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models MC4_MISC thresholding is not supported on all family 0x15 processors, hence skip the x86_model check when applying the quirk. [ bp: massage commit message. ] Signed-off-by: Shirish S Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/1547106849-3476-2-git-send-email-shirish.s@amd.com --- arch/x86/kernel/cpu/mce/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 672c7225cb1b..d0c54160b439 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1612,11 +1612,10 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) mce_flags.overflow_recov = 1; /* - * Turn off MC4_MISC thresholding banks on those models since + * Turn off MC4_MISC thresholding banks on all models since * they're not supported there. */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + if (c->x86 == 0x15) { int i; u64 hwcr; bool need_toggle; -- cgit v1.2.3-59-g8ed1b From 30aa3d26edb0f3d7992757287eec0ca588a5c259 Mon Sep 17 00:00:00 2001 From: Shirish S Date: Wed, 16 Jan 2019 15:10:40 +0000 Subject: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk The MC4_MISC thresholding quirk needs to be applied during S5 -> S0 and S3 -> S0 state transitions, which follow different code paths. Carve it out into a separate function and call it mce_amd_feature_init() where the two code paths of the state transitions converge. [ bp: massage commit message and the carved out function. ] Signed-off-by: Shirish S Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: Yazen Ghannam Cc: x86-ml Link: https://lkml.kernel.org/r/1547651417-23583-3-git-send-email-shirish.s@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mce/core.c | 29 ----------------------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 89298c83de53..ed3327342b40 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -545,6 +545,40 @@ out: return offset; } +/* + * Turn off MC4_MISC thresholding banks on all family 0x15 models since + * they're not supported there. + */ +void disable_err_thresholding(struct cpuinfo_x86 *c) +{ + int i; + u64 hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + if (c->x86 != 0x15) + return; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + /* Clear CntP bit safely */ + for (i = 0; i < ARRAY_SIZE(msrs); i++) + msr_clear_bit(msrs[i], 62); + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -552,6 +586,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; + disable_err_thresholding(c); + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index d0c54160b439..6063ae2376b2 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1611,35 +1611,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 0x15 && c->x86_model <= 0xf) mce_flags.overflow_recov = 1; - /* - * Turn off MC4_MISC thresholding banks on all models since - * they're not supported there. - */ - if (c->x86 == 0x15) { - int i; - u64 hwcr; - bool need_toggle; - u32 msrs[] = { - 0x00000413, /* MC4_MISC0 */ - 0xc0000408, /* MC4_MISC1 */ - }; - - rdmsrl(MSR_K7_HWCR, hwcr); - - /* McStatusWrEn has to be set */ - need_toggle = !(hwcr & BIT(18)); - - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); - - /* Clear CntP bit safely */ - for (i = 0; i < ARRAY_SIZE(msrs); i++) - msr_clear_bit(msrs[i], 62); - - /* restore old settings */ - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); - } } if (c->x86_vendor == X86_VENDOR_INTEL) { -- cgit v1.2.3-59-g8ed1b From 9359a8cbcc77d4afb07444f341e39015d232bd55 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 25 Jan 2019 16:30:35 +0200 Subject: RAS: Use consistent types for UUIDs The commit 297b64c74385 ("ras: acpi / apei: generate trace event for unrecognized CPER section") brought inconsistency in UUID types which are used across the RAS subsystem. Fix this by using guid_t everywhere. Signed-off-by: Andy Shevchenko Signed-off-by: Borislav Petkov Reviewed-by: Christoph Hellwig Cc: "Rafael J. Wysocki" Cc: "Steven Rostedt (VMware)" Cc: Bjorn Helgaas Cc: Thomas Tai Cc: Tony Luck Cc: Tyler Baicar Link: https://lkml.kernel.org/r/20190125143035.81589-1-andriy.shevchenko@linux.intel.com --- drivers/ras/ras.c | 2 +- include/ras/ras_event.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 3f38907320dc..95540ea8dd9d 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -14,7 +14,7 @@ #define TRACE_INCLUDE_PATH ../../include/ras #include -void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id, +void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len) { diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index a0794632fd01..36c5c5e38c1d 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -27,7 +27,7 @@ TRACE_EVENT(extlog_mem_event, TP_PROTO(struct cper_sec_mem_err *mem, u32 err_seq, - const uuid_le *fru_id, + const guid_t *fru_id, const char *fru_text, u8 sev), @@ -39,7 +39,7 @@ TRACE_EVENT(extlog_mem_event, __field(u8, sev) __field(u64, pa) __field(u8, pa_mask_lsb) - __field_struct(uuid_le, fru_id) + __field_struct(guid_t, fru_id) __string(fru_text, fru_text) __field_struct(struct cper_mem_err_compact, data) ), @@ -218,8 +218,8 @@ TRACE_EVENT(arm_event, */ TRACE_EVENT(non_standard_event, - TP_PROTO(const uuid_le *sec_type, - const uuid_le *fru_id, + TP_PROTO(const guid_t *sec_type, + const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, -- cgit v1.2.3-59-g8ed1b From 6e1849ec75792c46a18a2b14b8bd915435ff2a2a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 25 Jan 2019 16:45:57 +0100 Subject: RAS: Add a MAINTAINERS entry This was long overdue and it is needed at least so that Tony and I get CCed on patches. Signed-off-by: Borislav Petkov Acked-by: Tony Luck Acked-by: Ingo Molnar Link: https://lkml.kernel.org/r/20190125174227.10652-1-bp@alien8.de --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 4d04cebb4a71..a63f2b52a8dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12767,6 +12767,16 @@ M: Alexandre Bounine S: Maintained F: drivers/rapidio/ +RAS INFRASTRUCTURE +M: Tony Luck +M: Borislav Petkov +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/ras/ +F: include/linux/ras.h +F: include/ras/ras_event.h +F: Documentation/admin-guide/ras.rst + RAYLINK/WEBGEAR 802.11 WIRELESS LAN DRIVER L: linux-wireless@vger.kernel.org S: Orphan -- cgit v1.2.3-59-g8ed1b From cbfa447edd6a3825fdb8a4ffae74ff7208f2d2c0 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Feb 2019 22:55:51 +0000 Subject: x86/MCE/AMD, EDAC/mce_amd: Add new MP5, NBIO, and PCIE SMCA bank types Add the (HWID, MCATYPE) tuples and names for the new MP5, NBIO, and PCIE SMCA bank types. Also, add their respective error descriptions to the MCE decoding module edac_mce_amd. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Arnd Bergmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Pu Wen Cc: Qiuxu Zhuo Cc: Shirish S Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20190201225534.8177-2-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 3 +++ arch/x86/kernel/cpu/mce/amd.c | 12 ++++++++++++ drivers/edac/mce_amd.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c1a812bd5a27..91b65d859ca8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -312,6 +312,9 @@ enum smca_bank_types { SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ SMCA_SMU, /* System Management Unit */ + SMCA_MP5, /* Microprocessor 5 Unit */ + SMCA_NBIO, /* Northbridge IO Unit */ + SMCA_PCIE, /* PCI Express Unit */ N_SMCA_BANK_TYPES }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index ed3327342b40..00f60b8c7e4f 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -93,6 +93,9 @@ static struct smca_bank_name smca_names[] = { [SMCA_PB] = { "param_block", "Parameter Block" }, [SMCA_PSP] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, + [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, + [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, }; static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = @@ -162,6 +165,15 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* System Management Unit MCA type */ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, + + /* Microprocessor 5 Unit MCA type */ + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, + + /* Northbridge IO Unit MCA type */ + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F }, + + /* PCI Express Unit MCA type */ + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index c605089d899f..5ab4ab3f0ce6 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -285,6 +285,35 @@ static const char * const smca_smu_mce_desc[] = { "SMU RAM ECC or parity error", }; +static const char * const smca_mp5_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", +}; + +static const char * const smca_nbio_mce_desc[] = { + "ECC or Parity error", + "PCIE error", + "SDP ErrEvent error", + "SDP Egress Poison Error", + "IOHC Internal Poison Error", +}; + +static const char * const smca_pcie_mce_desc[] = { + "CCIX PER Message logging", + "CCIX Read Response with Status: Non-Data Error", + "CCIX Write Response with Status: Non-Data Error", + "CCIX Read Response with Status: Data Error", + "CCIX Non-okay write response with data error", +}; + struct smca_mce_desc { const char * const *descs; unsigned int num_descs; @@ -304,6 +333,9 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, + [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, + [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, }; static bool f12h_mc0_mce(u16 ec, u8 xec) -- cgit v1.2.3-59-g8ed1b From 3ad7e748c12cc771df6020a552def3e1727e8a17 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Feb 2019 22:55:52 +0000 Subject: x86/MCE/AMD, EDAC/mce_amd: Add new McaTypes for CS, PSP, and SMU units The existing CS, PSP, and SMU SMCA bank types will see new versions (as indicated by their McaTypes) in future SMCA systems. Add the new (HWID, MCATYPE) tuples for these new versions. Reuse the same names as the older versions, since they are logically the same to the user. SMCA systems won't mix and match IP blocks with different McaType versions in the same system, so there isn't a need to distinguish them. The MCA_IPID register is saved when logging an MCA error, and that can be used to triage the error. Also, add the new error descriptions to edac_mce_amd. Some error types (positions in the list) are overloaded compared to the previous McaTypes. Therefore, just create new lists of the error descriptions to keep things simple even if some of the error descriptions are the same between versions. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Arnd Bergmann Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Pu Wen Cc: Qiuxu Zhuo Cc: Shirish S Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20190201225534.8177-3-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 3 +++ arch/x86/kernel/cpu/mce/amd.c | 6 +++++ drivers/edac/mce_amd.c | 55 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 91b65d859ca8..299a38536567 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -307,11 +307,14 @@ enum smca_bank_types { SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ SMCA_CS, /* Coherent Slave */ + SMCA_CS_V2, /* Coherent Slave */ SMCA_PIE, /* Power, Interrupts, etc. */ SMCA_UMC, /* Unified Memory Controller */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ + SMCA_PSP_V2, /* Platform Security Processor */ SMCA_SMU, /* System Management Unit */ + SMCA_SMU_V2, /* System Management Unit */ SMCA_MP5, /* Microprocessor 5 Unit */ SMCA_NBIO, /* Northbridge IO Unit */ SMCA_PCIE, /* PCI Express Unit */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 00f60b8c7e4f..bd1331b241ca 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -88,11 +88,14 @@ static struct smca_bank_name smca_names[] = { [SMCA_FP] = { "floating_point", "Floating Point Unit" }, [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, [SMCA_UMC] = { "umc", "Unified Memory Controller" }, [SMCA_PB] = { "param_block", "Parameter Block" }, [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, + [SMCA_SMU_V2] = { "smu", "System Management Unit" }, [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, @@ -153,6 +156,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Data Fabric MCA types */ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, /* Unified Memory Controller MCA type */ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, @@ -162,9 +166,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Platform Security Processor MCA type */ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, /* System Management Unit MCA type */ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, /* Microprocessor 5 Unit MCA type */ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 5ab4ab3f0ce6..184c90172d17 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -257,6 +257,23 @@ static const char * const smca_cs_mce_desc[] = { "ECC error on probe filter access", }; +static const char * const smca_cs2_mce_desc[] = { + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "SDP read response had no match in the CS queue", + "Probe Filter Protocol Error", + "Probe Filter ECC Error", + "SDP read response had an unexpected RETRY error", + "Counter overflow error", + "Counter underflow error", +}; + static const char * const smca_pie_mce_desc[] = { "HW assert", "Internal PIE register security violation", @@ -281,10 +298,45 @@ static const char * const smca_psp_mce_desc[] = { "PSP RAM ECC or parity error", }; +static const char * const smca_psp2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Instruction Cache Bank 0 ECC or parity error", + "Instruction Cache Bank 1 ECC or parity error", + "Instruction Tag Ram 0 parity error", + "Instruction Tag Ram 1 parity error", + "Data Cache Bank 0 ECC or parity error", + "Data Cache Bank 1 ECC or parity error", + "Data Cache Bank 2 ECC or parity error", + "Data Cache Bank 3 ECC or parity error", + "Data Tag Bank 0 parity error", + "Data Tag Bank 1 parity error", + "Data Tag Bank 2 parity error", + "Data Tag Bank 3 parity error", + "Dirty Data Ram parity error", + "TLB Bank 0 parity error", + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", +}; + static const char * const smca_smu_mce_desc[] = { "SMU RAM ECC or parity error", }; +static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", +}; + static const char * const smca_mp5_mce_desc[] = { "High SRAM ECC or parity error", "Low SRAM ECC or parity error", @@ -328,11 +380,14 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, -- cgit v1.2.3-59-g8ed1b From 8a5dd2cd2f2e94878cacc969655a69ca214795ab Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Feb 2019 22:55:52 +0000 Subject: x86/MCE/AMD, EDAC/mce_amd: Add new error descriptions for some SMCA bank types Some SMCA bank types on future systems will report new error types even though the bank type is not treated as a new version. These new error types will reported by bits that are reserved in past systems. Add the new error descriptions to the lists in edac_mce_amd. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Shirish S Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: https://lkml.kernel.org/r/20190201225534.8177-4-Yazen.Ghannam@amd.com --- arch/x86/kernel/cpu/mce/amd.c | 8 ++++---- drivers/edac/mce_amd.c | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index bd1331b241ca..e64de5149e50 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -144,22 +144,22 @@ static struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF }, { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, /* Data Fabric MCA types */ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F }, { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF }, /* Parameter Block MCA type */ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 184c90172d17..c79e650aa606 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -155,7 +155,7 @@ static const char * const smca_ls_mce_desc[] = { "Store queue parity", "Miss address buffer payload parity", "L1 TLB parity", - "Reserved", + "DC Tag error type 5", "DC tag error type 6", "DC tag error type 1", "Internal error type 1", @@ -222,6 +222,7 @@ static const char * const smca_ex_mce_desc[] = { "Retire status queue parity error", "Scheduling queue parity error", "Branch buffer queue parity error", + "Hardware Assertion error", }; static const char * const smca_fp_mce_desc[] = { @@ -279,6 +280,7 @@ static const char * const smca_pie_mce_desc[] = { "Internal PIE register security violation", "Error on GMI link", "Poison data written to internal PIE register", + "A deferred error was detected in the DF" }; static const char * const smca_umc_mce_desc[] = { @@ -288,6 +290,8 @@ static const char * const smca_umc_mce_desc[] = { "Advanced peripheral bus error", "Command/address parity error", "Write data CRC error", + "DCQ SRAM ECC error", + "AES SRAM ECC error", }; static const char * const smca_pb_mce_desc[] = { -- cgit v1.2.3-59-g8ed1b From e03447ee718b331be8f3abc388c7bf7d325dfab4 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Feb 2019 22:55:53 +0000 Subject: EDAC, mce_amd: Match error descriptions to latest documentation Update the error descriptions to match the latest documentation for easier searching. In some cases the changes are small and in other cases the changes may be total rewording of the description. No functional changes. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Tony Luck Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190201225534.8177-5-Yazen.Ghannam@amd.com --- drivers/edac/mce_amd.c | 166 ++++++++++++++++++++++++------------------------- 1 file changed, 83 insertions(+), 83 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index c79e650aa606..7e29ceabdf6f 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -151,74 +151,74 @@ static const char * const mc6_mce_desc[] = { /* Scalable MCA error strings */ static const char * const smca_ls_mce_desc[] = { - "Load queue parity", - "Store queue parity", - "Miss address buffer payload parity", - "L1 TLB parity", + "Load queue parity error", + "Store queue parity error", + "Miss address buffer payload parity error", + "Level 1 TLB parity error", "DC Tag error type 5", - "DC tag error type 6", - "DC tag error type 1", + "DC Tag error type 6", + "DC Tag error type 1", "Internal error type 1", "Internal error type 2", - "Sys Read data error thread 0", - "Sys read data error thread 1", - "DC tag error type 2", - "DC data error type 1 (poison consumption)", - "DC data error type 2", - "DC data error type 3", - "DC tag error type 4", - "L2 TLB parity", + "System Read Data Error Thread 0", + "System Read Data Error Thread 1", + "DC Tag error type 2", + "DC Data error type 1 and poison consumption", + "DC Data error type 2", + "DC Data error type 3", + "DC Tag error type 4", + "Level 2 TLB parity error", "PDC parity error", - "DC tag error type 3", - "DC tag error type 5", - "L2 fill data error", + "DC Tag error type 3", + "DC Tag error type 5", + "L2 Fill Data error", }; static const char * const smca_if_mce_desc[] = { - "microtag probe port parity error", - "IC microtag or full tag multi-hit error", - "IC full tag parity", - "IC data array parity", - "Decoupling queue phys addr parity error", - "L0 ITLB parity error", - "L1 ITLB parity error", - "L2 ITLB parity error", - "BPQ snoop parity on Thread 0", - "BPQ snoop parity on Thread 1", - "L1 BTB multi-match error", - "L2 BTB multi-match error", - "L2 Cache Response Poison error", - "System Read Data error", + "Op Cache Microtag Probe Port Parity Error", + "IC Microtag or Full Tag Multi-hit Error", + "IC Full Tag Parity Error", + "IC Data Array Parity Error", + "Decoupling Queue PhysAddr Parity Error", + "L0 ITLB Parity Error", + "L1 ITLB Parity Error", + "L2 ITLB Parity Error", + "BPQ Thread 0 Snoop Parity Error", + "BPQ Thread 1 Snoop Parity Error", + "L1 BTB Multi-Match Error", + "L2 BTB Multi-Match Error", + "L2 Cache Response Poison Error", + "System Read Data Error", }; static const char * const smca_l2_mce_desc[] = { - "L2M tag multi-way-hit error", - "L2M tag ECC error", - "L2M data ECC error", - "HW assert", + "L2M Tag Multiple-Way-Hit error", + "L2M Tag or State Array ECC Error", + "L2M Data Array ECC Error", + "Hardware Assert Error", }; static const char * const smca_de_mce_desc[] = { - "uop cache tag parity error", - "uop cache data parity error", - "Insn buffer parity error", - "uop queue parity error", - "Insn dispatch queue parity error", - "Fetch address FIFO parity", - "Patch RAM data parity", - "Patch RAM sequencer parity", - "uop buffer parity" + "Micro-op cache tag parity error", + "Micro-op cache data parity error", + "Instruction buffer parity error", + "Micro-op queue parity error", + "Instruction dispatch queue parity error", + "Fetch address FIFO parity error", + "Patch RAM data parity error", + "Patch RAM sequencer parity error", + "Micro-op buffer parity error" }; static const char * const smca_ex_mce_desc[] = { - "Watchdog timeout error", - "Phy register file parity", - "Flag register file parity", - "Immediate displacement register file parity", - "Address generator payload parity", - "EX payload parity", - "Checkpoint queue parity", - "Retire dispatch queue parity", + "Watchdog Timeout error", + "Physical register file parity error", + "Flag register file parity error", + "Immediate displacement register file parity error", + "Address generator payload parity error", + "EX payload parity error", + "Checkpoint queue parity error", + "Retire dispatch queue parity error", "Retire status queue parity error", "Scheduling queue parity error", "Branch buffer queue parity error", @@ -226,36 +226,36 @@ static const char * const smca_ex_mce_desc[] = { }; static const char * const smca_fp_mce_desc[] = { - "Physical register file parity", - "Freelist parity error", - "Schedule queue parity", + "Physical register file (PRF) parity error", + "Freelist (FL) parity error", + "Schedule queue parity error", "NSQ parity error", - "Retire queue parity", - "Status register file parity", + "Retire queue (RQ) parity error", + "Status register file (SRF) parity error", "Hardware assertion", }; static const char * const smca_l3_mce_desc[] = { - "Shadow tag macro ECC error", - "Shadow tag macro multi-way-hit error", - "L3M tag ECC error", - "L3M tag multi-way-hit error", - "L3M data ECC error", - "XI parity, L3 fill done channel error", - "L3 victim queue parity", - "L3 HW assert", + "Shadow Tag Macro ECC Error", + "Shadow Tag Macro Multi-way-hit Error", + "L3M Tag ECC Error", + "L3M Tag Multi-way-hit Error", + "L3M Data ECC Error", + "SDP Parity Error or SystemReadDataError from XI", + "L3 Victim Queue Parity Error", + "L3 Hardware Assertion", }; static const char * const smca_cs_mce_desc[] = { - "Illegal request from transport layer", - "Address violation", - "Security violation", - "Illegal response from transport layer", - "Unexpected response", - "Parity error on incoming request or probe response data", - "Parity error on incoming read response data", - "Atomic request parity", - "ECC error on probe filter access", + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "Probe Filter ECC Error", }; static const char * const smca_cs2_mce_desc[] = { @@ -276,30 +276,30 @@ static const char * const smca_cs2_mce_desc[] = { }; static const char * const smca_pie_mce_desc[] = { - "HW assert", - "Internal PIE register security violation", - "Error on GMI link", - "Poison data written to internal PIE register", + "Hardware Assert", + "Register security violation", + "Link Error", + "Poison data consumption", "A deferred error was detected in the DF" }; static const char * const smca_umc_mce_desc[] = { "DRAM ECC error", - "Data poison error on DRAM", + "Data poison error", "SDP parity error", "Advanced peripheral bus error", - "Command/address parity error", + "Address/Command parity error", "Write data CRC error", "DCQ SRAM ECC error", "AES SRAM ECC error", }; static const char * const smca_pb_mce_desc[] = { - "Parameter Block RAM ECC error", + "An ECC error in the Parameter Block RAM array", }; static const char * const smca_psp_mce_desc[] = { - "PSP RAM ECC or parity error", + "An ECC or parity error in a PSP RAM instance", }; static const char * const smca_psp2_mce_desc[] = { @@ -324,7 +324,7 @@ static const char * const smca_psp2_mce_desc[] = { }; static const char * const smca_smu_mce_desc[] = { - "SMU RAM ECC or parity error", + "An ECC or parity error in an SMU RAM instance", }; static const char * const smca_smu2_mce_desc[] = { -- cgit v1.2.3-59-g8ed1b From 1c1522d32ac49065f88e5a8b3d6e3a5613b20118 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 1 Feb 2019 22:55:54 +0000 Subject: EDAC, mce_amd: Print ExtErrorCode and description on a single line Save a log line by printing the extended error code and the description on a single line. This is similar to how errors are printed in other subsystems, e.g. "#, description". If we don't have a valid description then only the number/code is printed. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Tony Luck Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190201225534.8177-6-Yazen.Ghannam@amd.com --- drivers/edac/mce_amd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 7e29ceabdf6f..f286b880f981 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -965,13 +965,12 @@ static void decode_smca_error(struct mce *m) ip_name = smca_get_long_name(bank_type); - pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec); + pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); /* Only print the decode of valid error codes */ if (xec < smca_mce_descs[bank_type].num_descs && (hwid->xec_bitmap & BIT_ULL(xec))) { - pr_emerg(HW_ERR "%s Error: ", ip_name); - pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]); + pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); } if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) -- cgit v1.2.3-59-g8ed1b From 3f4da372ec8e4ce58c17ac4f2e3c8891bbfea17e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 12 Feb 2019 21:24:28 +0000 Subject: EDAC/mce_amd: Decode MCA_STATUS[Scrub] bit Previous AMD systems have had a bit in MCA_STATUS to indicate that an error was detected on a scrub operation. However, this bit was defined differently within different banks and families/models. Starting with Family 17h, MCA_STATUS[40] is either Reserved/Read-as-Zero or defined as "Scrub", for all MCA banks and CPU models. Therefore, this bit can be defined as the "Scrub" bit. Define MCA_STATUS[40] as "Scrub" and decode it in the AMD MCE decoding module for Family 17h and newer systems. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Pu Wen Cc: Qiuxu Zhuo Cc: Thomas Gleixner Cc: Tony Luck Cc: Vishal Verma Cc: x86-ml Link: https://lkml.kernel.org/r/20190212212417.107049-1-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 1 + drivers/edac/mce_amd.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 299a38536567..22d05e3835f0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -48,6 +48,7 @@ #define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */ #define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */ +#define MCI_STATUS_SCRUB BIT_ULL(40) /* Error detected during scrub operation */ /* * McaX field if set indicates a given bank supports MCA extensions: diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index f286b880f981..b349c22bb386 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1078,6 +1078,9 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (ecc) pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + if (fam >= 0x17) + pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); + pr_cont("]: 0x%016llx\n", m->status); if (m->status & MCI_STATUS_ADDRV) -- cgit v1.2.3-59-g8ed1b From a0bcd3c0b8a52ba0eb74371fa6be15ad0390ba67 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 12 Feb 2019 21:24:29 +0000 Subject: EDAC/mce_amd: Decode MCA_STATUS in bit definition order Sort the MCA_STATUS bits in decode output to follow how they are defined in the register. The order is as follows: Bit | Decode ------------ 62 | Over 61 | UC 59 | MiscV 58 | AddrV 57 | PCC 55 | TCC 53 | SyndV 46 | CECC 45 | UECC 44 | Deferred 43 | Poison 40 | Scrub [ bp: Massage a bit. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Mauro Carvalho Chehab Cc: linux-edac Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190212212417.107049-2-Yazen.Ghannam@amd.com --- drivers/edac/mce_amd.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index b349c22bb386..0a1814dad6cf 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1051,26 +1051,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_UC) ? "UE" : (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), - ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), - ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - - if (fam >= 0x15) { - pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); - - /* F15h, bank4, bit 43 is part of McaStatSubCache. */ - if (fam != 0x15 || m->bank != 4) - pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); - } + ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), + ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); if (boot_cpu_has(X86_FEATURE_SMCA)) { u32 low, high; u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); - if (!rdmsr_safe(addr, &low, &high) && (low & MCI_CONFIG_MCAX)) pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + + pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); } /* do the two bits[14:13] together */ @@ -1078,6 +1070,14 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (ecc) pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + if (fam >= 0x15) { + pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); + + /* F15h, bank4, bit 43 is part of McaStatSubCache. */ + if (fam != 0x15 || m->bank != 4) + pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); + } + if (fam >= 0x17) pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); -- cgit v1.2.3-59-g8ed1b From 41f035a86b5b72a4f947c38e94239d20d595352a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 25 Feb 2019 12:59:40 -0800 Subject: x86/mce: Improve error message when kernel cannot recover, p2 In c7d606f560e4 ("x86/mce: Improve error message when kernel cannot recover") a case was added for a machine check caused by a DATA access to poison memory from the kernel. A case should have been added also for an uncorrectable error during an instruction fetch in the kernel. Add that extra case so the error message now reads: mce: [Hardware Error]: Machine check: Instruction fetch error in kernel Fixes: c7d606f560e4 ("x86/mce: Improve error message when kernel cannot recover") Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Pu Wen Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190225205940.15226-1-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/severity.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index dc3e26e905a3..65201e180fe0 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -165,6 +165,11 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL ), + MCESEV( + PANIC, "Instruction fetch error in kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + KERNEL + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", -- cgit v1.2.3-59-g8ed1b