From be0aec23bf4624fd55650629fe8df20483487049 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:18 +0100 Subject: x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors For Scalable MCA enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. [ Minor fixups. ] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- drivers/edac/mce_amd.c | 335 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 332 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e3a945ce374b..49768c08ac07 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; +/* Scalable MCA error strings */ +static const char * const f17h_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", + "", /* reserved */ + "DC tag error type 6", + "DC tag error type 1", + "Internal error type 1", + "Internal error type 2", + "Sys Read data error thread 0", + "Sys read data error thread 1", + "DC tag error type 2", + "DC data error type 1 (poison comsumption)", + "DC data error type 2", + "DC data error type 3", + "DC tag error type 4", + "L2 TLB parity", + "PDC parity error", + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", +}; + +static const char * const f17h_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or full tag multi-hit error", + "IC full tag parity", + "IC data array parity", + "Decoupling queue phys addr parity error", + "L0 ITLB parity error", + "L1 ITLB parity error", + "L2 ITLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", + "L1 BTB multi-match error", + "L2 BTB multi-match error", +}; + +static const char * const f17h_l2_mce_desc[] = { + "L2M tag multi-way-hit error", + "L2M tag ECC error", + "L2M data ECC error", + "HW assert", +}; + +static const char * const f17h_de_mce_desc[] = { + "uop cache tag parity error", + "uop cache data parity error", + "Insn buffer parity error", + "Insn dispatch queue parity error", + "Fetch address FIFO parity", + "Patch RAM data parity", + "Patch RAM sequencer parity", + "uop buffer parity" +}; + +static const char * const f17h_ex_mce_desc[] = { + "Watchdog timeout error", + "Phy register file parity", + "Flag register file parity", + "Immediate displacement register file parity", + "Address generator payload parity", + "EX payload parity", + "Checkpoint queue parity", + "Retire dispatch queue parity", +}; + +static const char * const f17h_fp_mce_desc[] = { + "Physical register file parity", + "Freelist parity error", + "Schedule queue parity", + "NSQ parity error", + "Retire queue parity", + "Status register file parity", +}; + +static const char * const f17h_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", + "XI parity, L3 fill done channel error", + "L3 victim queue parity", + "L3 HW assert", +}; + +static const char * const f17h_cs_mce_desc[] = { + "Illegal request from transport layer", + "Address violation", + "Security violation", + "Illegal response from transport layer", + "Unexpected response", + "Parity error on incoming request or probe response data", + "Parity error on incoming read response data", + "Atomic request parity", + "ECC error on probe filter access", +}; + +static const char * const f17h_pie_mce_desc[] = { + "HW assert", + "Internal PIE register security violation", + "Error on GMI link", + "Poison data written to internal PIE register", +}; + +static const char * const f17h_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", + "SDP parity error", + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", +}; + +static const char * const f17h_pb_mce_desc[] = { + "Parameter Block RAM ECC error", +}; + +static const char * const f17h_psp_mce_desc[] = { + "PSP RAM ECC or parity error", +}; + +static const char * const f17h_smu_mce_desc[] = { + "SMU RAM ECC or parity error", +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } +static void decode_f17h_core_errors(const char *ip_name, u8 xec, + unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "%s Error: ", ip_name); + + switch (mca_type) { + case SMCA_LS: + error_desc_array = f17h_ls_mce_desc; + len = ARRAY_SIZE(f17h_ls_mce_desc) - 1; + + if (xec == 0x4) { + pr_cont("Unrecognized LS MCA error code.\n"); + return; + } + break; + + case SMCA_IF: + error_desc_array = f17h_if_mce_desc; + len = ARRAY_SIZE(f17h_if_mce_desc) - 1; + break; + + case SMCA_L2_CACHE: + error_desc_array = f17h_l2_mce_desc; + len = ARRAY_SIZE(f17h_l2_mce_desc) - 1; + break; + + case SMCA_DE: + error_desc_array = f17h_de_mce_desc; + len = ARRAY_SIZE(f17h_de_mce_desc) - 1; + break; + + case SMCA_EX: + error_desc_array = f17h_ex_mce_desc; + len = ARRAY_SIZE(f17h_ex_mce_desc) - 1; + break; + + case SMCA_FP: + error_desc_array = f17h_fp_mce_desc; + len = ARRAY_SIZE(f17h_fp_mce_desc) - 1; + break; + + case SMCA_L3_CACHE: + error_desc_array = f17h_l3_mce_desc; + len = ARRAY_SIZE(f17h_l3_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA core error info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_core_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +static void decode_df_errors(u8 xec, unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "Data Fabric Error: "); + + switch (mca_type) { + case SMCA_CS: + error_desc_array = f17h_cs_mce_desc; + len = ARRAY_SIZE(f17h_cs_mce_desc) - 1; + break; + + case SMCA_PIE: + error_desc_array = f17h_pie_mce_desc; + len = ARRAY_SIZE(f17h_pie_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA Data Fabric info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_df_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_errors(struct mce *m) +{ + u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); + unsigned int hwid, mca_type, i; + u8 xec = XEC(m->status, xec_mask); + const char * const *error_desc_array; + const char *ip_name; + u32 low, high; + size_t len; + + if (rdmsr_safe(addr, &low, &high)) { + pr_emerg("Invalid IP block specified, error information is unreliable.\n"); + return; + } + + hwid = high & MCI_IPID_HWID; + mca_type = (high & MCI_IPID_MCATYPE) >> 16; + + pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low); + + /* + * Based on hwid and mca_type values, decode errors from respective IPs. + * Note: mca_type values make sense only in the context of an hwid. + */ + for (i = 0; i < ARRAY_SIZE(amd_hwids); i++) + if (amd_hwids[i].hwid == hwid) + break; + + switch (i) { + case SMCA_F17H_CORE: + ip_name = (mca_type == SMCA_L3_CACHE) ? + "L3 Cache" : "F17h Core"; + return decode_f17h_core_errors(ip_name, xec, mca_type); + break; + + case SMCA_DF: + return decode_df_errors(xec, mca_type); + break; + + case SMCA_UMC: + error_desc_array = f17h_umc_mce_desc; + len = ARRAY_SIZE(f17h_umc_mce_desc) - 1; + break; + + case SMCA_PB: + error_desc_array = f17h_pb_mce_desc; + len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; + break; + + case SMCA_PSP: + error_desc_array = f17h_psp_mce_desc; + len = ARRAY_SIZE(f17h_psp_mce_desc) - 1; + break; + + case SMCA_SMU: + error_desc_array = f17h_smu_mce_desc; + len = ARRAY_SIZE(f17h_smu_mce_desc) - 1; + break; + + default: + pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid); + return; + } + + ip_name = amd_hwids[i].name; + pr_emerg(HW_ERR "%s Error: ", ip_name); + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + static inline void amd_decode_err_code(u16 ec) { if (INT_ERROR(ec)) { @@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; + u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15 || c->x86 == 0x16) + if (c->x86 >= 0x15) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + if (!!(ebx & BIT(3))) { + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(addr, &low, &high) && + (low & MCI_CONFIG_MCAX)) + pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + } + /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) @@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + if (!!(ebx & BIT(3))) { + decode_smca_errors(m); + goto err_code; + } + if (!fam_ops) goto err_code; @@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -888,10 +1204,18 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f16h_mc2_mce; break; + case 0x17: + ebx = cpuid_ebx(0x80000007); + xec_mask = 0x3f; + if (!(ebx & BIT(3))) { + printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); + goto err_out; + } + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); - kfree(fam_ops); - fam_ops = NULL; + goto err_out; } pr_info("MCE: In-kernel MCE decoding enabled.\n"); @@ -899,6 +1223,11 @@ static int __init mce_amd_init(void) mce_register_decode_chain(&amd_mce_dec_nb); return 0; + +err_out: + kfree(fam_ops); + fam_ops = NULL; + return -EINVAL; } early_initcall(mce_amd_init); -- cgit v1.2.3-59-g8ed1b From eb1af3b71f9d83e45f2fd2fd649356e98e1c582c Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 9 Mar 2016 16:40:48 -0800 Subject: EDAC/sb_edac: Fix computation of channel address Large memory Haswell-EX systems with multiple DIMMs per channel were sometimes reporting the wrong DIMM. Found three problems: 1) Debug printouts for socket and channel interleave were not interpreting the register fields correctly. The socket interleave field is a 2^X value (0=1, 1=2, 2=4, 3=8). The channel interleave is X+1 (0=1, 1=2, 2=3. 3=4). 2) Actual use of the socket interleave value didn't interpret as 2^X 3) Conversion of address to channel address was complicated, and wrong. Signed-off-by: Tony Luck Acked-by: Aristeu Rozanski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/edac/sb_edac.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index e438ee5b433f..34e54a313bf4 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1839,8 +1839,8 @@ static void get_memory_layout(const struct mem_ctl_info *mci) edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n", n_tads, gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, - (u32)TAD_SOCK(reg), - (u32)TAD_CH(reg), + (u32)(1 << TAD_SOCK(reg)), + (u32)TAD_CH(reg) + 1, (u32)TAD_TGT0(reg), (u32)TAD_TGT1(reg), (u32)TAD_TGT2(reg), @@ -2118,7 +2118,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, } ch_way = TAD_CH(reg) + 1; - sck_way = TAD_SOCK(reg) + 1; + sck_way = 1 << TAD_SOCK(reg); if (ch_way == 3) idx = addr >> 6; @@ -2175,7 +2175,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, n_tads, addr, limit, - (u32)TAD_SOCK(reg), + sck_way, ch_way, offset, idx, @@ -2190,18 +2190,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci, offset, addr); return -EINVAL; } - addr -= offset; - /* Store the low bits [0:6] of the addr */ - ch_addr = addr & 0x7f; - /* Remove socket wayness and remove 6 bits */ - addr >>= 6; - addr = div_u64(addr, sck_xch); -#if 0 - /* Divide by channel way */ - addr = addr / ch_way; -#endif - /* Recover the last 6 bits */ - ch_addr |= addr << 6; + + ch_addr = addr - offset; + ch_addr >>= (6 + shiftup); + ch_addr /= ch_way * sck_way; + ch_addr <<= (6 + shiftup); + ch_addr |= addr & ((1 << (6 + shiftup)) - 1); /* * Step 3) Decode rank -- cgit v1.2.3-59-g8ed1b