aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBjorn Helgaas <bhelgaas@google.com>2018-08-15 14:58:45 -0500
committerBjorn Helgaas <bhelgaas@google.com>2018-08-15 14:58:45 -0500
commit3c3ab37f4c03dc9c7c917ff3c1e71d6da81d3bd3 (patch)
treee5b3368d2615f937f928656387de24e4a3b86b1c
parentMerge branch 'for-linus' (diff)
parentPCI/AER: Don't clear AER bits if error handling is Firmware-First (diff)
downloadlinux-dev-3c3ab37f4c03dc9c7c917ff3c1e71d6da81d3bd3.tar.xz
linux-dev-3c3ab37f4c03dc9c7c917ff3c1e71d6da81d3bd3.zip
Merge branch 'pci/aer'
- Decode AER errors with names similar to "lspci" (Tyler Baicar) - Expose AER statistics in sysfs (Rajat Jain) - Clear AER status bits selectively based on the type of recovery (Oza Pawandeep) - Honor "pcie_ports=native" even if HEST sets FIRMWARE_FIRST (Alexandru Gagniuc) - Don't clear AER status bits if we're using the "Firmware-First" strategy where firmware owns the registers (Alexandru Gagniuc) * pci/aer: PCI/AER: Don't clear AER bits if error handling is Firmware-First PCI/AER: Remove duplicate PCI_EXP_AER_FLAGS definition PCI/portdrv: Remove pcie_portdrv_err_handler.slot_reset PCI/AER: Clear device status bits during ERR_COR handling PCI/AER: Clear device status bits during ERR_FATAL and ERR_NONFATAL PCI/AER: Remove ERR_FATAL code from ERR_NONFATAL path PCI/AER: Factor out ERR_NONFATAL status bit clearing PCI/AER: Clear only ERR_NONFATAL bits during non-fatal recovery PCI/AER: Clear only ERR_FATAL status bits during fatal recovery PCI/AER: Honor "pcie_ports=native" even if HEST sets FIRMWARE_FIRST PCI/AER: Add sysfs attributes for rootport cumulative stats PCI/AER: Add sysfs attributes to provide AER stats and breakdown PCI/AER: Define aer_stats structure for AER capable devices PCI/AER: Move internal declarations to drivers/pci/pci.h PCI/AER: Adopt lspci names for AER error decoding PCI/AER: Expose internal API for obtaining AER information # Conflicts: # drivers/pci/pci.h
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats122
-rw-r--r--Documentation/PCI/pcieaer-howto.txt5
-rw-r--r--drivers/pci/pci-sysfs.c3
-rw-r--r--drivers/pci/pci.h43
-rw-r--r--drivers/pci/pcie/aer.c336
-rw-r--r--drivers/pci/pcie/err.c15
-rw-r--r--drivers/pci/pcie/portdrv_pci.c25
-rw-r--r--drivers/pci/probe.c1
-rw-r--r--include/linux/pci.h5
9 files changed, 449 insertions, 106 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
new file mode 100644
index 000000000000..4b0318c99507
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
@@ -0,0 +1,122 @@
+==========================
+PCIe Device AER statistics
+==========================
+These attributes show up under all the devices that are AER capable. These
+statistical counters indicate the errors "as seen/reported by the device".
+Note that this may mean that if an endpoint is causing problems, the AER
+counters may increment at its link partner (e.g. root port) because the
+errors may be "seen" / reported by the link partner and not the
+problematic endpoint itself (which may report all counters as 0 as it never
+saw any problems).
+
+Where: /sys/bus/pci/devices/<dev>/aer_dev_correctable
+Date: July 2018
+Kernel Version: 4.19.0
+Contact: linux-pci@vger.kernel.org, rajatja@google.com
+Description: List of correctable errors seen and reported by this
+ PCI device using ERR_COR. Note that since multiple errors may
+ be reported using a single ERR_COR message,
+ TOTAL_ERR_COR at the end of the file may not match the actual
+ total of all the errors in the file. Sample output:
+-------------------------------------------------------------------------
+localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_correctable
+RxErr 2
+BadTLP 0
+BadDLLP 0
+Rollover 0
+Timeout 0
+NonFatalErr 0
+CorrIntErr 0
+HeaderOF 0
+TOTAL_ERR_COR 2
+-------------------------------------------------------------------------
+
+Where: /sys/bus/pci/devices/<dev>/aer_dev_fatal
+Date: July 2018
+Kernel Version: 4.19.0
+Contact: linux-pci@vger.kernel.org, rajatja@google.com
+Description: List of uncorrectable fatal errors seen and reported by this
+ PCI device using ERR_FATAL. Note that since multiple errors may
+ be reported using a single ERR_FATAL message,
+ TOTAL_ERR_FATAL at the end of the file may not match the actual
+ total of all the errors in the file. Sample output:
+-------------------------------------------------------------------------
+localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_fatal
+Undefined 0
+DLP 0
+SDES 0
+TLP 0
+FCP 0
+CmpltTO 0
+CmpltAbrt 0
+UnxCmplt 0
+RxOF 0
+MalfTLP 0
+ECRC 0
+UnsupReq 0
+ACSViol 0
+UncorrIntErr 0
+BlockedTLP 0
+AtomicOpBlocked 0
+TLPBlockedErr 0
+TOTAL_ERR_FATAL 0
+-------------------------------------------------------------------------
+
+Where: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal
+Date: July 2018
+Kernel Version: 4.19.0
+Contact: linux-pci@vger.kernel.org, rajatja@google.com
+Description: List of uncorrectable nonfatal errors seen and reported by this
+ PCI device using ERR_NONFATAL. Note that since multiple errors
+ may be reported using a single ERR_NONFATAL message,
+ TOTAL_ERR_NONFATAL at the end of the file may not match the
+ actual total of all the errors in the file. Sample output:
+-------------------------------------------------------------------------
+localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_nonfatal
+Undefined 0
+DLP 0
+SDES 0
+TLP 0
+FCP 0
+CmpltTO 0
+CmpltAbrt 0
+UnxCmplt 0
+RxOF 0
+MalfTLP 0
+ECRC 0
+UnsupReq 0
+ACSViol 0
+UncorrIntErr 0
+BlockedTLP 0
+AtomicOpBlocked 0
+TLPBlockedErr 0
+TOTAL_ERR_NONFATAL 0
+-------------------------------------------------------------------------
+
+============================
+PCIe Rootport AER statistics
+============================
+These attributes show up under only the rootports (or root complex event
+collectors) that are AER capable. These indicate the number of error messages as
+"reported to" the rootport. Please note that the rootports also transmit
+(internally) the ERR_* messages for errors seen by the internal rootport PCI
+device, so these counters include them and are thus cumulative of all the error
+messages on the PCI hierarchy originating at that root port.
+
+Where: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_cor
+Date: July 2018
+Kernel Version: 4.19.0
+Contact: linux-pci@vger.kernel.org, rajatja@google.com
+Description: Total number of ERR_COR messages reported to rootport.
+
+Where: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_fatal
+Date: July 2018
+Kernel Version: 4.19.0
+Contact: linux-pci@vger.kernel.org, rajatja@google.com
+Description: Total number of ERR_FATAL messages reported to rootport.
+
+Where: /sys/bus/pci/devices/<dev>/aer_rootport_total_err_nonfatal
+Date: July 2018
+Kernel Version: 4.19.0
+Contact: linux-pci@vger.kernel.org, rajatja@google.com
+Description: Total number of ERR_NONFATAL messages reported to rootport.
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index acd0dddd6bb8..48ce7903e3c6 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -73,6 +73,11 @@ In the example, 'Requester ID' means the ID of the device who sends
the error message to root port. Pls. refer to pci express specs for
other fields.
+2.4 AER Statistics / Counters
+
+When PCIe AER errors are captured, the counters / statistics are also exposed
+in the form of sysfs attributes which are documented at
+Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
3. Developer Guide
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 0c4653c1d2ce..9f1cb9051d7d 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1746,6 +1746,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = {
#endif
&pci_bridge_attr_group,
&pcie_dev_attr_group,
+#ifdef CONFIG_PCIEAER
+ &aer_stats_attr_group,
+#endif
NULL,
};
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 08817253c8a2..3ac0d99afe67 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -311,6 +311,34 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev)
return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
}
+#ifdef CONFIG_PCIEAER
+#include <linux/aer.h>
+
+#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
+
+struct aer_err_info {
+ struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
+ int error_dev_num;
+
+ unsigned int id:16;
+
+ unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
+ unsigned int __pad1:5;
+ unsigned int multi_error_valid:1;
+
+ unsigned int first_error:5;
+ unsigned int __pad2:2;
+ unsigned int tlp_header_valid:1;
+
+ unsigned int status; /* COR/UNCOR Error Status */
+ unsigned int mask; /* COR/UNCOR Error Mask */
+ struct aer_header_log_regs tlp; /* TLP Header */
+};
+
+int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
+void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
+#endif /* CONFIG_PCIEAER */
+
#ifdef CONFIG_PCI_ATS
void pci_restore_ats_state(struct pci_dev *dev);
#else
@@ -467,4 +495,19 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev,
}
#endif
+#ifdef CONFIG_PCIEAER
+void pci_no_aer(void);
+void pci_aer_init(struct pci_dev *dev);
+void pci_aer_exit(struct pci_dev *dev);
+extern const struct attribute_group aer_stats_attr_group;
+void pci_aer_clear_fatal_status(struct pci_dev *dev);
+void pci_aer_clear_device_status(struct pci_dev *dev);
+#else
+static inline void pci_no_aer(void) { }
+static inline void pci_aer_init(struct pci_dev *d) { }
+static inline void pci_aer_exit(struct pci_dev *d) { }
+static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
+static inline void pci_aer_clear_device_status(struct pci_dev *dev) { }
+#endif
+
#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a2e88386af28..4e823ae051a7 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -31,26 +31,9 @@
#include "portdrv.h"
#define AER_ERROR_SOURCES_MAX 100
-#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
-struct aer_err_info {
- struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
- int error_dev_num;
-
- unsigned int id:16;
-
- unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
- unsigned int __pad1:5;
- unsigned int multi_error_valid:1;
-
- unsigned int first_error:5;
- unsigned int __pad2:2;
- unsigned int tlp_header_valid:1;
-
- unsigned int status; /* COR/UNCOR Error Status */
- unsigned int mask; /* COR/UNCOR Error Mask */
- struct aer_header_log_regs tlp; /* TLP Header */
-};
+#define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */
+#define AER_MAX_TYPEOF_UNCOR_ERRS 26 /* as per PCI_ERR_UNCOR_STATUS*/
struct aer_err_source {
unsigned int status;
@@ -76,6 +59,42 @@ struct aer_rpc {
*/
};
+/* AER stats for the device */
+struct aer_stats {
+
+ /*
+ * Fields for all AER capable devices. They indicate the errors
+ * "as seen by this device". Note that this may mean that if an
+ * end point is causing problems, the AER counters may increment
+ * at its link partner (e.g. root port) because the errors will be
+ * "seen" by the link partner and not the problematic end point
+ * itself (which may report all counters as 0 as it never saw any
+ * problems).
+ */
+ /* Counters for different type of correctable errors */
+ u64 dev_cor_errs[AER_MAX_TYPEOF_COR_ERRS];
+ /* Counters for different type of fatal uncorrectable errors */
+ u64 dev_fatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
+ /* Counters for different type of nonfatal uncorrectable errors */
+ u64 dev_nonfatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
+ /* Total number of ERR_COR sent by this device */
+ u64 dev_total_cor_errs;
+ /* Total number of ERR_FATAL sent by this device */
+ u64 dev_total_fatal_errs;
+ /* Total number of ERR_NONFATAL sent by this device */
+ u64 dev_total_nonfatal_errs;
+
+ /*
+ * Fields for Root ports & root complex event collectors only, these
+ * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL
+ * messages received by the root port / event collector, INCLUDING the
+ * ones that are generated internally (by the rootport itself)
+ */
+ u64 rootport_total_cor_errs;
+ u64 rootport_total_fatal_errs;
+ u64 rootport_total_nonfatal_errs;
+};
+
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
PCI_ERR_UNC_ECRC| \
PCI_ERR_UNC_UNSUP| \
@@ -303,12 +322,13 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev)
if (!pci_is_pcie(dev))
return 0;
+ if (pcie_ports_native)
+ return 0;
+
if (!dev->__aer_firmware_first_valid)
aer_set_firmware_first(dev);
return dev->__aer_firmware_first;
}
-#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \
- PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
static bool aer_firmware_first;
@@ -323,6 +343,9 @@ bool aer_acpi_firmware_first(void)
.firmware_first = 0,
};
+ if (pcie_ports_native)
+ return false;
+
if (!parsed) {
apei_hest_parse(aer_hest_parse, &info);
aer_firmware_first = info.firmware_first;
@@ -357,16 +380,30 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
}
EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
+void pci_aer_clear_device_status(struct pci_dev *dev)
+{
+ u16 sta;
+
+ pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
+ pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
+}
+
int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
{
int pos;
- u32 status;
+ u32 status, sev;
pos = dev->aer_cap;
if (!pos)
return -EIO;
+ if (pcie_aer_get_firmware_first(dev))
+ return -EIO;
+
+ /* Clear status bits for ERR_NONFATAL errors only */
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
+ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev);
+ status &= ~sev;
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
@@ -374,6 +411,26 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
}
EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status);
+void pci_aer_clear_fatal_status(struct pci_dev *dev)
+{
+ int pos;
+ u32 status, sev;
+
+ pos = dev->aer_cap;
+ if (!pos)
+ return;
+
+ if (pcie_aer_get_firmware_first(dev))
+ return;
+
+ /* Clear status bits for ERR_FATAL errors only */
+ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
+ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev);
+ status &= sev;
+ if (status)
+ pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
+}
+
int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
{
int pos;
@@ -387,6 +444,9 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
if (!pos)
return -EIO;
+ if (pcie_aer_get_firmware_first(dev))
+ return -EIO;
+
port_type = pci_pcie_type(dev);
if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
@@ -402,10 +462,20 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
return 0;
}
-int pci_aer_init(struct pci_dev *dev)
+void pci_aer_init(struct pci_dev *dev)
{
dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
- return pci_cleanup_aer_error_status_regs(dev);
+
+ if (dev->aer_cap)
+ dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
+
+ pci_cleanup_aer_error_status_regs(dev);
+}
+
+void pci_aer_exit(struct pci_dev *dev)
+{
+ kfree(dev->aer_stats);
+ dev->aer_stats = NULL;
}
#define AER_AGENT_RECEIVER 0
@@ -458,52 +528,52 @@ static const char *aer_error_layer[] = {
"Transaction Layer"
};
-static const char *aer_correctable_error_string[] = {
- "Receiver Error", /* Bit Position 0 */
+static const char *aer_correctable_error_string[AER_MAX_TYPEOF_COR_ERRS] = {
+ "RxErr", /* Bit Position 0 */
NULL,
NULL,
NULL,
NULL,
NULL,
- "Bad TLP", /* Bit Position 6 */
- "Bad DLLP", /* Bit Position 7 */
- "RELAY_NUM Rollover", /* Bit Position 8 */
+ "BadTLP", /* Bit Position 6 */
+ "BadDLLP", /* Bit Position 7 */
+ "Rollover", /* Bit Position 8 */
NULL,
NULL,
NULL,
- "Replay Timer Timeout", /* Bit Position 12 */
- "Advisory Non-Fatal", /* Bit Position 13 */
- "Corrected Internal Error", /* Bit Position 14 */
- "Header Log Overflow", /* Bit Position 15 */
+ "Timeout", /* Bit Position 12 */
+ "NonFatalErr", /* Bit Position 13 */
+ "CorrIntErr", /* Bit Position 14 */
+ "HeaderOF", /* Bit Position 15 */
};
-static const char *aer_uncorrectable_error_string[] = {
+static const char *aer_uncorrectable_error_string[AER_MAX_TYPEOF_UNCOR_ERRS] = {
"Undefined", /* Bit Position 0 */
NULL,
NULL,
NULL,
- "Data Link Protocol", /* Bit Position 4 */
- "Surprise Down Error", /* Bit Position 5 */
+ "DLP", /* Bit Position 4 */
+ "SDES", /* Bit Position 5 */
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
- "Poisoned TLP", /* Bit Position 12 */
- "Flow Control Protocol", /* Bit Position 13 */
- "Completion Timeout", /* Bit Position 14 */
- "Completer Abort", /* Bit Position 15 */
- "Unexpected Completion", /* Bit Position 16 */
- "Receiver Overflow", /* Bit Position 17 */
- "Malformed TLP", /* Bit Position 18 */
+ "TLP", /* Bit Position 12 */
+ "FCP", /* Bit Position 13 */
+ "CmpltTO", /* Bit Position 14 */
+ "CmpltAbrt", /* Bit Position 15 */
+ "UnxCmplt", /* Bit Position 16 */
+ "RxOF", /* Bit Position 17 */
+ "MalfTLP", /* Bit Position 18 */
"ECRC", /* Bit Position 19 */
- "Unsupported Request", /* Bit Position 20 */
- "ACS Violation", /* Bit Position 21 */
- "Uncorrectable Internal Error", /* Bit Position 22 */
- "MC Blocked TLP", /* Bit Position 23 */
- "AtomicOp Egress Blocked", /* Bit Position 24 */
- "TLP Prefix Blocked Error", /* Bit Position 25 */
+ "UnsupReq", /* Bit Position 20 */
+ "ACSViol", /* Bit Position 21 */
+ "UncorrIntErr", /* Bit Position 22 */
+ "BlockedTLP", /* Bit Position 23 */
+ "AtomicOpBlocked", /* Bit Position 24 */
+ "TLPBlockedErr", /* Bit Position 25 */
};
static const char *aer_agent_string[] = {
@@ -513,6 +583,144 @@ static const char *aer_agent_string[] = {
"Transmitter ID"
};
+#define aer_stats_dev_attr(name, stats_array, strings_array, \
+ total_string, total_field) \
+ static ssize_t \
+ name##_show(struct device *dev, struct device_attribute *attr, \
+ char *buf) \
+{ \
+ unsigned int i; \
+ char *str = buf; \
+ struct pci_dev *pdev = to_pci_dev(dev); \
+ u64 *stats = pdev->aer_stats->stats_array; \
+ \
+ for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \
+ if (strings_array[i]) \
+ str += sprintf(str, "%s %llu\n", \
+ strings_array[i], stats[i]); \
+ else if (stats[i]) \
+ str += sprintf(str, #stats_array "_bit[%d] %llu\n",\
+ i, stats[i]); \
+ } \
+ str += sprintf(str, "TOTAL_%s %llu\n", total_string, \
+ pdev->aer_stats->total_field); \
+ return str-buf; \
+} \
+static DEVICE_ATTR_RO(name)
+
+aer_stats_dev_attr(aer_dev_correctable, dev_cor_errs,
+ aer_correctable_error_string, "ERR_COR",
+ dev_total_cor_errs);
+aer_stats_dev_attr(aer_dev_fatal, dev_fatal_errs,
+ aer_uncorrectable_error_string, "ERR_FATAL",
+ dev_total_fatal_errs);
+aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs,
+ aer_uncorrectable_error_string, "ERR_NONFATAL",
+ dev_total_nonfatal_errs);
+
+#define aer_stats_rootport_attr(name, field) \
+ static ssize_t \
+ name##_show(struct device *dev, struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct pci_dev *pdev = to_pci_dev(dev); \
+ return sprintf(buf, "%llu\n", pdev->aer_stats->field); \
+} \
+static DEVICE_ATTR_RO(name)
+
+aer_stats_rootport_attr(aer_rootport_total_err_cor,
+ rootport_total_cor_errs);
+aer_stats_rootport_attr(aer_rootport_total_err_fatal,
+ rootport_total_fatal_errs);
+aer_stats_rootport_attr(aer_rootport_total_err_nonfatal,
+ rootport_total_nonfatal_errs);
+
+static struct attribute *aer_stats_attrs[] __ro_after_init = {
+ &dev_attr_aer_dev_correctable.attr,
+ &dev_attr_aer_dev_fatal.attr,
+ &dev_attr_aer_dev_nonfatal.attr,
+ &dev_attr_aer_rootport_total_err_cor.attr,
+ &dev_attr_aer_rootport_total_err_fatal.attr,
+ &dev_attr_aer_rootport_total_err_nonfatal.attr,
+ NULL
+};
+
+static umode_t aer_stats_attrs_are_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ if (!pdev->aer_stats)
+ return 0;
+
+ if ((a == &dev_attr_aer_rootport_total_err_cor.attr ||
+ a == &dev_attr_aer_rootport_total_err_fatal.attr ||
+ a == &dev_attr_aer_rootport_total_err_nonfatal.attr) &&
+ pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT)
+ return 0;
+
+ return a->mode;
+}
+
+const struct attribute_group aer_stats_attr_group = {
+ .attrs = aer_stats_attrs,
+ .is_visible = aer_stats_attrs_are_visible,
+};
+
+static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
+ struct aer_err_info *info)
+{
+ int status, i, max = -1;
+ u64 *counter = NULL;
+ struct aer_stats *aer_stats = pdev->aer_stats;
+
+ if (!aer_stats)
+ return;
+
+ switch (info->severity) {
+ case AER_CORRECTABLE:
+ aer_stats->dev_total_cor_errs++;
+ counter = &aer_stats->dev_cor_errs[0];
+ max = AER_MAX_TYPEOF_COR_ERRS;
+ break;
+ case AER_NONFATAL:
+ aer_stats->dev_total_nonfatal_errs++;
+ counter = &aer_stats->dev_nonfatal_errs[0];
+ max = AER_MAX_TYPEOF_UNCOR_ERRS;
+ break;
+ case AER_FATAL:
+ aer_stats->dev_total_fatal_errs++;
+ counter = &aer_stats->dev_fatal_errs[0];
+ max = AER_MAX_TYPEOF_UNCOR_ERRS;
+ break;
+ }
+
+ status = (info->status & ~info->mask);
+ for (i = 0; i < max; i++)
+ if (status & (1 << i))
+ counter[i]++;
+}
+
+static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
+ struct aer_err_source *e_src)
+{
+ struct aer_stats *aer_stats = pdev->aer_stats;
+
+ if (!aer_stats)
+ return;
+
+ if (e_src->status & PCI_ERR_ROOT_COR_RCV)
+ aer_stats->rootport_total_cor_errs++;
+
+ if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
+ if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
+ aer_stats->rootport_total_fatal_errs++;
+ else
+ aer_stats->rootport_total_nonfatal_errs++;
+ }
+}
+
static void __print_tlp_header(struct pci_dev *dev,
struct aer_header_log_regs *t)
{
@@ -545,9 +753,10 @@ static void __aer_print_error(struct pci_dev *dev,
pci_err(dev, " [%2d] Unknown Error Bit%s\n",
i, info->first_error == i ? " (First)" : "");
}
+ pci_dev_aer_stats_incr(dev, info);
}
-static void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
+void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
int layer, agent;
int id = ((dev->bus->number << 8) | dev->devfn);
@@ -799,6 +1008,7 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
+ pci_aer_clear_device_status(dev);
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
@@ -876,7 +1086,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
#endif
/**
- * get_device_error_info - read error status from dev and store it to info
+ * aer_get_device_error_info - read error status from dev and store it to info
* @dev: pointer to the device expected to have a error record
* @info: pointer to structure to store the error record
*
@@ -884,7 +1094,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
*
* Note that @info is reused among all error devices. Clear fields properly.
*/
-static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
+int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
{
int pos, temp;
@@ -942,11 +1152,11 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
/* Report all before handle them, not to lost records by reset etc. */
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
- if (get_device_error_info(e_info->dev[i], e_info))
+ if (aer_get_device_error_info(e_info->dev[i], e_info))
aer_print_error(e_info->dev[i], e_info);
}
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
- if (get_device_error_info(e_info->dev[i], e_info))
+ if (aer_get_device_error_info(e_info->dev[i], e_info))
handle_error_source(e_info->dev[i], e_info);
}
}
@@ -962,6 +1172,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
struct pci_dev *pdev = rpc->rpd;
struct aer_err_info *e_info = &rpc->e_info;
+ pci_rootport_aer_stats_incr(pdev, e_src);
+
/*
* There is a possibility that both correctable error and
* uncorrectable error being logged. Report correctable error first.
@@ -1336,20 +1548,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
*/
static void aer_error_resume(struct pci_dev *dev)
{
- int pos;
- u32 status, mask;
- u16 reg16;
-
- /* Clean up Root device status */
- pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16);
- pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
-
- /* Clean AER Root Error Status */
- pos = dev->aer_cap;
- pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
- pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
- status &= ~mask; /* Clear corresponding nonfatal bits */
- pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
+ pci_aer_clear_device_status(dev);
+ pci_cleanup_aer_uncorrect_error_status(dev);
}
static struct pcie_port_service_driver aerdriver = {
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f02e334beb45..674984a9277a 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
dev->error_state = state;
pci_walk_bus(dev->subordinate, cb, &result_data);
if (cb == report_resume) {
+ pci_aer_clear_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
dev->error_state = pci_channel_io_normal;
}
@@ -259,15 +260,10 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
/*
* If the error is reported by an end point, we think this
* error is related to the upstream link of the end point.
+ * The error is non fatal so the bus is ok; just invoke
+ * the callback for the function that logged the error.
*/
- if (state == pci_channel_io_normal)
- /*
- * the error is non fatal so the bus is ok, just invoke
- * the callback for the function that logged the error.
- */
- cb(dev, &result_data);
- else
- pci_walk_bus(dev->bus, cb, &result_data);
+ cb(dev, &result_data);
}
return result_data.result;
@@ -317,7 +313,8 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
* do error recovery on all subordinates of the bridge instead
* of the bridge and clear the error status of the bridge.
*/
- pci_cleanup_aer_uncorrect_error_status(dev);
+ pci_aer_clear_fatal_status(dev);
+ pci_aer_clear_device_status(dev);
}
if (result == PCI_ERS_RESULT_RECOVERED) {
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 973f1b80a038..b78840f54a9b 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup);
/* global data */
-static int pcie_portdrv_restore_config(struct pci_dev *dev)
-{
- int retval;
-
- retval = pci_enable_device(dev);
- if (retval)
- return retval;
- pci_set_master(dev);
- return 0;
-}
-
#ifdef CONFIG_PM
static int pcie_port_runtime_suspend(struct device *dev)
{
@@ -160,19 +149,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev)
return PCI_ERS_RESULT_RECOVERED;
}
-static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
-{
- /* If fatal, restore cfg space for possible link reset at upstream */
- if (dev->error_state == pci_channel_io_frozen) {
- dev->state_saved = true;
- pci_restore_state(dev);
- pcie_portdrv_restore_config(dev);
- pci_enable_pcie_error_reporting(dev);
- }
-
- return PCI_ERS_RESULT_RECOVERED;
-}
-
static int resume_iter(struct device *device, void *data)
{
struct pcie_device *pcie_device;
@@ -208,7 +184,6 @@ static const struct pci_device_id port_pci_ids[] = { {
static const struct pci_error_handlers pcie_portdrv_err_handler = {
.error_detected = pcie_portdrv_error_detected,
.mmio_enabled = pcie_portdrv_mmio_enabled,
- .slot_reset = pcie_portdrv_slot_reset,
.resume = pcie_portdrv_err_resume,
};
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 611adcd9c169..9472da27e202 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev)
static void pci_release_capabilities(struct pci_dev *dev)
{
+ pci_aer_exit(dev);
pci_vpd_release(dev);
pci_iov_release(dev);
pci_free_cap_save_buffers(dev);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c133ccfa002e..d78f46f070c2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -299,6 +299,7 @@ struct pci_dev {
u8 hdr_type; /* PCI header type (`multi' flag masked out) */
#ifdef CONFIG_PCIEAER
u16 aer_cap; /* AER capability offset */
+ struct aer_stats *aer_stats; /* AER stats for this device */
#endif
u8 pcie_cap; /* PCIe capability offset */
u8 msi_cap; /* MSI capability offset */
@@ -1469,13 +1470,9 @@ static inline bool pcie_aspm_support_enabled(void) { return false; }
#endif
#ifdef CONFIG_PCIEAER
-void pci_no_aer(void);
bool pci_aer_available(void);
-int pci_aer_init(struct pci_dev *dev);
#else
-static inline void pci_no_aer(void) { }
static inline bool pci_aer_available(void) { return false; }
-static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; }
#endif
#ifdef CONFIG_PCIE_ECRC