diff options
Diffstat (limited to 'arch/powerpc/kernel/eeh_driver.c')
-rw-r--r-- | arch/powerpc/kernel/eeh_driver.c | 205 |
1 files changed, 95 insertions, 110 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 0c0b66fc5bfb..b8a329f04814 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -207,18 +207,18 @@ static void *eeh_report_error(void *data, void *userdata) if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) return NULL; + + device_lock(&dev->dev); dev->error_state = pci_channel_io_frozen; driver = eeh_pcid_get(dev); - if (!driver) return NULL; + if (!driver) goto out_no_dev; eeh_disable_irq(dev); if (!driver->err_handler || - !driver->err_handler->error_detected) { - eeh_pcid_put(dev); - return NULL; - } + !driver->err_handler->error_detected) + goto out; rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); @@ -227,8 +227,12 @@ static void *eeh_report_error(void *data, void *userdata) if (*res == PCI_ERS_RESULT_NONE) *res = rc; edev->in_error = true; - eeh_pcid_put(dev); pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); + +out: + eeh_pcid_put(dev); +out_no_dev: + device_unlock(&dev->dev); return NULL; } @@ -251,15 +255,14 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata) if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) return NULL; + device_lock(&dev->dev); driver = eeh_pcid_get(dev); - if (!driver) return NULL; + if (!driver) goto out_no_dev; if (!driver->err_handler || !driver->err_handler->mmio_enabled || - (edev->mode & EEH_DEV_NO_HANDLER)) { - eeh_pcid_put(dev); - return NULL; - } + (edev->mode & EEH_DEV_NO_HANDLER)) + goto out; rc = driver->err_handler->mmio_enabled(dev); @@ -267,7 +270,10 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata) if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; +out: eeh_pcid_put(dev); +out_no_dev: + device_unlock(&dev->dev); return NULL; } @@ -290,20 +296,20 @@ static void *eeh_report_reset(void *data, void *userdata) if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) return NULL; + + device_lock(&dev->dev); dev->error_state = pci_channel_io_normal; driver = eeh_pcid_get(dev); - if (!driver) return NULL; + if (!driver) goto out_no_dev; eeh_enable_irq(dev); if (!driver->err_handler || !driver->err_handler->slot_reset || (edev->mode & EEH_DEV_NO_HANDLER) || - (!edev->in_error)) { - eeh_pcid_put(dev); - return NULL; - } + (!edev->in_error)) + goto out; rc = driver->err_handler->slot_reset(dev); if ((*res == PCI_ERS_RESULT_NONE) || @@ -311,7 +317,10 @@ static void *eeh_report_reset(void *data, void *userdata) if (*res == PCI_ERS_RESULT_DISCONNECT && rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; +out: eeh_pcid_put(dev); +out_no_dev: + device_unlock(&dev->dev); return NULL; } @@ -362,10 +371,12 @@ static void *eeh_report_resume(void *data, void *userdata) if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) return NULL; + + device_lock(&dev->dev); dev->error_state = pci_channel_io_normal; driver = eeh_pcid_get(dev); - if (!driver) return NULL; + if (!driver) goto out_no_dev; was_in_error = edev->in_error; edev->in_error = false; @@ -375,18 +386,20 @@ static void *eeh_report_resume(void *data, void *userdata) !driver->err_handler->resume || (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) { edev->mode &= ~EEH_DEV_NO_HANDLER; - eeh_pcid_put(dev); - return NULL; + goto out; } driver->err_handler->resume(dev); - eeh_pcid_put(dev); pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED); +out: + eeh_pcid_put(dev); #ifdef CONFIG_PCI_IOV if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); #endif +out_no_dev: + device_unlock(&dev->dev); return NULL; } @@ -406,23 +419,26 @@ static void *eeh_report_failure(void *data, void *userdata) if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe)) return NULL; + + device_lock(&dev->dev); dev->error_state = pci_channel_io_perm_failure; driver = eeh_pcid_get(dev); - if (!driver) return NULL; + if (!driver) goto out_no_dev; eeh_disable_irq(dev); if (!driver->err_handler || - !driver->err_handler->error_detected) { - eeh_pcid_put(dev); - return NULL; - } + !driver->err_handler->error_detected) + goto out; driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); - eeh_pcid_put(dev); pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); +out: + eeh_pcid_put(dev); +out_no_dev: + device_unlock(&dev->dev); return NULL; } @@ -619,17 +635,19 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) /** * eeh_reset_device - Perform actual reset of a pci slot + * @driver_eeh_aware: Does the device's driver provide EEH support? * @pe: EEH PE * @bus: PCI bus corresponding to the isolcated slot + * @rmv_data: Optional, list to record removed devices * * This routine must be called to do reset on the indicated PE. * During the reset, udev might be invoked because those affected * PCI devices will be removed and then added. */ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, - struct eeh_rmv_data *rmv_data) + struct eeh_rmv_data *rmv_data, + bool driver_eeh_aware) { - struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); time64_t tstamp; int cnt, rc; struct eeh_dev *edev; @@ -645,16 +663,12 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); - if (bus) { - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - } else { - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - } - } else if (frozen_bus) { + if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); + } else { + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); } /* @@ -689,8 +703,9 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * the device up before the scripts have taken it down, * potentially weird things happen. */ - if (bus) { - pr_info("EEH: Sleep 5s ahead of complete hotplug\n"); + if (!driver_eeh_aware || rmv_data->removed) { + pr_info("EEH: Sleep 5s ahead of %s hotplug\n", + (driver_eeh_aware ? "partial" : "complete")); ssleep(5); /* @@ -703,19 +718,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (pe->type & EEH_PE_VF) { eeh_add_virt_device(edev, NULL); } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + if (!driver_eeh_aware) + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); pci_hp_add_devices(bus); } - } else if (frozen_bus && rmv_data->removed) { - pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); - ssleep(5); - - edev = list_first_entry(&pe->edevs, struct eeh_dev, list); - eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); - if (pe->type & EEH_PE_VF) - eeh_add_virt_device(edev, NULL); - else - pci_hp_add_devices(frozen_bus); } eeh_pe_state_clear(pe, EEH_PE_KEEP); @@ -733,28 +739,42 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, /** * eeh_handle_normal_event - Handle EEH events on a specific PE - * @pe: EEH PE + * @pe: EEH PE - which should not be used after we return, as it may + * have been invalidated. * * Attempts to recover the given PE. If recovery fails or the PE has failed * too many times, remove the PE. * - * Returns true if @pe should no longer be used, else false. + * While PHB detects address or data parity errors on particular PCI + * slot, the associated PE will be frozen. Besides, DMA's occurring + * to wild addresses (which usually happen due to bugs in device + * drivers or in PCI adapter firmware) can cause EEH error. #SERR, + * #PERR or other misc PCI-related errors also can trigger EEH errors. + * + * Recovery process consists of unplugging the device driver (which + * generated hotplug events to userspace), then issuing a PCI #RST to + * the device, then reconfiguring the PCI config space for all bridges + * & devices under this slot, and then finally restarting the device + * drivers (which cause a second set of hotplug events to go out to + * userspace). */ -static bool eeh_handle_normal_event(struct eeh_pe *pe) +void eeh_handle_normal_event(struct eeh_pe *pe) { - struct pci_bus *frozen_bus; + struct pci_bus *bus; struct eeh_dev *edev, *tmp; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; - frozen_bus = eeh_pe_bus_get(pe); - if (!frozen_bus) { + bus = eeh_pe_bus_get(pe); + if (!bus) { pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", __func__, pe->phb->global_number, pe->addr); - return false; + return; } + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + eeh_pe_update_time_stamp(pe); pe->freeze_count++; if (pe->freeze_count > eeh_max_freezes) { @@ -806,7 +826,7 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe) */ if (result == PCI_ERS_RESULT_NONE) { pr_info("EEH: Reset with hotplug activity\n"); - rc = eeh_reset_device(pe, frozen_bus, NULL); + rc = eeh_reset_device(pe, bus, NULL, false); if (rc) { pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); @@ -858,7 +878,7 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe) /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { pr_info("EEH: Reset without hotplug activity\n"); - rc = eeh_reset_device(pe, NULL, &rmv_data); + rc = eeh_reset_device(pe, bus, &rmv_data, true); if (rc) { pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); @@ -891,7 +911,7 @@ static bool eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Notify device driver to resume\n"); eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); - return false; + goto final; hard_fail: /* @@ -916,23 +936,21 @@ hard_fail: * all removed devices correctly to avoid access * the their PCI config any more. */ - if (frozen_bus) { - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - - pci_lock_rescan_remove(); - pci_hp_remove_devices(frozen_bus); - pci_unlock_rescan_remove(); + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - /* The passed PE should no longer be used */ - return true; - } + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; } - return false; +final: + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); } /** @@ -942,7 +960,7 @@ hard_fail: * specific PE. Iterates through possible failures and handles them as * necessary. */ -static void eeh_handle_special_event(void) +void eeh_handle_special_event(void) { struct eeh_pe *pe, *phb_pe; struct pci_bus *bus; @@ -1005,15 +1023,7 @@ static void eeh_handle_special_event(void) */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { - /* - * eeh_handle_normal_event() can make the PE stale if it - * determines that the PE cannot possibly be recovered. - * Don't modify the PE state if that's the case. - */ - if (eeh_handle_normal_event(pe)) - continue; - - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_handle_normal_event(pe); } else { pci_lock_rescan_remove(); list_for_each_entry(hose, &hose_list, list_node) { @@ -1049,28 +1059,3 @@ static void eeh_handle_special_event(void) break; } while (rc != EEH_NEXT_ERR_NONE); } - -/** - * eeh_handle_event - Reset a PCI device after hard lockup. - * @pe: EEH PE - * - * While PHB detects address or data parity errors on particular PCI - * slot, the associated PE will be frozen. Besides, DMA's occurring - * to wild addresses (which usually happen due to bugs in device - * drivers or in PCI adapter firmware) can cause EEH error. #SERR, - * #PERR or other misc PCI-related errors also can trigger EEH errors. - * - * Recovery process consists of unplugging the device driver (which - * generated hotplug events to userspace), then issuing a PCI #RST to - * the device, then reconfiguring the PCI config space for all bridges - * & devices under this slot, and then finally restarting the device - * drivers (which cause a second set of hotplug events to go out to - * userspace). - */ -void eeh_handle_event(struct eeh_pe *pe) -{ - if (pe) - eeh_handle_normal_event(pe); - else - eeh_handle_special_event(); -} |