1 files changed, 195 insertions, 154 deletions
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 5b51079f3e3b..8ad0c5b891f4 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -42,11 +42,19 @@ static int ioda_eeh_event(struct notifier_block *nb,
 {
 	uint64_t changed_evts = (uint64_t)change;
 
-	/* We simply send special EEH event */
-	if ((changed_evts & OPAL_EVENT_PCI_ERROR) &&
-	    (events & OPAL_EVENT_PCI_ERROR) &&
-	    eeh_enabled())
+	/*
+	 * We simply send special EEH event if EEH has
+	 * been enabled, or clear pending events in
+	 * case that we enable EEH soon
+	 */
+	if (!(changed_evts & OPAL_EVENT_PCI_ERROR) ||
+	    !(events & OPAL_EVENT_PCI_ERROR))
+		return 0;
+
+	if (eeh_enabled())
 		eeh_send_failure_event(NULL);
+	else
+		opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
 	return 0;
 }
@@ -141,7 +149,9 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 	}
 
 #ifdef CONFIG_DEBUG_FS
-	if (phb->dbgfs) {
+	if (!phb->has_dbgfs && phb->dbgfs) {
+		phb->has_dbgfs = 1;
+
 		debugfs_create_file("err_injct_outbound", 0600,
 				    phb->dbgfs, hose,
 				    &ioda_eeh_outb_dbgfs_ops);
@@ -154,7 +164,14 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 	}
 #endif
 
-	phb->eeh_state |= PNV_EEH_STATE_ENABLED;
+	/* If EEH is enabled, we're going to rely on that.
+	 * Otherwise, we restore to conventional mechanism
+	 * to clear frozen PE during PCI config access.
+	 */
+	if (eeh_enabled())
+		phb->flags |= PNV_PHB_FLAG_EEH;
+	else
+		phb->flags &= ~PNV_PHB_FLAG_EEH;
 
 	return 0;
 }
@@ -250,7 +267,7 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 {
 	s64 ret = 0;
 	u8 fstate;
-	u16 pcierr;
+	__be16 pcierr;
 	u32 pe_no;
 	int result;
 	struct pci_controller *hose = pe->phb;
@@ -268,6 +285,21 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 		return EEH_STATE_NOT_SUPPORT;
 	}
 
+	/*
+	 * If we're in middle of PE reset, return normal
+	 * state to keep EEH core going. For PHB reset, we
+	 * still expect to have fenced PHB cleared with
+	 * PHB reset.
+	 */
+	if (!(pe->type & EEH_PE_PHB) &&
+	    (pe->state & EEH_PE_RESET)) {
+		result = (EEH_STATE_MMIO_ACTIVE |
+			  EEH_STATE_DMA_ACTIVE |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+		return result;
+	}
+
 	/* Retrieve PE status through OPAL */
 	pe_no = pe->addr;
 	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
@@ -284,7 +316,7 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 		result = 0;
 		result &= ~EEH_STATE_RESET_ACTIVE;
 
-		if (pcierr != OPAL_EEH_PHB_ERROR) {
+		if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) {
 			result |= EEH_STATE_MMIO_ACTIVE;
 			result |= EEH_STATE_DMA_ACTIVE;
 			result |= EEH_STATE_MMIO_ENABLED;
@@ -347,52 +379,6 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 	return result;
 }
 
-static int ioda_eeh_pe_clear(struct eeh_pe *pe)
-{
-	struct pci_controller *hose;
-	struct pnv_phb *phb;
-	u32 pe_no;
-	u8 fstate;
-	u16 pcierr;
-	s64 ret;
-
-	pe_no = pe->addr;
-	hose = pe->phb;
-	phb = pe->phb->private_data;
-
-	/* Clear the EEH error on the PE */
-	ret = opal_pci_eeh_freeze_clear(phb->opal_id,
-			pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-	if (ret) {
-		pr_err("%s: Failed to clear EEH error for "
-		       "PHB#%x-PE#%x, err=%lld\n",
-		       __func__, hose->global_number, pe_no, ret);
-		return -EIO;
-	}
-
-	/*
-	 * Read the PE state back and verify that the frozen
-	 * state has been removed.
-	 */
-	ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
-			&fstate, &pcierr, NULL);
-	if (ret) {
-		pr_err("%s: Failed to get EEH status on "
-		       "PHB#%x-PE#%x\n, err=%lld\n",
-		       __func__, hose->global_number, pe_no, ret);
-		return -EIO;
-	}
-
-	if (fstate != OPAL_EEH_STOPPED_NOT_FROZEN) {
-		pr_err("%s: Frozen state not cleared on "
-		       "PHB#%x-PE#%x, sts=%x\n",
-		       __func__, hose->global_number, pe_no, fstate);
-		return -EIO;
-	}
-
-	return 0;
-}
-
 static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
 {
 	s64 rc = OPAL_HARDWARE;
@@ -402,13 +388,16 @@ static s64 ioda_eeh_phb_poll(struct pnv_phb *phb)
 		if (rc <= 0)
 			break;
 
-		msleep(rc);
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * rc);
+		else
+			msleep(rc);
 	}
 
 	return rc;
 }
 
-static int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
+int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
 {
 	struct pnv_phb *phb = hose->private_data;
 	s64 rc = OPAL_HARDWARE;
@@ -431,9 +420,17 @@ static int ioda_eeh_phb_reset(struct pci_controller *hose, int option)
 
 	/*
 	 * Poll state of the PHB until the request is done
-	 * successfully.
+	 * successfully. The PHB reset is usually PHB complete
+	 * reset followed by hot reset on root bus. So we also
+	 * need the PCI bus settlement delay.
 	 */
 	rc = ioda_eeh_phb_poll(phb);
+	if (option == EEH_RESET_DEACTIVATE) {
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * EEH_PE_RST_SETTLE_TIME);
+		else
+			msleep(EEH_PE_RST_SETTLE_TIME);
+	}
 out:
 	if (rc != OPAL_SUCCESS)
 		return -EIO;
@@ -471,6 +468,8 @@ static int ioda_eeh_root_reset(struct pci_controller *hose, int option)
 
 	/* Poll state of the PHB until the request is done */
 	rc = ioda_eeh_phb_poll(phb);
+	if (option == EEH_RESET_DEACTIVATE)
+		msleep(EEH_PE_RST_SETTLE_TIME);
 out:
 	if (rc != OPAL_SUCCESS)
 		return -EIO;
@@ -478,32 +477,71 @@ out:
 	return 0;
 }
 
-static int ioda_eeh_bridge_reset(struct pci_controller *hose,
-		struct pci_dev *dev, int option)
+static int ioda_eeh_bridge_reset(struct pci_dev *dev, int option)
+
 {
-	u16 ctrl;
+	struct device_node *dn = pci_device_to_OF_node(dev);
+	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	int aer = edev ? edev->aer_cap : 0;
+	u32 ctrl;
 
-	pr_debug("%s: Reset device %04x:%02x:%02x.%01x with option %d\n",
-		 __func__, hose->global_number, dev->bus->number,
-		 PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), option);
+	pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+		 __func__, pci_domain_nr(dev->bus),
+		 dev->bus->number, option);
 
 	switch (option) {
 	case EEH_RESET_FUNDAMENTAL:
 	case EEH_RESET_HOT:
-		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+		/* Don't report linkDown event */
+		if (aer) {
+			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl |= PCI_ERR_UNC_SURPDN;
+                        eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+                }
+
+		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
 		ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
-		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
+		msleep(EEH_PE_RST_HOLD_TIME);
+
 		break;
 	case EEH_RESET_DEACTIVATE:
-		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+		eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl);
 		ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
-		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+		eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl);
+		msleep(EEH_PE_RST_SETTLE_TIME);
+
+		/* Continue reporting linkDown event */
+		if (aer) {
+			eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl &= ~PCI_ERR_UNC_SURPDN;
+			eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+		}
+
 		break;
 	}
 
 	return 0;
 }
 
+void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
+{
+	struct pci_controller *hose;
+
+	if (pci_is_root_bus(dev->bus)) {
+		hose = pci_bus_to_host(dev->bus);
+		ioda_eeh_root_reset(hose, EEH_RESET_HOT);
+		ioda_eeh_root_reset(hose, EEH_RESET_DEACTIVATE);
+	} else {
+		ioda_eeh_bridge_reset(dev, EEH_RESET_HOT);
+		ioda_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
+	}
+}
+
 /**
  * ioda_eeh_reset - Reset the indicated PE
  * @pe: EEH PE
@@ -523,27 +561,18 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 	int ret;
 
 	/*
-	 * Anyway, we have to clear the problematic state for the
-	 * corresponding PE. However, we needn't do it if the PE
-	 * is PHB associated. That means the PHB is having fatal
-	 * errors and it needs reset. Further more, the AIB interface
-	 * isn't reliable any more.
-	 */
-	if (!(pe->type & EEH_PE_PHB) &&
-	    (option == EEH_RESET_HOT ||
-	    option == EEH_RESET_FUNDAMENTAL)) {
-		ret = ioda_eeh_pe_clear(pe);
-		if (ret)
-			return -EIO;
-	}
-
-	/*
-	 * The rules applied to reset, either fundamental or hot reset:
+	 * For PHB reset, we always have complete reset. For those PEs whose
+	 * primary bus derived from root complex (root bus) or root port
+	 * (usually bus#1), we apply hot or fundamental reset on the root port.
+	 * For other PEs, we always have hot reset on the PE primary bus.
 	 *
-	 * We always reset the direct upstream bridge of the PE. If the
-	 * direct upstream bridge isn't root bridge, we always take hot
-	 * reset no matter what option (fundamental or hot) is. Otherwise,
-	 * we should do the reset according to the required option.
+	 * Here, we have different design to pHyp, which always clear the
+	 * frozen state during PE reset. However, the good idea here from
+	 * benh is to keep frozen state before we get PE reset done completely
+	 * (until BAR restore). With the frozen state, HW drops illegal IO
+	 * or MMIO access, which can incur recrusive frozen PE during PE
+	 * reset. The side effect is that EEH core has to clear the frozen
+	 * state explicitly after BAR restore.
 	 */
 	if (pe->type & EEH_PE_PHB) {
 		ret = ioda_eeh_phb_reset(hose, option);
@@ -553,7 +582,7 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 		    pci_is_root_bus(bus->parent))
 			ret = ioda_eeh_root_reset(hose, option);
 		else
-			ret = ioda_eeh_bridge_reset(hose, bus->self, option);
+			ret = ioda_eeh_bridge_reset(bus->self, option);
 	}
 
 	return ret;
@@ -640,22 +669,6 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose)
 	}
 }
 
-static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
-			       struct eeh_pe **pe)
-{
-	struct eeh_pe *phb_pe;
-
-	phb_pe = eeh_phb_pe_get(hose);
-	if (!phb_pe) {
-		pr_warning("%s Can't find PE for PHB#%d\n",
-			   __func__, hose->global_number);
-		return -EEXIST;
-	}
-
-	*pe = phb_pe;
-	return 0;
-}
-
 static int ioda_eeh_get_pe(struct pci_controller *hose,
 			   u16 pe_no, struct eeh_pe **pe)
 {
@@ -663,7 +676,8 @@ static int ioda_eeh_get_pe(struct pci_controller *hose,
 	struct eeh_dev dev;
 
 	/* Find the PHB PE */
-	if (ioda_eeh_get_phb_pe(hose, &phb_pe))
+	phb_pe = eeh_phb_pe_get(hose);
+	if (!phb_pe)
 		return -EEXIST;
 
 	/* Find the PE according to PE# */
@@ -691,26 +705,30 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
-	u64 frozen_pe_no;
-	u16 err_type, severity;
+	struct eeh_pe *phb_pe, *parent_pe;
+	__be64 frozen_pe_no;
+	__be16 err_type, severity;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	long rc;
-	int ret = EEH_NEXT_ERR_NONE;
+	int state, ret = EEH_NEXT_ERR_NONE;
 
 	/*
 	 * While running here, it's safe to purge the event queue.
 	 * And we should keep the cached OPAL notifier event sychronized
 	 * between the kernel and firmware.
 	 */
-	eeh_remove_event(NULL);
+	eeh_remove_event(NULL, false);
 	opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
 	list_for_each_entry(hose, &hose_list, list_node) {
 		/*
 		 * If the subordinate PCI buses of the PHB has been
-		 * removed, we needn't take care of it any more.
+		 * removed or is exactly under error recovery, we
+		 * needn't take care of it any more.
 		 */
 		phb = hose->private_data;
-		if (phb->eeh_state & PNV_EEH_STATE_REMOVED)
+		phb_pe = eeh_phb_pe_get(hose);
+		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
 			continue;
 
 		rc = opal_pci_next_error(phb->opal_id,
@@ -725,8 +743,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		}
 
 		/* If the PHB doesn't have error, stop processing */
-		if (err_type == OPAL_EEH_NO_ERROR ||
-		    severity == OPAL_EEH_SEV_NO_ERROR) {
+		if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR ||
+		    be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
 			pr_devel("%s: No error found on PHB#%x\n",
 				 __func__, hose->global_number);
 			continue;
@@ -738,20 +756,14 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		 * specific PHB.
 		 */
 		pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n",
-			 __func__, err_type, severity,
-			 frozen_pe_no, hose->global_number);
-		switch (err_type) {
+			 __func__, be16_to_cpu(err_type), be16_to_cpu(severity),
+			 be64_to_cpu(frozen_pe_no), hose->global_number);
+		switch (be16_to_cpu(err_type)) {
 		case OPAL_EEH_IOC_ERROR:
-			if (severity == OPAL_EEH_SEV_IOC_DEAD) {
-				list_for_each_entry(hose, &hose_list,
-						    list_node) {
-					phb = hose->private_data;
-					phb->eeh_state |= PNV_EEH_STATE_REMOVED;
-				}
-
+			if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) {
 				pr_err("EEH: dead IOC detected\n");
 				ret = EEH_NEXT_ERR_DEAD_IOC;
-			} else if (severity == OPAL_EEH_SEV_INF) {
+			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
 				pr_info("EEH: IOC informative error "
 					"detected\n");
 				ioda_eeh_hub_diag(hose);
@@ -760,25 +772,26 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 
 			break;
 		case OPAL_EEH_PHB_ERROR:
-			if (severity == OPAL_EEH_SEV_PHB_DEAD) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
-				pr_err("EEH: dead PHB#%x detected\n",
-					hose->global_number);
-				phb->eeh_state |= PNV_EEH_STATE_REMOVED;
+			if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
+				*pe = phb_pe;
+				pr_err("EEH: dead PHB#%x detected, "
+				       "location: %s\n",
+				       hose->global_number,
+				       eeh_pe_loc_get(phb_pe));
 				ret = EEH_NEXT_ERR_DEAD_PHB;
-			} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
-				pr_err("EEH: fenced PHB#%x detected\n",
-					hose->global_number);
+			} else if (be16_to_cpu(severity) ==
+						OPAL_EEH_SEV_PHB_FENCED) {
+				*pe = phb_pe;
+				pr_err("EEH: Fenced PHB#%x detected, "
+				       "location: %s\n",
+				       hose->global_number,
+				       eeh_pe_loc_get(phb_pe));
 				ret = EEH_NEXT_ERR_FENCED_PHB;
-			} else if (severity == OPAL_EEH_SEV_INF) {
+			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
 				pr_info("EEH: PHB#%x informative error "
-					"detected\n",
-					hose->global_number);
+					"detected, location: %s\n",
+					hose->global_number,
+					eeh_pe_loc_get(phb_pe));
 				ioda_eeh_phb_diag(hose);
 				ret = EEH_NEXT_ERR_NONE;
 			}
@@ -786,30 +799,33 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			break;
 		case OPAL_EEH_PE_ERROR:
 			/*
-			 * If we can't find the corresponding PE, the
-			 * PEEV / PEST would be messy. So we force an
-			 * fenced PHB so that it can be recovered.
+			 * If we can't find the corresponding PE, we
+			 * just try to unfreeze.
 			 */
-			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) {
-				if (!ioda_eeh_get_phb_pe(hose, pe)) {
-					pr_err("EEH: Escalated fenced PHB#%x "
-					       "detected for PE#%llx\n",
-						hose->global_number,
-						frozen_pe_no);
-					ret = EEH_NEXT_ERR_FENCED_PHB;
-				} else {
-					ret = EEH_NEXT_ERR_NONE;
-				}
+			if (ioda_eeh_get_pe(hose,
+					    be64_to_cpu(frozen_pe_no), pe)) {
+				/* Try best to clear it */
+				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+					hose->global_number, frozen_pe_no);
+				pr_info("EEH: PHB location: %s\n",
+					eeh_pe_loc_get(phb_pe));
+				opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no,
+					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+				ret = EEH_NEXT_ERR_NONE;
+			} else if ((*pe)->state & EEH_PE_ISOLATED) {
+				ret = EEH_NEXT_ERR_NONE;
 			} else {
 				pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
 					(*pe)->addr, (*pe)->phb->global_number);
+				pr_err("EEH: PE location: %s, PHB location: %s\n",
+					eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe));
 				ret = EEH_NEXT_ERR_FROZEN_PE;
 			}
 
 			break;
 		default:
 			pr_warn("%s: Unexpected error type %d\n",
-				__func__, err_type);
+				__func__, be16_to_cpu(err_type));
 		}
 
 		/*
@@ -827,6 +843,31 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		}
 
 		/*
+		 * We probably have the frozen parent PE out there and
+		 * we need have to handle frozen parent PE firstly.
+		 */
+		if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+			parent_pe = (*pe)->parent;
+			while (parent_pe) {
+				/* Hit the ceiling ? */
+				if (parent_pe->type & EEH_PE_PHB)
+					break;
+
+				/* Frozen parent PE ? */
+				state = ioda_eeh_get_state(parent_pe);
+				if (state > 0 &&
+				    (state & active_flags) != active_flags)
+					*pe = parent_pe;
+
+				/* Next parent level */
+				parent_pe = parent_pe->parent;
+			}
+
+			/* We possibly migrate to another PE */
+			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+		}
+
+		/*
 		 * If we have no errors on the specific PHB or only
 		 * informative error there, we continue poking it.
 		 * Otherwise, we need actions to be taken by upper