diff options
Diffstat (limited to 'arch/powerpc/platforms/powernv')
28 files changed, 3612 insertions, 939 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 9fced3f6d2dc..45a8ed0585cd 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -11,11 +11,13 @@ config PPC_POWERNV select PPC_UDBG_16550 select PPC_SCOM select ARCH_RANDOM - default y - -config POWERNV_MSI - bool "Support PCI MSI on PowerNV platform" - depends on PCI_MSI + select CPU_FREQ + select CPU_FREQ_GOV_PERFORMANCE + select CPU_FREQ_GOV_POWERSAVE + select CPU_FREQ_GOV_USERSPACE + select CPU_FREQ_GOV_ONDEMAND + select CPU_FREQ_GOV_CONSERVATIVE + select PPC_DOORBELL default y config PPC_POWERNV_RTAS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 873fa1370dc4..4ad227d04c1a 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,8 +1,10 @@ -obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o +obj-y += setup.o opal-wrappers.o opal.o opal-async.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o -obj-y += rng.o +obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o +obj-y += opal-msglog.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o +obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index d7ddcee7feb8..8ad0c5b891f4 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -14,7 +14,6 @@ #include <linux/bootmem.h> #include <linux/debugfs.h> #include <linux/delay.h> -#include <linux/init.h> #include <linux/io.h> #include <linux/irq.h> #include <linux/kernel.h> @@ -43,10 +42,19 @@ static int ioda_eeh_event(struct notifier_block *nb, { uint64_t changed_evts = (uint64_t)change; - /* We simply send special EEH event */ - if ((changed_evts & OPAL_EVENT_PCI_ERROR) && - (events & OPAL_EVENT_PCI_ERROR)) + /* + * We simply send special EEH event if EEH has + * been enabled, or clear pending events in + * case that we enable EEH soon + */ + if (!(changed_evts & OPAL_EVENT_PCI_ERROR) || + !(events & OPAL_EVENT_PCI_ERROR)) + return 0; + + if (eeh_enabled()) eeh_send_failure_event(NULL); + else + opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); return 0; } @@ -114,6 +122,7 @@ DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get, ioda_eeh_inbB_dbgfs_set, "0x%llx\n"); #endif /* CONFIG_DEBUG_FS */ + /** * ioda_eeh_post_init - Chip dependent post initialization * @hose: PCI controller @@ -140,7 +149,9 @@ static int ioda_eeh_post_init(struct pci_controller *hose) } #ifdef CONFIG_DEBUG_FS - if (phb->dbgfs) { + if (!phb->has_dbgfs && phb->dbgfs) { + phb->has_dbgfs = 1; + debugfs_create_file("err_injct_outbound", 0600, phb->dbgfs, hose, &ioda_eeh_outb_dbgfs_ops); @@ -153,7 +164,14 @@ static int ioda_eeh_post_init(struct pci_controller *hose) } #endif - phb->eeh_state |= PNV_EEH_STATE_ENABLED; + /* If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. + */ + if (eeh_enabled()) + phb->flags |= PNV_PHB_FLAG_EEH; + else + phb->flags &= ~PNV_PHB_FLAG_EEH; return 0; } @@ -221,6 +239,22 @@ static int ioda_eeh_set_option(struct eeh_pe *pe, int option) return ret; } +static void ioda_eeh_phb_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + long rc; + + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, + PNV_PCI_DIAG_BUF_SIZE); + if (rc != OPAL_SUCCESS) { + pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", + __func__, hose->global_number, rc); + return; + } + + pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); +} + /** * ioda_eeh_get_state - Retrieve the state of PE * @pe: EEH PE @@ -233,7 +267,7 @@ static int ioda_eeh_get_state(struct eeh_pe *pe) { s64 ret = 0; u8 fstate; - u16 pcierr; + __be16 pcierr; u32 pe_no; int result; struct pci_controller *hose = pe->phb; @@ -251,6 +285,21 @@ static int ioda_eeh_get_state(struct eeh_pe *pe) return EEH_STATE_NOT_SUPPORT; } + /* + * If we're in middle of PE reset, return normal + * state to keep EEH core going. For PHB reset, we + * still expect to have fenced PHB cleared with + * PHB reset. + */ + if (!(pe->type & EEH_PE_PHB) && + (pe->state & EEH_PE_RESET)) { + result = (EEH_STATE_MMIO_ACTIVE | + EEH_STATE_DMA_ACTIVE | + EEH_STATE_MMIO_ENABLED | + EEH_STATE_DMA_ENABLED); + return result; + } + /* Retrieve PE status through OPAL */ pe_no = pe->addr; ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, @@ -267,11 +316,14 @@ static int ioda_eeh_get_state(struct eeh_pe *pe) result = 0; result &= ~EEH_STATE_RESET_ACTIVE; - if (pcierr != OPAL_EEH_PHB_ERROR) { + if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) { result |= EEH_STATE_MMIO_ACTIVE; result |= EEH_STATE_DMA_ACTIVE; result |= EEH_STATE_MMIO_ENABLED; result |= EEH_STATE_DMA_ENABLED; + } else if (!(pe->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + ioda_eeh_phb_diag(hose); } return result; @@ -315,53 +367,16 @@ static int ioda_eeh_get_state(struct eeh_pe *pe) __func__, fstate, hose->global_number, pe_no); } - return result; -} - -static int ioda_eeh_pe_clear(struct eeh_pe *pe) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - u32 pe_no; - u8 fstate; - u16 pcierr; - s64 ret; - - pe_no = pe->addr; - hose = pe->phb; - phb = pe->phb->private_data; - - /* Clear the EEH error on the PE */ - ret = opal_pci_eeh_freeze_clear(phb->opal_id, - pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); - if (ret) { - pr_err("%s: Failed to clear EEH error for " - "PHB#%x-PE#%x, err=%lld\n", - __func__, hose->global_number, pe_no, ret); - return -EIO; + /* Dump PHB diag-data for frozen PE */ + if (result != EEH_STATE_NOT_SUPPORT && + (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) != + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) && + !(pe->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + ioda_eeh_phb_diag(hose); } - /* - * Read the PE state back and verify that the frozen - * state has been removed. - */ - ret = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, - &fstate, &pcierr, NULL); - if (ret) { - pr_err("%s: Failed to get EEH status on " - "PHB#%x-PE#%x\n, err=%lld\n", - __func__, hose->global_number, pe_no, ret); - return -EIO; - } - - if (fstate != OPAL_EEH_STOPPED_NOT_FROZEN) { - pr_err("%s: Frozen state not cleared on " - "PHB#%x-PE#%x, sts=%x\n", - __func__, hose->global_number, pe_no, fstate); - return -EIO; - } - - return 0; + return result; } static s64 ioda_eeh_phb_poll(struct pnv_phb *phb) @@ -373,13 +388,16 @@ static s64 ioda_eeh_phb_poll(struct pnv_phb *phb) if (rc <= 0) break; - msleep(rc); + if (system_state < SYSTEM_RUNNING) + udelay(1000 * rc); + else + msleep(rc); } return rc; } -static int ioda_eeh_phb_reset(struct pci_controller *hose, int option) +int ioda_eeh_phb_reset(struct pci_controller *hose, int option) { struct pnv_phb *phb = hose->private_data; s64 rc = OPAL_HARDWARE; @@ -402,9 +420,17 @@ static int ioda_eeh_phb_reset(struct pci_controller *hose, int option) /* * Poll state of the PHB until the request is done - * successfully. + * successfully. The PHB reset is usually PHB complete + * reset followed by hot reset on root bus. So we also + * need the PCI bus settlement delay. */ rc = ioda_eeh_phb_poll(phb); + if (option == EEH_RESET_DEACTIVATE) { + if (system_state < SYSTEM_RUNNING) + udelay(1000 * EEH_PE_RST_SETTLE_TIME); + else + msleep(EEH_PE_RST_SETTLE_TIME); + } out: if (rc != OPAL_SUCCESS) return -EIO; @@ -442,6 +468,8 @@ static int ioda_eeh_root_reset(struct pci_controller *hose, int option) /* Poll state of the PHB until the request is done */ rc = ioda_eeh_phb_poll(phb); + if (option == EEH_RESET_DEACTIVATE) + msleep(EEH_PE_RST_SETTLE_TIME); out: if (rc != OPAL_SUCCESS) return -EIO; @@ -449,32 +477,71 @@ out: return 0; } -static int ioda_eeh_bridge_reset(struct pci_controller *hose, - struct pci_dev *dev, int option) +static int ioda_eeh_bridge_reset(struct pci_dev *dev, int option) + { - u16 ctrl; + struct device_node *dn = pci_device_to_OF_node(dev); + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + int aer = edev ? edev->aer_cap : 0; + u32 ctrl; - pr_debug("%s: Reset device %04x:%02x:%02x.%01x with option %d\n", - __func__, hose->global_number, dev->bus->number, - PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), option); + pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", + __func__, pci_domain_nr(dev->bus), + dev->bus->number, option); switch (option) { case EEH_RESET_FUNDAMENTAL: case EEH_RESET_HOT: - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl); + /* Don't report linkDown event */ + if (aer) { + eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, &ctrl); + ctrl |= PCI_ERR_UNC_SURPDN; + eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, ctrl); + } + + eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl |= PCI_BRIDGE_CTL_BUS_RESET; - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl); + msleep(EEH_PE_RST_HOLD_TIME); + break; case EEH_RESET_DEACTIVATE: - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl); + eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + eeh_ops->write_config(dn, PCI_BRIDGE_CONTROL, 2, ctrl); + msleep(EEH_PE_RST_SETTLE_TIME); + + /* Continue reporting linkDown event */ + if (aer) { + eeh_ops->read_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, &ctrl); + ctrl &= ~PCI_ERR_UNC_SURPDN; + eeh_ops->write_config(dn, aer + PCI_ERR_UNCOR_MASK, + 4, ctrl); + } + break; } return 0; } +void pnv_pci_reset_secondary_bus(struct pci_dev *dev) +{ + struct pci_controller *hose; + + if (pci_is_root_bus(dev->bus)) { + hose = pci_bus_to_host(dev->bus); + ioda_eeh_root_reset(hose, EEH_RESET_HOT); + ioda_eeh_root_reset(hose, EEH_RESET_DEACTIVATE); + } else { + ioda_eeh_bridge_reset(dev, EEH_RESET_HOT); + ioda_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); + } +} + /** * ioda_eeh_reset - Reset the indicated PE * @pe: EEH PE @@ -490,106 +557,38 @@ static int ioda_eeh_bridge_reset(struct pci_controller *hose, static int ioda_eeh_reset(struct eeh_pe *pe, int option) { struct pci_controller *hose = pe->phb; - struct eeh_dev *edev; - struct pci_dev *dev; + struct pci_bus *bus; int ret; /* - * Anyway, we have to clear the problematic state for the - * corresponding PE. However, we needn't do it if the PE - * is PHB associated. That means the PHB is having fatal - * errors and it needs reset. Further more, the AIB interface - * isn't reliable any more. - */ - if (!(pe->type & EEH_PE_PHB) && - (option == EEH_RESET_HOT || - option == EEH_RESET_FUNDAMENTAL)) { - ret = ioda_eeh_pe_clear(pe); - if (ret) - return -EIO; - } - - /* - * The rules applied to reset, either fundamental or hot reset: + * For PHB reset, we always have complete reset. For those PEs whose + * primary bus derived from root complex (root bus) or root port + * (usually bus#1), we apply hot or fundamental reset on the root port. + * For other PEs, we always have hot reset on the PE primary bus. * - * We always reset the direct upstream bridge of the PE. If the - * direct upstream bridge isn't root bridge, we always take hot - * reset no matter what option (fundamental or hot) is. Otherwise, - * we should do the reset according to the required option. + * Here, we have different design to pHyp, which always clear the + * frozen state during PE reset. However, the good idea here from + * benh is to keep frozen state before we get PE reset done completely + * (until BAR restore). With the frozen state, HW drops illegal IO + * or MMIO access, which can incur recrusive frozen PE during PE + * reset. The side effect is that EEH core has to clear the frozen + * state explicitly after BAR restore. */ if (pe->type & EEH_PE_PHB) { ret = ioda_eeh_phb_reset(hose, option); } else { - if (pe->type & EEH_PE_DEVICE) { - /* - * If it's device PE, we didn't refer to the parent - * PCI bus yet. So we have to figure it out indirectly. - */ - edev = list_first_entry(&pe->edevs, - struct eeh_dev, list); - dev = eeh_dev_to_pci_dev(edev); - dev = dev->bus->self; - } else { - /* - * If it's bus PE, the parent PCI bus is already there - * and just pick it up. - */ - dev = pe->bus->self; - } - - /* - * Do reset based on the fact that the direct upstream bridge - * is root bridge (port) or not. - */ - if (dev->bus->number == 0) + bus = eeh_pe_bus_get(pe); + if (pci_is_root_bus(bus) || + pci_is_root_bus(bus->parent)) ret = ioda_eeh_root_reset(hose, option); else - ret = ioda_eeh_bridge_reset(hose, dev, option); + ret = ioda_eeh_bridge_reset(bus->self, option); } return ret; } /** - * ioda_eeh_get_log - Retrieve error log - * @pe: EEH PE - * @severity: Severity level of the log - * @drv_log: buffer to store the log - * @len: space of the log buffer - * - * The function is used to retrieve error log from P7IOC. - */ -static int ioda_eeh_get_log(struct eeh_pe *pe, int severity, - char *drv_log, unsigned long len) -{ - s64 ret; - unsigned long flags; - struct pci_controller *hose = pe->phb; - struct pnv_phb *phb = hose->private_data; - - spin_lock_irqsave(&phb->lock, flags); - - ret = opal_pci_get_phb_diag_data2(phb->opal_id, - phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); - if (ret) { - spin_unlock_irqrestore(&phb->lock, flags); - pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n", - __func__, hose->global_number, pe->addr, ret); - return -EIO; - } - - /* - * FIXME: We probably need log the error in somewhere. - * Lets make it up in future. - */ - /* pr_info("%s", phb->diag.blob); */ - - spin_unlock_irqrestore(&phb->lock, flags); - - return 0; -} - -/** * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE * @pe: EEH PE * @@ -670,183 +669,6 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose) } } -static void ioda_eeh_p7ioc_phb_diag(struct pci_controller *hose, - struct OpalIoPhbErrorCommon *common) -{ - struct OpalIoP7IOCPhbErrorData *data; - int i; - - data = (struct OpalIoP7IOCPhbErrorData *)common; - - pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n\n", - hose->global_number, common->version); - - pr_info(" brdgCtl: %08x\n", data->brdgCtl); - - pr_info(" portStatusReg: %08x\n", data->portStatusReg); - pr_info(" rootCmplxStatus: %08x\n", data->rootCmplxStatus); - pr_info(" busAgentStatus: %08x\n", data->busAgentStatus); - - pr_info(" deviceStatus: %08x\n", data->deviceStatus); - pr_info(" slotStatus: %08x\n", data->slotStatus); - pr_info(" linkStatus: %08x\n", data->linkStatus); - pr_info(" devCmdStatus: %08x\n", data->devCmdStatus); - pr_info(" devSecStatus: %08x\n", data->devSecStatus); - - pr_info(" rootErrorStatus: %08x\n", data->rootErrorStatus); - pr_info(" uncorrErrorStatus: %08x\n", data->uncorrErrorStatus); - pr_info(" corrErrorStatus: %08x\n", data->corrErrorStatus); - pr_info(" tlpHdr1: %08x\n", data->tlpHdr1); - pr_info(" tlpHdr2: %08x\n", data->tlpHdr2); - pr_info(" tlpHdr3: %08x\n", data->tlpHdr3); - pr_info(" tlpHdr4: %08x\n", data->tlpHdr4); - pr_info(" sourceId: %08x\n", data->sourceId); - - pr_info(" errorClass: %016llx\n", data->errorClass); - pr_info(" correlator: %016llx\n", data->correlator); - pr_info(" p7iocPlssr: %016llx\n", data->p7iocPlssr); - pr_info(" p7iocCsr: %016llx\n", data->p7iocCsr); - pr_info(" lemFir: %016llx\n", data->lemFir); - pr_info(" lemErrorMask: %016llx\n", data->lemErrorMask); - pr_info(" lemWOF: %016llx\n", data->lemWOF); - pr_info(" phbErrorStatus: %016llx\n", data->phbErrorStatus); - pr_info(" phbFirstErrorStatus: %016llx\n", data->phbFirstErrorStatus); - pr_info(" phbErrorLog0: %016llx\n", data->phbErrorLog0); - pr_info(" phbErrorLog1: %016llx\n", data->phbErrorLog1); - pr_info(" mmioErrorStatus: %016llx\n", data->mmioErrorStatus); - pr_info(" mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus); - pr_info(" mmioErrorLog0: %016llx\n", data->mmioErrorLog0); - pr_info(" mmioErrorLog1: %016llx\n", data->mmioErrorLog1); - pr_info(" dma0ErrorStatus: %016llx\n", data->dma0ErrorStatus); - pr_info(" dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus); - pr_info(" dma0ErrorLog0: %016llx\n", data->dma0ErrorLog0); - pr_info(" dma0ErrorLog1: %016llx\n", data->dma0ErrorLog1); - pr_info(" dma1ErrorStatus: %016llx\n", data->dma1ErrorStatus); - pr_info(" dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus); - pr_info(" dma1ErrorLog0: %016llx\n", data->dma1ErrorLog0); - pr_info(" dma1ErrorLog1: %016llx\n", data->dma1ErrorLog1); - - for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { - if ((data->pestA[i] >> 63) == 0 && - (data->pestB[i] >> 63) == 0) - continue; - - pr_info(" PE[%3d] PESTA: %016llx\n", i, data->pestA[i]); - pr_info(" PESTB: %016llx\n", data->pestB[i]); - } -} - -static void ioda_eeh_phb3_phb_diag(struct pci_controller *hose, - struct OpalIoPhbErrorCommon *common) -{ - struct OpalIoPhb3ErrorData *data; - int i; - - data = (struct OpalIoPhb3ErrorData*)common; - pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n\n", - hose->global_number, common->version); - - pr_info(" brdgCtl: %08x\n", data->brdgCtl); - - pr_info(" portStatusReg: %08x\n", data->portStatusReg); - pr_info(" rootCmplxStatus: %08x\n", data->rootCmplxStatus); - pr_info(" busAgentStatus: %08x\n", data->busAgentStatus); - - pr_info(" deviceStatus: %08x\n", data->deviceStatus); - pr_info(" slotStatus: %08x\n", data->slotStatus); - pr_info(" linkStatus: %08x\n", data->linkStatus); - pr_info(" devCmdStatus: %08x\n", data->devCmdStatus); - pr_info(" devSecStatus: %08x\n", data->devSecStatus); - - pr_info(" rootErrorStatus: %08x\n", data->rootErrorStatus); - pr_info(" uncorrErrorStatus: %08x\n", data->uncorrErrorStatus); - pr_info(" corrErrorStatus: %08x\n", data->corrErrorStatus); - pr_info(" tlpHdr1: %08x\n", data->tlpHdr1); - pr_info(" tlpHdr2: %08x\n", data->tlpHdr2); - pr_info(" tlpHdr3: %08x\n", data->tlpHdr3); - pr_info(" tlpHdr4: %08x\n", data->tlpHdr4); - pr_info(" sourceId: %08x\n", data->sourceId); - pr_info(" errorClass: %016llx\n", data->errorClass); - pr_info(" correlator: %016llx\n", data->correlator); - pr_info(" nFir: %016llx\n", data->nFir); - pr_info(" nFirMask: %016llx\n", data->nFirMask); - pr_info(" nFirWOF: %016llx\n", data->nFirWOF); - pr_info(" PhbPlssr: %016llx\n", data->phbPlssr); - pr_info(" PhbCsr: %016llx\n", data->phbCsr); - pr_info(" lemFir: %016llx\n", data->lemFir); - pr_info(" lemErrorMask: %016llx\n", data->lemErrorMask); - pr_info(" lemWOF: %016llx\n", data->lemWOF); - pr_info(" phbErrorStatus: %016llx\n", data->phbErrorStatus); - pr_info(" phbFirstErrorStatus: %016llx\n", data->phbFirstErrorStatus); - pr_info(" phbErrorLog0: %016llx\n", data->phbErrorLog0); - pr_info(" phbErrorLog1: %016llx\n", data->phbErrorLog1); - pr_info(" mmioErrorStatus: %016llx\n", data->mmioErrorStatus); - pr_info(" mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus); - pr_info(" mmioErrorLog0: %016llx\n", data->mmioErrorLog0); - pr_info(" mmioErrorLog1: %016llx\n", data->mmioErrorLog1); - pr_info(" dma0ErrorStatus: %016llx\n", data->dma0ErrorStatus); - pr_info(" dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus); - pr_info(" dma0ErrorLog0: %016llx\n", data->dma0ErrorLog0); - pr_info(" dma0ErrorLog1: %016llx\n", data->dma0ErrorLog1); - pr_info(" dma1ErrorStatus: %016llx\n", data->dma1ErrorStatus); - pr_info(" dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus); - pr_info(" dma1ErrorLog0: %016llx\n", data->dma1ErrorLog0); - pr_info(" dma1ErrorLog1: %016llx\n", data->dma1ErrorLog1); - - for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { - if ((data->pestA[i] >> 63) == 0 && - (data->pestB[i] >> 63) == 0) - continue; - - pr_info(" PE[%3d] PESTA: %016llx\n", i, data->pestA[i]); - pr_info(" PESTB: %016llx\n", data->pestB[i]); - } -} - -static void ioda_eeh_phb_diag(struct pci_controller *hose) -{ - struct pnv_phb *phb = hose->private_data; - struct OpalIoPhbErrorCommon *common; - long rc; - - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, - PNV_PCI_DIAG_BUF_SIZE); - if (rc != OPAL_SUCCESS) { - pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", - __func__, hose->global_number, rc); - return; - } - - common = (struct OpalIoPhbErrorCommon *)phb->diag.blob; - switch (common->ioType) { - case OPAL_PHB_ERROR_DATA_TYPE_P7IOC: - ioda_eeh_p7ioc_phb_diag(hose, common); - break; - case OPAL_PHB_ERROR_DATA_TYPE_PHB3: - ioda_eeh_phb3_phb_diag(hose, common); - break; - default: - pr_warning("%s: Unrecognized I/O chip %d\n", - __func__, common->ioType); - } -} - -static int ioda_eeh_get_phb_pe(struct pci_controller *hose, - struct eeh_pe **pe) -{ - struct eeh_pe *phb_pe; - - phb_pe = eeh_phb_pe_get(hose); - if (!phb_pe) { - pr_warning("%s Can't find PE for PHB#%d\n", - __func__, hose->global_number); - return -EEXIST; - } - - *pe = phb_pe; - return 0; -} - static int ioda_eeh_get_pe(struct pci_controller *hose, u16 pe_no, struct eeh_pe **pe) { @@ -854,7 +676,8 @@ static int ioda_eeh_get_pe(struct pci_controller *hose, struct eeh_dev dev; /* Find the PHB PE */ - if (ioda_eeh_get_phb_pe(hose, &phb_pe)) + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) return -EEXIST; /* Find the PE according to PE# */ @@ -862,11 +685,7 @@ static int ioda_eeh_get_pe(struct pci_controller *hose, dev.phb = hose; dev.pe_config_addr = pe_no; dev_pe = eeh_pe_get(&dev); - if (!dev_pe) { - pr_warning("%s: Can't find PE for PHB#%x - PE#%x\n", - __func__, hose->global_number, pe_no); - return -EEXIST; - } + if (!dev_pe) return -EEXIST; *pe = dev_pe; return 0; @@ -884,28 +703,32 @@ static int ioda_eeh_get_pe(struct pci_controller *hose, */ static int ioda_eeh_next_error(struct eeh_pe **pe) { - struct pci_controller *hose, *tmp; + struct pci_controller *hose; struct pnv_phb *phb; - u64 frozen_pe_no; - u16 err_type, severity; + struct eeh_pe *phb_pe, *parent_pe; + __be64 frozen_pe_no; + __be16 err_type, severity; + int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); long rc; - int ret = 1; + int state, ret = EEH_NEXT_ERR_NONE; /* * While running here, it's safe to purge the event queue. * And we should keep the cached OPAL notifier event sychronized * between the kernel and firmware. */ - eeh_remove_event(NULL); + eeh_remove_event(NULL, false); opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + list_for_each_entry(hose, &hose_list, list_node) { /* * If the subordinate PCI buses of the PHB has been - * removed, we needn't take care of it any more. + * removed or is exactly under error recovery, we + * needn't take care of it any more. */ phb = hose->private_data; - if (phb->eeh_state & PNV_EEH_STATE_REMOVED) + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) continue; rc = opal_pci_next_error(phb->opal_id, @@ -920,8 +743,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) } /* If the PHB doesn't have error, stop processing */ - if (err_type == OPAL_EEH_NO_ERROR || - severity == OPAL_EEH_SEV_NO_ERROR) { + if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR || + be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) { pr_devel("%s: No error found on PHB#%x\n", __func__, hose->global_number); continue; @@ -933,66 +756,127 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) * specific PHB. */ pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n", - __func__, err_type, severity, - frozen_pe_no, hose->global_number); - switch (err_type) { + __func__, be16_to_cpu(err_type), be16_to_cpu(severity), + be64_to_cpu(frozen_pe_no), hose->global_number); + switch (be16_to_cpu(err_type)) { case OPAL_EEH_IOC_ERROR: - if (severity == OPAL_EEH_SEV_IOC_DEAD) { - list_for_each_entry_safe(hose, tmp, - &hose_list, list_node) { - phb = hose->private_data; - phb->eeh_state |= PNV_EEH_STATE_REMOVED; - } - + if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) { pr_err("EEH: dead IOC detected\n"); - ret = 4; - goto out; - } else if (severity == OPAL_EEH_SEV_INF) { + ret = EEH_NEXT_ERR_DEAD_IOC; + } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { pr_info("EEH: IOC informative error " "detected\n"); ioda_eeh_hub_diag(hose); + ret = EEH_NEXT_ERR_NONE; } break; case OPAL_EEH_PHB_ERROR: - if (severity == OPAL_EEH_SEV_PHB_DEAD) { - if (ioda_eeh_get_phb_pe(hose, pe)) - break; - - pr_err("EEH: dead PHB#%x detected\n", - hose->global_number); - phb->eeh_state |= PNV_EEH_STATE_REMOVED; - ret = 3; - goto out; - } else if (severity == OPAL_EEH_SEV_PHB_FENCED) { - if (ioda_eeh_get_phb_pe(hose, pe)) - break; - - pr_err("EEH: fenced PHB#%x detected\n", - hose->global_number); - ret = 2; - goto out; - } else if (severity == OPAL_EEH_SEV_INF) { + if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) { + *pe = phb_pe; + pr_err("EEH: dead PHB#%x detected, " + "location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_DEAD_PHB; + } else if (be16_to_cpu(severity) == + OPAL_EEH_SEV_PHB_FENCED) { + *pe = phb_pe; + pr_err("EEH: Fenced PHB#%x detected, " + "location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_FENCED_PHB; + } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { pr_info("EEH: PHB#%x informative error " - "detected\n", - hose->global_number); + "detected, location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); ioda_eeh_phb_diag(hose); + ret = EEH_NEXT_ERR_NONE; } break; case OPAL_EEH_PE_ERROR: - if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) - break; + /* + * If we can't find the corresponding PE, we + * just try to unfreeze. + */ + if (ioda_eeh_get_pe(hose, + be64_to_cpu(frozen_pe_no), pe)) { + /* Try best to clear it */ + pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n", + hose->global_number, frozen_pe_no); + pr_info("EEH: PHB location: %s\n", + eeh_pe_loc_get(phb_pe)); + opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + ret = EEH_NEXT_ERR_NONE; + } else if ((*pe)->state & EEH_PE_ISOLATED) { + ret = EEH_NEXT_ERR_NONE; + } else { + pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", + (*pe)->addr, (*pe)->phb->global_number); + pr_err("EEH: PE location: %s, PHB location: %s\n", + eeh_pe_loc_get(*pe), eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_FROZEN_PE; + } + + break; + default: + pr_warn("%s: Unexpected error type %d\n", + __func__, be16_to_cpu(err_type)); + } + + /* + * EEH core will try recover from fenced PHB or + * frozen PE. In the time for frozen PE, EEH core + * enable IO path for that before collecting logs, + * but it ruins the site. So we have to dump the + * log in advance here. + */ + if ((ret == EEH_NEXT_ERR_FROZEN_PE || + ret == EEH_NEXT_ERR_FENCED_PHB) && + !((*pe)->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + ioda_eeh_phb_diag(hose); + } + + /* + * We probably have the frozen parent PE out there and + * we need have to handle frozen parent PE firstly. + */ + if (ret == EEH_NEXT_ERR_FROZEN_PE) { + parent_pe = (*pe)->parent; + while (parent_pe) { + /* Hit the ceiling ? */ + if (parent_pe->type & EEH_PE_PHB) + break; + + /* Frozen parent PE ? */ + state = ioda_eeh_get_state(parent_pe); + if (state > 0 && + (state & active_flags) != active_flags) + *pe = parent_pe; + + /* Next parent level */ + parent_pe = parent_pe->parent; + } - pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", - (*pe)->addr, (*pe)->phb->global_number); - ret = 1; - goto out; + /* We possibly migrate to another PE */ + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); } + + /* + * If we have no errors on the specific PHB or only + * informative error there, we continue poking it. + * Otherwise, we need actions to be taken by upper + * layer. + */ + if (ret > EEH_NEXT_ERR_INF) + break; } - ret = 0; -out: return ret; } @@ -1001,7 +885,6 @@ struct pnv_eeh_ops ioda_eeh_ops = { .set_option = ioda_eeh_set_option, .get_state = ioda_eeh_get_state, .reset = ioda_eeh_reset, - .get_log = ioda_eeh_get_log, .configure_bridge = ioda_eeh_configure_bridge, .next_error = ioda_eeh_next_error }; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 73b981438cc5..56a206f32f77 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -126,6 +126,7 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) edev->mode &= 0xFFFFFF00; if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) edev->mode |= EEH_DEV_BRIDGE; + edev->pcix_cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (pci_is_pcie(dev)) { edev->pcie_cap = pci_pcie_cap(dev); @@ -133,6 +134,9 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) edev->mode |= EEH_DEV_ROOT_PORT; else if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM) edev->mode |= EEH_DEV_DS_PORT; + + edev->aer_cap = pci_find_ext_capability(dev, + PCI_EXT_CAP_ID_ERR); } edev->config_addr = ((dev->bus->number << 8) | dev->devfn); @@ -145,7 +149,7 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) * Enable EEH explicitly so that we will do EEH check * while accessing I/O stuff */ - eeh_subsystem_enabled = 1; + eeh_set_enable(true); /* Save memory bars */ eeh_save_bars(edev); @@ -344,6 +348,27 @@ static int powernv_eeh_next_error(struct eeh_pe **pe) return -EEXIST; } +static int powernv_eeh_restore_config(struct device_node *dn) +{ + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + struct pnv_phb *phb; + s64 ret; + + if (!edev) + return -EEXIST; + + phb = edev->phb->private_data; + ret = opal_pci_reinit(phb->opal_id, + OPAL_REINIT_PCI_DEV, edev->config_addr); + if (ret) { + pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", + __func__, edev->config_addr, ret); + return -EIO; + } + + return 0; +} + static struct eeh_ops powernv_eeh_ops = { .name = "powernv", .init = powernv_eeh_init, @@ -359,7 +384,8 @@ static struct eeh_ops powernv_eeh_ops = { .configure_bridge = powernv_eeh_configure_bridge, .read_config = pnv_pci_cfg_read, .write_config = pnv_pci_cfg_write, - .next_error = powernv_eeh_next_error + .next_error = powernv_eeh_next_error, + .restore_config = powernv_eeh_restore_config }; /** diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c new file mode 100644 index 000000000000..32e2adfa5320 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -0,0 +1,204 @@ +/* + * PowerNV OPAL asynchronous completion interfaces + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/gfp.h> +#include <linux/of.h> +#include <asm/opal.h> + +#define N_ASYNC_COMPLETIONS 64 + +static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; +static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); +static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); +static DEFINE_SPINLOCK(opal_async_comp_lock); +static struct semaphore opal_async_sem; +static struct opal_msg *opal_async_responses; +static unsigned int opal_max_async_tokens; + +int __opal_async_get_token(void) +{ + unsigned long flags; + int token; + + spin_lock_irqsave(&opal_async_comp_lock, flags); + token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); + if (token >= opal_max_async_tokens) { + token = -EBUSY; + goto out; + } + + if (__test_and_set_bit(token, opal_async_token_map)) { + token = -EBUSY; + goto out; + } + + __clear_bit(token, opal_async_complete_map); + +out: + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + return token; +} + +int opal_async_get_token_interruptible(void) +{ + int token; + + /* Wait until a token is available */ + if (down_interruptible(&opal_async_sem)) + return -ERESTARTSYS; + + token = __opal_async_get_token(); + if (token < 0) + up(&opal_async_sem); + + return token; +} + +int __opal_async_release_token(int token) +{ + unsigned long flags; + + if (token < 0 || token >= opal_max_async_tokens) { + pr_err("%s: Passed token is out of range, token %d\n", + __func__, token); + return -EINVAL; + } + + spin_lock_irqsave(&opal_async_comp_lock, flags); + __set_bit(token, opal_async_complete_map); + __clear_bit(token, opal_async_token_map); + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + + return 0; +} + +int opal_async_release_token(int token) +{ + int ret; + + ret = __opal_async_release_token(token); + if (ret) + return ret; + + up(&opal_async_sem); + + return 0; +} + +int opal_async_wait_response(uint64_t token, struct opal_msg *msg) +{ + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); + memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + + return 0; +} + +static int opal_async_comp_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + struct opal_msg *comp_msg = msg; + unsigned long flags; + uint64_t token; + + if (msg_type != OPAL_MSG_ASYNC_COMP) + return 0; + + token = be64_to_cpu(comp_msg->params[0]); + memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); + spin_lock_irqsave(&opal_async_comp_lock, flags); + __set_bit(token, opal_async_complete_map); + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + + wake_up(&opal_async_wait); + + return 0; +} + +static struct notifier_block opal_async_comp_nb = { + .notifier_call = opal_async_comp_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_async_comp_init(void) +{ + struct device_node *opal_node; + const __be32 *async; + int err; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_err("%s: Opal node not found\n", __func__); + err = -ENOENT; + goto out; + } + + async = of_get_property(opal_node, "opal-msg-async-num", NULL); + if (!async) { + pr_err("%s: %s has no opal-msg-async-num\n", + __func__, opal_node->full_name); + err = -ENOENT; + goto out_opal_node; + } + + opal_max_async_tokens = be32_to_cpup(async); + if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) + opal_max_async_tokens = N_ASYNC_COMPLETIONS; + + err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, + &opal_async_comp_nb); + if (err) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, err); + goto out_opal_node; + } + + opal_async_responses = kzalloc( + sizeof(*opal_async_responses) * opal_max_async_tokens, + GFP_KERNEL); + if (!opal_async_responses) { + pr_err("%s: Out of memory, failed to do asynchronous " + "completion init\n", __func__); + err = -ENOMEM; + goto out_opal_node; + } + + /* Initialize to 1 less than the maximum tokens available, as we may + * require to pop one during emergency through synchronous call to + * __opal_async_get_token() + */ + sema_init(&opal_async_sem, opal_max_async_tokens - 1); + +out_opal_node: + of_node_put(opal_node); +out: + return err; +} +subsys_initcall(opal_async_comp_init); diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c new file mode 100644 index 000000000000..788a1977b9a5 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -0,0 +1,448 @@ +/* + * PowerNV OPAL Dump Interface + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/delay.h> + +#include <asm/opal.h> + +#define DUMP_TYPE_FSP 0x01 + +struct dump_obj { + struct kobject kobj; + struct bin_attribute dump_attr; + uint32_t id; /* becomes object name */ + uint32_t type; + uint32_t size; + char *buffer; +}; +#define to_dump_obj(x) container_of(x, struct dump_obj, kobj) + +struct dump_attribute { + struct attribute attr; + ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr, + char *buf); + ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr, + const char *buf, size_t count); +}; +#define to_dump_attr(x) container_of(x, struct dump_attribute, attr) + +static ssize_t dump_id_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%x\n", dump_obj->id); +} + +static const char* dump_type_to_string(uint32_t type) +{ + switch (type) { + case 0x01: return "SP Dump"; + case 0x02: return "System/Platform Dump"; + case 0x03: return "SMA Dump"; + default: return "unknown"; + } +} + +static ssize_t dump_type_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + + return sprintf(buf, "0x%x %s\n", dump_obj->type, + dump_type_to_string(dump_obj->type)); +} + +static ssize_t dump_ack_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "ack - acknowledge dump\n"); +} + +/* + * Send acknowledgement to OPAL + */ +static int64_t dump_send_ack(uint32_t dump_id) +{ + int rc; + + rc = opal_dump_ack(dump_id); + if (rc) + pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n", + __func__, dump_id, rc); + return rc; +} + +static ssize_t dump_ack_store(struct dump_obj *dump_obj, + struct dump_attribute *attr, + const char *buf, + size_t count) +{ + dump_send_ack(dump_obj->id); + sysfs_remove_file_self(&dump_obj->kobj, &attr->attr); + kobject_put(&dump_obj->kobj); + return count; +} + +/* Attributes of a dump + * The binary attribute of the dump itself is dynamic + * due to the dynamic size of the dump + */ +static struct dump_attribute id_attribute = + __ATTR(id, 0666, dump_id_show, NULL); +static struct dump_attribute type_attribute = + __ATTR(type, 0666, dump_type_show, NULL); +static struct dump_attribute ack_attribute = + __ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store); + +static ssize_t init_dump_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "1 - initiate dump\n"); +} + +static int64_t dump_fips_init(uint8_t type) +{ + int rc; + + rc = opal_dump_init(type); + if (rc) + pr_warn("%s: Failed to initiate FipS dump (%d)\n", + __func__, rc); + return rc; +} + +static ssize_t init_dump_store(struct dump_obj *dump_obj, + struct dump_attribute *attr, + const char *buf, + size_t count) +{ + dump_fips_init(DUMP_TYPE_FSP); + pr_info("%s: Initiated FSP dump\n", __func__); + return count; +} + +static struct dump_attribute initiate_attribute = + __ATTR(initiate_dump, 0600, init_dump_show, init_dump_store); + +static struct attribute *initiate_attrs[] = { + &initiate_attribute.attr, + NULL, +}; + +static struct attribute_group initiate_attr_group = { + .attrs = initiate_attrs, +}; + +static struct kset *dump_kset; + +static ssize_t dump_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct dump_attribute *attribute; + struct dump_obj *dump; + + attribute = to_dump_attr(attr); + dump = to_dump_obj(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(dump, attribute, buf); +} + +static ssize_t dump_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct dump_attribute *attribute; + struct dump_obj *dump; + + attribute = to_dump_attr(attr); + dump = to_dump_obj(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(dump, attribute, buf, len); +} + +static const struct sysfs_ops dump_sysfs_ops = { + .show = dump_attr_show, + .store = dump_attr_store, +}; + +static void dump_release(struct kobject *kobj) +{ + struct dump_obj *dump; + + dump = to_dump_obj(kobj); + vfree(dump->buffer); + kfree(dump); +} + +static struct attribute *dump_default_attrs[] = { + &id_attribute.attr, + &type_attribute.attr, + &ack_attribute.attr, + NULL, +}; + +static struct kobj_type dump_ktype = { + .sysfs_ops = &dump_sysfs_ops, + .release = &dump_release, + .default_attrs = dump_default_attrs, +}; + +static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type) +{ + __be32 id, size, type; + int rc; + + type = cpu_to_be32(0xffffffff); + + rc = opal_dump_info2(&id, &size, &type); + if (rc == OPAL_PARAMETER) + rc = opal_dump_info(&id, &size); + + *dump_id = be32_to_cpu(id); + *dump_size = be32_to_cpu(size); + *dump_type = be32_to_cpu(type); + + if (rc) + pr_warn("%s: Failed to get dump info (%d)\n", + __func__, rc); + return rc; +} + +static int64_t dump_read_data(struct dump_obj *dump) +{ + struct opal_sg_list *list; + uint64_t addr; + int64_t rc; + + /* Allocate memory */ + dump->buffer = vzalloc(PAGE_ALIGN(dump->size)); + if (!dump->buffer) { + pr_err("%s : Failed to allocate memory\n", __func__); + rc = -ENOMEM; + goto out; + } + + /* Generate SG list */ + list = opal_vmalloc_to_sg_list(dump->buffer, dump->size); + if (!list) { + rc = -ENOMEM; + goto out; + } + + /* First entry address */ + addr = __pa(list); + + /* Fetch data */ + rc = OPAL_BUSY_EVENT; + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_dump_read(dump->id, addr); + if (rc == OPAL_BUSY_EVENT) { + opal_poll_events(NULL); + msleep(20); + } + } + + if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) + pr_warn("%s: Extract dump failed for ID 0x%x\n", + __func__, dump->id); + + /* Free SG list */ + opal_free_sg_list(list); + +out: + return rc; +} + +static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + ssize_t rc; + + struct dump_obj *dump = to_dump_obj(kobj); + + if (!dump->buffer) { + rc = dump_read_data(dump); + + if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) { + vfree(dump->buffer); + dump->buffer = NULL; + + return -EIO; + } + if (rc == OPAL_PARTIAL) { + /* On a partial read, we just return EIO + * and rely on userspace to ask us to try + * again. + */ + pr_info("%s: Platform dump partially read.ID = 0x%x\n", + __func__, dump->id); + return -EIO; + } + } + + memcpy(buffer, dump->buffer + pos, count); + + /* You may think we could free the dump buffer now and retrieve + * it again later if needed, but due to current firmware limitation, + * that's not the case. So, once read into userspace once, + * we keep the dump around until it's acknowledged by userspace. + */ + + return count; +} + +static struct dump_obj *create_dump_obj(uint32_t id, size_t size, + uint32_t type) +{ + struct dump_obj *dump; + int rc; + + dump = kzalloc(sizeof(*dump), GFP_KERNEL); + if (!dump) + return NULL; + + dump->kobj.kset = dump_kset; + + kobject_init(&dump->kobj, &dump_ktype); + + sysfs_bin_attr_init(&dump->dump_attr); + + dump->dump_attr.attr.name = "dump"; + dump->dump_attr.attr.mode = 0400; + dump->dump_attr.size = size; + dump->dump_attr.read = dump_attr_read; + + dump->id = id; + dump->size = size; + dump->type = type; + + rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id); + if (rc) { + kobject_put(&dump->kobj); + return NULL; + } + + rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr); + if (rc) { + kobject_put(&dump->kobj); + return NULL; + } + + pr_info("%s: New platform dump. ID = 0x%x Size %u\n", + __func__, dump->id, dump->size); + + kobject_uevent(&dump->kobj, KOBJ_ADD); + + return dump; +} + +static int process_dump(void) +{ + int rc; + uint32_t dump_id, dump_size, dump_type; + struct dump_obj *dump; + char name[22]; + + rc = dump_read_info(&dump_id, &dump_size, &dump_type); + if (rc != OPAL_SUCCESS) + return rc; + + sprintf(name, "0x%x-0x%x", dump_type, dump_id); + + /* we may get notified twice, let's handle + * that gracefully and not create two conflicting + * entries. + */ + if (kset_find_obj(dump_kset, name)) + return 0; + + dump = create_dump_obj(dump_id, dump_size, dump_type); + if (!dump) + return -1; + + return 0; +} + +static void dump_work_fn(struct work_struct *work) +{ + process_dump(); +} + +static DECLARE_WORK(dump_work, dump_work_fn); + +static void schedule_process_dump(void) +{ + schedule_work(&dump_work); +} + +/* + * New dump available notification + * + * Once we get notification, we add sysfs entries for it. + * We only fetch the dump on demand, and create sysfs asynchronously. + */ +static int dump_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + if (events & OPAL_EVENT_DUMP_AVAIL) + schedule_process_dump(); + + return 0; +} + +static struct notifier_block dump_nb = { + .notifier_call = dump_event, + .next = NULL, + .priority = 0 +}; + +void __init opal_platform_dump_init(void) +{ + int rc; + + dump_kset = kset_create_and_add("dump", NULL, opal_kobj); + if (!dump_kset) { + pr_warn("%s: Failed to create dump kset\n", __func__); + return; + } + + rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group); + if (rc) { + pr_warn("%s: Failed to create initiate dump attr group\n", + __func__); + kobject_put(&dump_kset->kobj); + return; + } + + rc = opal_notifier_register(&dump_nb); + if (rc) { + pr_warn("%s: Can't register OPAL event notifier (%d)\n", + __func__, rc); + return; + } + + opal_dump_resend_notification(); +} diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c new file mode 100644 index 000000000000..10268c41d830 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -0,0 +1,315 @@ +/* + * Error log support on PowerNV. + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/fcntl.h> +#include <linux/kobject.h> +#include <asm/uaccess.h> +#include <asm/opal.h> + +struct elog_obj { + struct kobject kobj; + struct bin_attribute raw_attr; + uint64_t id; + uint64_t type; + size_t size; + char *buffer; +}; +#define to_elog_obj(x) container_of(x, struct elog_obj, kobj) + +struct elog_attribute { + struct attribute attr; + ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr, + char *buf); + ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr, + const char *buf, size_t count); +}; +#define to_elog_attr(x) container_of(x, struct elog_attribute, attr) + +static ssize_t elog_id_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%llx\n", elog_obj->id); +} + +static const char *elog_type_to_string(uint64_t type) +{ + switch (type) { + case 0: return "PEL"; + default: return "unknown"; + } +} + +static ssize_t elog_type_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%llx %s\n", + elog_obj->type, + elog_type_to_string(elog_obj->type)); +} + +static ssize_t elog_ack_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "ack - acknowledge log message\n"); +} + +static ssize_t elog_ack_store(struct elog_obj *elog_obj, + struct elog_attribute *attr, + const char *buf, + size_t count) +{ + opal_send_ack_elog(elog_obj->id); + sysfs_remove_file_self(&elog_obj->kobj, &attr->attr); + kobject_put(&elog_obj->kobj); + return count; +} + +static struct elog_attribute id_attribute = + __ATTR(id, 0666, elog_id_show, NULL); +static struct elog_attribute type_attribute = + __ATTR(type, 0666, elog_type_show, NULL); +static struct elog_attribute ack_attribute = + __ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store); + +static struct kset *elog_kset; + +static ssize_t elog_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct elog_attribute *attribute; + struct elog_obj *elog; + + attribute = to_elog_attr(attr); + elog = to_elog_obj(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(elog, attribute, buf); +} + +static ssize_t elog_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct elog_attribute *attribute; + struct elog_obj *elog; + + attribute = to_elog_attr(attr); + elog = to_elog_obj(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(elog, attribute, buf, len); +} + +static const struct sysfs_ops elog_sysfs_ops = { + .show = elog_attr_show, + .store = elog_attr_store, +}; + +static void elog_release(struct kobject *kobj) +{ + struct elog_obj *elog; + + elog = to_elog_obj(kobj); + kfree(elog->buffer); + kfree(elog); +} + +static struct attribute *elog_default_attrs[] = { + &id_attribute.attr, + &type_attribute.attr, + &ack_attribute.attr, + NULL, +}; + +static struct kobj_type elog_ktype = { + .sysfs_ops = &elog_sysfs_ops, + .release = &elog_release, + .default_attrs = elog_default_attrs, +}; + +/* Maximum size of a single log on FSP is 16KB */ +#define OPAL_MAX_ERRLOG_SIZE 16384 + +static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + int opal_rc; + + struct elog_obj *elog = to_elog_obj(kobj); + + /* We may have had an error reading before, so let's retry */ + if (!elog->buffer) { + elog->buffer = kzalloc(elog->size, GFP_KERNEL); + if (!elog->buffer) + return -EIO; + + opal_rc = opal_read_elog(__pa(elog->buffer), + elog->size, elog->id); + if (opal_rc != OPAL_SUCCESS) { + pr_err("ELOG: log read failed for log-id=%llx\n", + elog->id); + kfree(elog->buffer); + elog->buffer = NULL; + return -EIO; + } + } + + memcpy(buffer, elog->buffer + pos, count); + + return count; +} + +static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) +{ + struct elog_obj *elog; + int rc; + + elog = kzalloc(sizeof(*elog), GFP_KERNEL); + if (!elog) + return NULL; + + elog->kobj.kset = elog_kset; + + kobject_init(&elog->kobj, &elog_ktype); + + sysfs_bin_attr_init(&elog->raw_attr); + + elog->raw_attr.attr.name = "raw"; + elog->raw_attr.attr.mode = 0400; + elog->raw_attr.size = size; + elog->raw_attr.read = raw_attr_read; + + elog->id = id; + elog->size = size; + elog->type = type; + + elog->buffer = kzalloc(elog->size, GFP_KERNEL); + + if (elog->buffer) { + rc = opal_read_elog(__pa(elog->buffer), + elog->size, elog->id); + if (rc != OPAL_SUCCESS) { + pr_err("ELOG: log read failed for log-id=%llx\n", + elog->id); + kfree(elog->buffer); + elog->buffer = NULL; + } + } + + rc = kobject_add(&elog->kobj, NULL, "0x%llx", id); + if (rc) { + kobject_put(&elog->kobj); + return NULL; + } + + rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr); + if (rc) { + kobject_put(&elog->kobj); + return NULL; + } + + kobject_uevent(&elog->kobj, KOBJ_ADD); + + return elog; +} + +static void elog_work_fn(struct work_struct *work) +{ + __be64 size; + __be64 id; + __be64 type; + uint64_t elog_size; + uint64_t log_id; + uint64_t elog_type; + int rc; + char name[2+16+1]; + + rc = opal_get_elog_size(&id, &size, &type); + if (rc != OPAL_SUCCESS) { + pr_err("ELOG: Opal log read failed\n"); + return; + } + + elog_size = be64_to_cpu(size); + log_id = be64_to_cpu(id); + elog_type = be64_to_cpu(type); + + BUG_ON(elog_size > OPAL_MAX_ERRLOG_SIZE); + + if (elog_size >= OPAL_MAX_ERRLOG_SIZE) + elog_size = OPAL_MAX_ERRLOG_SIZE; + + sprintf(name, "0x%llx", log_id); + + /* we may get notified twice, let's handle + * that gracefully and not create two conflicting + * entries. + */ + if (kset_find_obj(elog_kset, name)) + return; + + create_elog_obj(log_id, elog_size, elog_type); +} + +static DECLARE_WORK(elog_work, elog_work_fn); + +static int elog_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + /* check for error log event */ + if (events & OPAL_EVENT_ERROR_LOG_AVAIL) + schedule_work(&elog_work); + return 0; +} + +static struct notifier_block elog_nb = { + .notifier_call = elog_event, + .next = NULL, + .priority = 0 +}; + +int __init opal_elog_init(void) +{ + int rc = 0; + + elog_kset = kset_create_and_add("elog", NULL, opal_kobj); + if (!elog_kset) { + pr_warn("%s: failed to create elog kset\n", __func__); + return -1; + } + + rc = opal_notifier_register(&elog_nb); + if (rc) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, rc); + return rc; + } + + /* We are now ready to pull error logs from opal. */ + opal_resend_pending_logs(); + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c index 6ffa6b1ec5b7..5c21d9c07f45 100644 --- a/arch/powerpc/platforms/powernv/opal-flash.c +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> +#include <linux/delay.h> #include <asm/opal.h> @@ -76,11 +77,8 @@ /* Validate buffer size */ #define VALIDATE_BUF_SIZE 4096 -/* XXX: Assume candidate image size is <= 256MB */ -#define MAX_IMAGE_SIZE 0x10000000 - -/* Flash sg list version */ -#define SG_LIST_VERSION (1UL) +/* XXX: Assume candidate image size is <= 1GB */ +#define MAX_IMAGE_SIZE 0x40000000 /* Image status */ enum { @@ -103,30 +101,9 @@ struct image_header_t { uint32_t size; }; -/* Scatter/gather entry */ -struct opal_sg_entry { - void *data; - long length; -}; - -/* We calculate number of entries based on PAGE_SIZE */ -#define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry)) - -/* - * This struct is very similar but not identical to that - * needed by the opal flash update. All we need to do for - * opal is rewrite num_entries into a version/length and - * translate the pointers to absolute. - */ -struct opal_sg_list { - unsigned long num_entries; - struct opal_sg_list *next; - struct opal_sg_entry entry[SG_ENTRIES_PER_NODE]; -}; - struct validate_flash_t { int status; /* Return status */ - void *buf; /* Candiate image buffer */ + void *buf; /* Candidate image buffer */ uint32_t buf_size; /* Image size */ uint32_t result; /* Update results token */ }; @@ -152,11 +129,16 @@ static DEFINE_MUTEX(image_data_mutex); */ static inline void opal_flash_validate(void) { - struct validate_flash_t *args_buf = &validate_flash_data; + long ret; + void *buf = validate_flash_data.buf; + __be32 size = cpu_to_be32(validate_flash_data.buf_size); + __be32 result; + + ret = opal_validate_flash(__pa(buf), &size, &result); - args_buf->status = opal_validate_flash(__pa(args_buf->buf), - &(args_buf->buf_size), - &(args_buf->result)); + validate_flash_data.status = ret; + validate_flash_data.buf_size = be32_to_cpu(size); + validate_flash_data.result = be32_to_cpu(result); } /* @@ -289,94 +271,11 @@ static ssize_t manage_store(struct kobject *kobj, } /* - * Free sg list - */ -static void free_sg_list(struct opal_sg_list *list) -{ - struct opal_sg_list *sg1; - while (list) { - sg1 = list->next; - kfree(list); - list = sg1; - } - list = NULL; -} - -/* - * Build candidate image scatter gather list - * - * list format: - * ----------------------------------- - * | VER (8) | Entry length in bytes | - * ----------------------------------- - * | Pointer to next entry | - * ----------------------------------- - * | Address of memory area 1 | - * ----------------------------------- - * | Length of memory area 1 | - * ----------------------------------- - * | ......... | - * ----------------------------------- - * | ......... | - * ----------------------------------- - * | Address of memory area N | - * ----------------------------------- - * | Length of memory area N | - * ----------------------------------- - */ -static struct opal_sg_list *image_data_to_sglist(void) -{ - struct opal_sg_list *sg1, *list = NULL; - void *addr; - int size; - - addr = image_data.data; - size = image_data.size; - - sg1 = kzalloc((sizeof(struct opal_sg_list)), GFP_KERNEL); - if (!sg1) - return NULL; - - list = sg1; - sg1->num_entries = 0; - while (size > 0) { - /* Translate virtual address to physical address */ - sg1->entry[sg1->num_entries].data = - (void *)(vmalloc_to_pfn(addr) << PAGE_SHIFT); - - if (size > PAGE_SIZE) - sg1->entry[sg1->num_entries].length = PAGE_SIZE; - else - sg1->entry[sg1->num_entries].length = size; - - sg1->num_entries++; - if (sg1->num_entries >= SG_ENTRIES_PER_NODE) { - sg1->next = kzalloc((sizeof(struct opal_sg_list)), - GFP_KERNEL); - if (!sg1->next) { - pr_err("%s : Failed to allocate memory\n", - __func__); - goto nomem; - } - - sg1 = sg1->next; - sg1->num_entries = 0; - } - addr += PAGE_SIZE; - size -= PAGE_SIZE; - } - return list; -nomem: - free_sg_list(list); - return NULL; -} - -/* * OPAL update flash */ static int opal_flash_update(int op) { - struct opal_sg_list *sg, *list, *next; + struct opal_sg_list *list; unsigned long addr; int64_t rc = OPAL_PARAMETER; @@ -386,32 +285,13 @@ static int opal_flash_update(int op) goto flash; } - list = image_data_to_sglist(); + list = opal_vmalloc_to_sg_list(image_data.data, image_data.size); if (!list) goto invalid_img; /* First entry address */ addr = __pa(list); - /* Translate sg list address to absolute */ - for (sg = list; sg; sg = next) { - next = sg->next; - /* Don't translate NULL pointer for last entry */ - if (sg->next) - sg->next = (struct opal_sg_list *)__pa(sg->next); - else - sg->next = NULL; - - /* Make num_entries into the version/length field */ - sg->num_entries = (SG_LIST_VERSION << 56) | - (sg->num_entries * sizeof(struct opal_sg_entry) + 16); - } - - pr_alert("FLASH: Image is %u bytes\n", image_data.size); - pr_alert("FLASH: Image update requested\n"); - pr_alert("FLASH: Image will be updated during system reboot\n"); - pr_alert("FLASH: This will take several minutes. Do not power off!\n"); - flash: rc = opal_update_flash(addr); @@ -419,6 +299,47 @@ invalid_img: return rc; } +/* Return CPUs to OPAL before starting FW update */ +static void flash_return_cpu(void *info) +{ + int cpu = smp_processor_id(); + + if (!cpu_online(cpu)) + return; + + /* Disable IRQ */ + hard_irq_disable(); + + /* Return the CPU to OPAL */ + opal_return_cpu(); +} + +/* This gets called just before system reboots */ +void opal_flash_term_callback(void) +{ + struct cpumask mask; + + if (update_flash_data.status != FLASH_IMG_READY) + return; + + pr_alert("FLASH: Flashing new firmware\n"); + pr_alert("FLASH: Image is %u bytes\n", image_data.size); + pr_alert("FLASH: Performing flash and reboot/shutdown\n"); + pr_alert("FLASH: This will take several minutes. Do not power off!\n"); + + /* Small delay to help getting the above message out */ + msleep(500); + + /* Return secondary CPUs to firmware */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + if (!cpumask_empty(&mask)) + smp_call_function_many(&mask, + flash_return_cpu, NULL, false); + /* Hard disable interrupts */ + hard_irq_disable(); +} + /* * Show candidate image status */ @@ -500,7 +421,7 @@ static int alloc_image_buf(char *buffer, size_t count) memcpy(&image_header, (void *)buffer, sizeof(struct image_header_t)); image_data.size = be32_to_cpu(image_header.size); - pr_debug("FLASH: Candiate image size = %u\n", image_data.size); + pr_debug("FLASH: Candidate image size = %u\n", image_data.size); if (image_data.size > MAX_IMAGE_SIZE) { pr_warn("FLASH: Too large image\n"); diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 79d83cad3d67..f04b4d8aca5a 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -12,12 +12,17 @@ #include <linux/kernel.h> #include <linux/of.h> #include <linux/bug.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/slab.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/xics.h> #include <asm/opal.h> #include <asm/prom.h> +#include <asm/uaccess.h> +#include <asm/debug.h> static int opal_lpc_chip_id = -1; @@ -176,6 +181,152 @@ static const struct ppc_pci_io opal_lpc_io = { .outsl = opal_lpc_outsl, }; +#ifdef CONFIG_DEBUG_FS +struct lpc_debugfs_entry { + enum OpalLPCAddressType lpc_type; +}; + +static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct lpc_debugfs_entry *lpc = filp->private_data; + u32 data, pos, len, todo; + int rc; + + if (!access_ok(VERIFY_WRITE, ubuf, count)) + return -EFAULT; + + todo = count; + while (todo) { + pos = *ppos; + + /* + * Select access size based on count and alignment and + * access type. IO and MEM only support byte acceses, + * FW supports all 3. + */ + len = 1; + if (lpc->lpc_type == OPAL_LPC_FW) { + if (todo > 3 && (pos & 3) == 0) + len = 4; + else if (todo > 1 && (pos & 1) == 0) + len = 2; + } + rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos, + &data, len); + if (rc) + return -ENXIO; + switch(len) { + case 4: + rc = __put_user((u32)data, (u32 __user *)ubuf); + break; + case 2: + rc = __put_user((u16)data, (u16 __user *)ubuf); + break; + default: + rc = __put_user((u8)data, (u8 __user *)ubuf); + break; + } + if (rc) + return -EFAULT; + *ppos += len; + ubuf += len; + todo -= len; + } + + return count; +} + +static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct lpc_debugfs_entry *lpc = filp->private_data; + u32 data, pos, len, todo; + int rc; + + if (!access_ok(VERIFY_READ, ubuf, count)) + return -EFAULT; + + todo = count; + while (todo) { + pos = *ppos; + + /* + * Select access size based on count and alignment and + * access type. IO and MEM only support byte acceses, + * FW supports all 3. + */ + len = 1; + if (lpc->lpc_type == OPAL_LPC_FW) { + if (todo > 3 && (pos & 3) == 0) + len = 4; + else if (todo > 1 && (pos & 1) == 0) + len = 2; + } + switch(len) { + case 4: + rc = __get_user(data, (u32 __user *)ubuf); + break; + case 2: + rc = __get_user(data, (u16 __user *)ubuf); + break; + default: + rc = __get_user(data, (u8 __user *)ubuf); + break; + } + if (rc) + return -EFAULT; + + rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos, + data, len); + if (rc) + return -ENXIO; + *ppos += len; + ubuf += len; + todo -= len; + } + + return count; +} + +static const struct file_operations lpc_fops = { + .read = lpc_debug_read, + .write = lpc_debug_write, + .open = simple_open, + .llseek = default_llseek, +}; + +static int opal_lpc_debugfs_create_type(struct dentry *folder, + const char *fname, + enum OpalLPCAddressType type) +{ + struct lpc_debugfs_entry *entry; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->lpc_type = type; + debugfs_create_file(fname, 0600, folder, entry, &lpc_fops); + return 0; +} + +static int opal_lpc_init_debugfs(void) +{ + struct dentry *root; + int rc = 0; + + if (opal_lpc_chip_id < 0) + return -ENODEV; + + root = debugfs_create_dir("lpc", powerpc_debugfs_root); + + rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); + rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); + rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW); + return rc; +} +device_initcall(opal_lpc_init_debugfs); +#endif /* CONFIG_DEBUG_FS */ + void opal_lpc_init(void) { struct device_node *np; diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c new file mode 100644 index 000000000000..b17a34b695ef --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -0,0 +1,146 @@ +/* + * OPAL asynchronus Memory error handling support in PowreNV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/opal.h> +#include <asm/cputable.h> + +static int opal_mem_err_nb_init; +static LIST_HEAD(opal_memory_err_list); +static DEFINE_SPINLOCK(opal_mem_err_lock); + +struct OpalMsgNode { + struct list_head list; + struct opal_msg msg; +}; + +static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt) +{ + uint64_t paddr_start, paddr_end; + + pr_debug("%s: Retrived memory error event, type: 0x%x\n", + __func__, merr_evt->type); + switch (merr_evt->type) { + case OPAL_MEM_ERR_TYPE_RESILIENCE: + paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start); + paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end); + break; + case OPAL_MEM_ERR_TYPE_DYN_DALLOC: + paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start); + paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end); + break; + default: + return; + } + + for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) { + memory_failure(paddr_start >> PAGE_SHIFT, 0, 0); + } +} + +static void handle_memory_error(void) +{ + unsigned long flags; + struct OpalMemoryErrorData *merr_evt; + struct OpalMsgNode *msg_node; + + spin_lock_irqsave(&opal_mem_err_lock, flags); + while (!list_empty(&opal_memory_err_list)) { + msg_node = list_entry(opal_memory_err_list.next, + struct OpalMsgNode, list); + list_del(&msg_node->list); + spin_unlock_irqrestore(&opal_mem_err_lock, flags); + + merr_evt = (struct OpalMemoryErrorData *) + &msg_node->msg.params[0]; + handle_memory_error_event(merr_evt); + kfree(msg_node); + spin_lock_irqsave(&opal_mem_err_lock, flags); + } + spin_unlock_irqrestore(&opal_mem_err_lock, flags); +} + +static void mem_error_handler(struct work_struct *work) +{ + handle_memory_error(); +} + +static DECLARE_WORK(mem_error_work, mem_error_handler); + +/* + * opal_memory_err_event - notifier handler that queues up the opal message + * to be preocessed later. + */ +static int opal_memory_err_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + unsigned long flags; + struct OpalMsgNode *msg_node; + + if (msg_type != OPAL_MSG_MEM_ERR) + return 0; + + msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); + if (!msg_node) { + pr_err("MEMORY_ERROR: out of memory, Opal message event not" + "handled\n"); + return -ENOMEM; + } + memcpy(&msg_node->msg, msg, sizeof(struct opal_msg)); + + spin_lock_irqsave(&opal_mem_err_lock, flags); + list_add(&msg_node->list, &opal_memory_err_list); + spin_unlock_irqrestore(&opal_mem_err_lock, flags); + + schedule_work(&mem_error_work); + return 0; +} + +static struct notifier_block opal_mem_err_nb = { + .notifier_call = opal_memory_err_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_mem_err_init(void) +{ + int ret; + + if (!opal_mem_err_nb_init) { + ret = opal_message_notifier_register( + OPAL_MSG_MEM_ERR, &opal_mem_err_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + opal_mem_err_nb_init = 1; + } + return 0; +} +subsys_initcall(opal_mem_err_init); diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c new file mode 100644 index 000000000000..44ed78af1a0d --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -0,0 +1,124 @@ +/* + * PowerNV OPAL in-memory console interface + * + * Copyright 2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/io.h> +#include <asm/opal.h> +#include <linux/debugfs.h> +#include <linux/of.h> +#include <linux/types.h> +#include <asm/barrier.h> + +/* OPAL in-memory console. Defined in OPAL source at core/console.c */ +struct memcons { + __be64 magic; +#define MEMCONS_MAGIC 0x6630696567726173L + __be64 obuf_phys; + __be64 ibuf_phys; + __be32 obuf_size; + __be32 ibuf_size; + __be32 out_pos; +#define MEMCONS_OUT_POS_WRAP 0x80000000u +#define MEMCONS_OUT_POS_MASK 0x00ffffffu + __be32 in_prod; + __be32 in_cons; +}; + +static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct memcons *mc = bin_attr->private; + const char *conbuf; + ssize_t ret; + size_t first_read = 0; + uint32_t out_pos, avail; + + if (!mc) + return -ENODEV; + + out_pos = be32_to_cpu(ACCESS_ONCE(mc->out_pos)); + + /* Now we've read out_pos, put a barrier in before reading the new + * data it points to in conbuf. */ + smp_rmb(); + + conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); + + /* When the buffer has wrapped, read from the out_pos marker to the end + * of the buffer, and then read the remaining data as in the un-wrapped + * case. */ + if (out_pos & MEMCONS_OUT_POS_WRAP) { + + out_pos &= MEMCONS_OUT_POS_MASK; + avail = be32_to_cpu(mc->obuf_size) - out_pos; + + ret = memory_read_from_buffer(to, count, &pos, + conbuf + out_pos, avail); + + if (ret < 0) + goto out; + + first_read = ret; + to += first_read; + count -= first_read; + pos -= avail; + + if (count <= 0) + goto out; + } + + /* Sanity check. The firmware should not do this to us. */ + if (out_pos > be32_to_cpu(mc->obuf_size)) { + pr_err("OPAL: memory console corruption. Aborting read.\n"); + return -EINVAL; + } + + ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos); + + if (ret < 0) + goto out; + + ret += first_read; +out: + return ret; +} + +static struct bin_attribute opal_msglog_attr = { + .attr = {.name = "msglog", .mode = 0444}, + .read = opal_msglog_read +}; + +void __init opal_msglog_init(void) +{ + u64 mcaddr; + struct memcons *mc; + + if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { + pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); + return; + } + + mc = phys_to_virt(mcaddr); + if (!mc) { + pr_warn("OPAL: memory console address is invalid\n"); + return; + } + + if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { + pr_warn("OPAL: memory console version is invalid\n"); + return; + } + + opal_msglog_attr.private = mc; + + if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0) + pr_warn("OPAL: sysfs file creation failed\n"); +} diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c index 7d07c7e80ec0..b1885db8fdf3 100644 --- a/arch/powerpc/platforms/powernv/opal-rtc.c +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -18,6 +18,7 @@ #include <asm/opal.h> #include <asm/firmware.h> +#include <asm/machdep.h> static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) { @@ -48,8 +49,11 @@ unsigned long __init opal_get_boot_time(void) else mdelay(10); } - if (rc != OPAL_SUCCESS) + if (rc != OPAL_SUCCESS) { + ppc_md.get_rtc_time = NULL; + ppc_md.set_rtc_time = NULL; return 0; + } y_m_d = be32_to_cpu(__y_m_d); h_m_s_ms = be64_to_cpu(__h_m_s_ms); opal_to_tm(y_m_d, h_m_s_ms, &tm); diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c new file mode 100644 index 000000000000..10271ad1fac4 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -0,0 +1,66 @@ +/* + * PowerNV sensor code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/mutex.h> +#include <asm/opal.h> + +static DEFINE_MUTEX(opal_sensor_mutex); + +/* + * This will return sensor information to driver based on the requested sensor + * handle. A handle is an opaque id for the powernv, read by the driver from the + * device tree.. + */ +int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) +{ + int ret, token; + struct opal_msg msg; + __be32 data; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_err("%s: Couldn't get the token, returning\n", __func__); + ret = token; + goto out; + } + + mutex_lock(&opal_sensor_mutex); + ret = opal_sensor_read(sensor_hndl, token, &data); + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %d\n", + __func__, ret); + goto out_token; + } + + *sensor_data = be32_to_cpu(data); + ret = be64_to_cpu(msg.params[1]); + +out_token: + mutex_unlock(&opal_sensor_mutex); + opal_async_release_token(token); +out: + return ret; +} +EXPORT_SYMBOL_GPL(opal_get_sensor_data); diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c new file mode 100644 index 000000000000..9d1acf22a099 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -0,0 +1,304 @@ +/* + * PowerNV system parameter code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kobject.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/gfp.h> +#include <linux/stat.h> +#include <asm/opal.h> + +#define MAX_PARAM_DATA_LEN 64 + +static DEFINE_MUTEX(opal_sysparam_mutex); +static struct kobject *sysparam_kobj; +static void *param_data_buf; + +struct param_attr { + struct list_head list; + u32 param_id; + u32 param_size; + struct kobj_attribute kobj_attr; +}; + +static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) +{ + struct opal_msg msg; + ssize_t ret; + int token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + if (token != -ERESTARTSYS) + pr_err("%s: Couldn't get the token, returning\n", + __func__); + ret = token; + goto out; + } + + ret = opal_get_param(token, param_id, (u64)buffer, length); + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %zd\n", + __func__, ret); + goto out_token; + } + + ret = be64_to_cpu(msg.params[1]); + +out_token: + opal_async_release_token(token); +out: + return ret; +} + +static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) +{ + struct opal_msg msg; + int ret, token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + if (token != -ERESTARTSYS) + pr_err("%s: Couldn't get the token, returning\n", + __func__); + ret = token; + goto out; + } + + ret = opal_set_param(token, param_id, (u64)buffer, length); + + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %d\n", + __func__, ret); + goto out_token; + } + + ret = be64_to_cpu(msg.params[1]); + +out_token: + opal_async_release_token(token); +out: + return ret; +} + +static ssize_t sys_param_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, char *buf) +{ + struct param_attr *attr = container_of(kobj_attr, struct param_attr, + kobj_attr); + ssize_t ret; + + mutex_lock(&opal_sysparam_mutex); + ret = opal_get_sys_param(attr->param_id, attr->param_size, + param_data_buf); + if (ret) + goto out; + + memcpy(buf, param_data_buf, attr->param_size); + + ret = attr->param_size; +out: + mutex_unlock(&opal_sysparam_mutex); + return ret; +} + +static ssize_t sys_param_store(struct kobject *kobj, + struct kobj_attribute *kobj_attr, const char *buf, size_t count) +{ + struct param_attr *attr = container_of(kobj_attr, struct param_attr, + kobj_attr); + ssize_t ret; + + /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */ + if (count > MAX_PARAM_DATA_LEN) + count = MAX_PARAM_DATA_LEN; + + mutex_lock(&opal_sysparam_mutex); + memcpy(param_data_buf, buf, count); + ret = opal_set_sys_param(attr->param_id, attr->param_size, + param_data_buf); + mutex_unlock(&opal_sysparam_mutex); + if (!ret) + ret = count; + return ret; +} + +void __init opal_sys_param_init(void) +{ + struct device_node *sysparam; + struct param_attr *attr; + u32 *id, *size; + int count, i; + u8 *perm; + + if (!opal_kobj) { + pr_warn("SYSPARAM: opal kobject is not available\n"); + goto out; + } + + sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj); + if (!sysparam_kobj) { + pr_err("SYSPARAM: Failed to create sysparam kobject\n"); + goto out; + } + + /* Allocate big enough buffer for any get/set transactions */ + param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL); + if (!param_data_buf) { + pr_err("SYSPARAM: Failed to allocate memory for param data " + "buf\n"); + goto out_kobj_put; + } + + sysparam = of_find_node_by_path("/ibm,opal/sysparams"); + if (!sysparam) { + pr_err("SYSPARAM: Opal sysparam node not found\n"); + goto out_param_buf; + } + + if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) { + pr_err("SYSPARAM: Opal sysparam node not compatible\n"); + goto out_node_put; + } + + /* Number of parameters exposed through DT */ + count = of_property_count_strings(sysparam, "param-name"); + if (count < 0) { + pr_err("SYSPARAM: No string found of property param-name in " + "the node %s\n", sysparam->name); + goto out_node_put; + } + + id = kzalloc(sizeof(*id) * count, GFP_KERNEL); + if (!id) { + pr_err("SYSPARAM: Failed to allocate memory to read parameter " + "id\n"); + goto out_node_put; + } + + size = kzalloc(sizeof(*size) * count, GFP_KERNEL); + if (!size) { + pr_err("SYSPARAM: Failed to allocate memory to read parameter " + "size\n"); + goto out_free_id; + } + + perm = kzalloc(sizeof(*perm) * count, GFP_KERNEL); + if (!perm) { + pr_err("SYSPARAM: Failed to allocate memory to read supported " + "action on the parameter"); + goto out_free_size; + } + + if (of_property_read_u32_array(sysparam, "param-id", id, count)) { + pr_err("SYSPARAM: Missing property param-id in the DT\n"); + goto out_free_perm; + } + + if (of_property_read_u32_array(sysparam, "param-len", size, count)) { + pr_err("SYSPARAM: Missing property param-len in the DT\n"); + goto out_free_perm; + } + + + if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) { + pr_err("SYSPARAM: Missing property param-perm in the DT\n"); + goto out_free_perm; + } + + attr = kzalloc(sizeof(*attr) * count, GFP_KERNEL); + if (!attr) { + pr_err("SYSPARAM: Failed to allocate memory for parameter " + "attributes\n"); + goto out_free_perm; + } + + /* For each of the parameters, populate the parameter attributes */ + for (i = 0; i < count; i++) { + if (size[i] > MAX_PARAM_DATA_LEN) { + pr_warn("SYSPARAM: Not creating parameter %d as size " + "exceeds buffer length\n", i); + continue; + } + + sysfs_attr_init(&attr[i].kobj_attr.attr); + attr[i].param_id = id[i]; + attr[i].param_size = size[i]; + if (of_property_read_string_index(sysparam, "param-name", i, + &attr[i].kobj_attr.attr.name)) + continue; + + /* If the parameter is read-only or read-write */ + switch (perm[i] & 3) { + case OPAL_SYSPARAM_READ: + attr[i].kobj_attr.attr.mode = S_IRUGO; + break; + case OPAL_SYSPARAM_WRITE: + attr[i].kobj_attr.attr.mode = S_IWUSR; + break; + case OPAL_SYSPARAM_RW: + attr[i].kobj_attr.attr.mode = S_IRUGO | S_IWUSR; + break; + default: + break; + } + + attr[i].kobj_attr.show = sys_param_show; + attr[i].kobj_attr.store = sys_param_store; + + if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) { + pr_err("SYSPARAM: Failed to create sysfs file %s\n", + attr[i].kobj_attr.attr.name); + goto out_free_attr; + } + } + + kfree(perm); + kfree(size); + kfree(id); + of_node_put(sysparam); + return; + +out_free_attr: + kfree(attr); +out_free_perm: + kfree(perm); +out_free_size: + kfree(size); +out_free_id: + kfree(id); +out_node_put: + of_node_put(sysparam); +out_param_buf: + kfree(param_data_buf); +out_kobj_put: + kobject_put(sysparam_kobj); +out: + return; +} diff --git a/arch/powerpc/platforms/powernv/opal-takeover.S b/arch/powerpc/platforms/powernv/opal-takeover.S deleted file mode 100644 index 3cd262897c27..000000000000 --- a/arch/powerpc/platforms/powernv/opal-takeover.S +++ /dev/null @@ -1,138 +0,0 @@ -/* - * PowerNV OPAL takeover assembly code, for use by prom_init.c - * - * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/ppc_asm.h> -#include <asm/hvcall.h> -#include <asm/asm-offsets.h> -#include <asm/opal.h> - -#define H_HAL_TAKEOVER 0x5124 -#define H_HAL_TAKEOVER_QUERY_MAGIC -1 - - .text -_GLOBAL(opal_query_takeover) - mfcr r0 - stw r0,8(r1) - std r3,STK_PARAM(R3)(r1) - std r4,STK_PARAM(R4)(r1) - li r3,H_HAL_TAKEOVER - li r4,H_HAL_TAKEOVER_QUERY_MAGIC - HVSC - ld r10,STK_PARAM(R3)(r1) - std r4,0(r10) - ld r10,STK_PARAM(R4)(r1) - std r5,0(r10) - lwz r0,8(r1) - mtcrf 0xff,r0 - blr - -_GLOBAL(opal_do_takeover) - mfcr r0 - stw r0,8(r1) - mflr r0 - std r0,16(r1) - bl __opal_do_takeover - ld r0,16(r1) - mtlr r0 - lwz r0,8(r1) - mtcrf 0xff,r0 - blr - -__opal_do_takeover: - ld r4,0(r3) - ld r5,0x8(r3) - ld r6,0x10(r3) - ld r7,0x18(r3) - ld r8,0x20(r3) - ld r9,0x28(r3) - ld r10,0x30(r3) - ld r11,0x38(r3) - li r3,H_HAL_TAKEOVER - HVSC - blr - - .globl opal_secondary_entry -opal_secondary_entry: - mr r31,r3 - mfmsr r11 - li r12,(MSR_SF | MSR_ISF)@highest - sldi r12,r12,48 - or r11,r11,r12 - mtmsrd r11 - isync - mfspr r4,SPRN_PIR - std r4,0(r3) -1: HMT_LOW - ld r4,8(r3) - cmpli cr0,r4,0 - beq 1b - HMT_MEDIUM -1: addi r3,r31,16 - bl __opal_do_takeover - b 1b - -_GLOBAL(opal_enter_rtas) - mflr r0 - std r0,16(r1) - stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ - - /* Because PROM is running in 32b mode, it clobbers the high order half - * of all registers that it saves. We therefore save those registers - * PROM might touch to the stack. (r0, r3-r13 are caller saved) - */ - SAVE_GPR(2, r1) - SAVE_GPR(13, r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) - mfcr r10 - mfmsr r11 - std r10,_CCR(r1) - std r11,_MSR(r1) - - /* Get the PROM entrypoint */ - mtlr r5 - - /* Switch MSR to 32 bits mode - */ - li r12,1 - rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) - andc r11,r11,r12 - li r12,1 - rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) - andc r11,r11,r12 - mtmsrd r11 - isync - - /* Enter RTAS here... */ - blrl - - /* Just make sure that r1 top 32 bits didn't get - * corrupt by OF - */ - rldicl r1,r1,0,32 - - /* Restore the MSR (back to 64 bits) */ - ld r0,_MSR(r1) - MTMSRD(r0) - isync - - /* Restore other registers */ - REST_GPR(2, r1) - REST_GPR(13, r1) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) - ld r4,_CCR(r1) - mtcr r4 - - addi r1,r1,PROM_FRAME_SIZE - ld r0,16(r1) - mtlr r0 - blr diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index e7806504e976..4abbff22a61f 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -32,7 +32,7 @@ std r12,PACASAVEDMSR(r13); \ andc r12,r12,r0; \ mtmsrd r12,1; \ - LOAD_REG_ADDR(r0,.opal_return); \ + LOAD_REG_ADDR(r0,opal_return); \ mtlr r0; \ li r0,MSR_DR|MSR_IR|MSR_LE;\ andc r12,r12,r0; \ @@ -44,7 +44,7 @@ mtspr SPRN_HSRR0,r12; \ hrfid -_STATIC(opal_return) +opal_return: /* * Fixup endian on OPAL return... we should be able to simplify * this by instead converting the below trampoline to a set of @@ -61,6 +61,7 @@ _STATIC(opal_return) mtcr r4; rfid +OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); @@ -123,6 +124,25 @@ OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); +OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param, OPAL_GET_PARAM); +OPAL_CALL(opal_set_param, OPAL_SET_PARAM); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 4fbf276ac99e..4cd2ea6c0dbe 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -71,11 +71,11 @@ static int opal_xscom_err_xlate(int64_t rc) } } -static u64 opal_scom_unmangle(u64 reg) +static u64 opal_scom_unmangle(u64 addr) { /* * XSCOM indirect addresses have the top bit set. Additionally - * the reset of the top 3 nibbles is always 0. + * the rest of the top 3 nibbles is always 0. * * Because the debugfs interface uses signed offsets and shifts * the address left by 3, we basically cannot use the top 4 bits @@ -86,10 +86,13 @@ static u64 opal_scom_unmangle(u64 reg) * conversion here. To leave room for further xscom address * expansion, we only clear out the top byte * + * For in-kernel use, we also support the real indirect bit, so + * we test for any of the top 5 bits + * */ - if (reg & (1ull << 59)) - reg = (reg & ~(0xffull << 56)) | (1ull << 63); - return reg; + if (addr & (0x1full << 59)) + addr = (addr & ~(0xffull << 56)) | (1ull << 63); + return addr; } static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) @@ -98,8 +101,8 @@ static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) int64_t rc; __be64 v; - reg = opal_scom_unmangle(reg); - rc = opal_xscom_read(m->chip, m->addr + reg, (__be64 *)__pa(&v)); + reg = opal_scom_unmangle(m->addr + reg); + rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); *value = be64_to_cpu(v); return opal_xscom_err_xlate(rc); } @@ -109,8 +112,8 @@ static int opal_scom_write(scom_map_t map, u64 reg, u64 value) struct opal_scom_map *m = map; int64_t rc; - reg = opal_scom_unmangle(reg); - rc = opal_xscom_write(m->chip, m->addr + reg, value); + reg = opal_scom_unmangle(m->addr + reg); + rc = opal_xscom_write(m->chip, reg, value); return opal_xscom_err_xlate(rc); } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 1c798cd55372..199975613fe9 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -18,9 +18,13 @@ #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/slab.h> +#include <linux/sched.h> #include <linux/kobject.h> +#include <linux/delay.h> +#include <linux/memblock.h> #include <asm/opal.h> #include <asm/firmware.h> +#include <asm/mce.h> #include "powernv.h" @@ -30,40 +34,70 @@ struct kobject *opal_kobj; struct opal { u64 base; u64 entry; + u64 size; } opal; -static struct device_node *opal_node; +struct mcheck_recoverable_range { + u64 start_addr; + u64 end_addr; + u64 recover_addr; +}; + +static struct mcheck_recoverable_range *mc_recoverable_range; +static int mc_recoverable_range_len; + +struct device_node *opal_node; static DEFINE_SPINLOCK(opal_write_lock); extern u64 opal_mc_secondary_handler[]; static unsigned int *opal_irqs; static unsigned int opal_irq_count; static ATOMIC_NOTIFIER_HEAD(opal_notifier_head); +static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; static DEFINE_SPINLOCK(opal_notifier_lock); static uint64_t last_notified_mask = 0x0ul; static atomic_t opal_notifier_hold = ATOMIC_INIT(0); +static void opal_reinit_cores(void) +{ + /* Do the actual re-init, This will clobber all FPRs, VRs, etc... + * + * It will preserve non volatile GPRs and HSPRG0/1. It will + * also restore HIDs and other SPRs to their original value + * but it might clobber a bunch. + */ +#ifdef __BIG_ENDIAN__ + opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); +#else + opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); +#endif +} + int __init early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data) { - const void *basep, *entryp; - unsigned long basesz, entrysz; + const void *basep, *entryp, *sizep; + int basesz, entrysz, runtimesz; if (depth != 1 || strcmp(uname, "ibm,opal") != 0) return 0; basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz); entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz); + sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz); - if (!basep || !entryp) + if (!basep || !entryp || !sizep) return 1; opal.base = of_read_number(basep, basesz/4); opal.entry = of_read_number(entryp, entrysz/4); + opal.size = of_read_number(sizep, runtimesz/4); - pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%ld)\n", + pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%d)\n", opal.base, basep, basesz); - pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%ld)\n", + pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n", opal.entry, entryp, entrysz); + pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n", + opal.size, sizep, runtimesz); powerpc_firmware_features |= FW_FEATURE_OPAL; if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) { @@ -77,6 +111,72 @@ int __init early_init_dt_scan_opal(unsigned long node, printk("OPAL V1 detected !\n"); } + /* Reinit all cores with the right endian */ + opal_reinit_cores(); + + /* Restore some bits */ + if (cur_cpu_spec->cpu_restore) + cur_cpu_spec->cpu_restore(); + + return 1; +} + +int __init early_init_dt_scan_recoverable_ranges(unsigned long node, + const char *uname, int depth, void *data) +{ + int i, psize, size; + const __be32 *prop; + + if (depth != 1 || strcmp(uname, "ibm,opal") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize); + + if (!prop) + return 1; + + pr_debug("Found machine check recoverable ranges.\n"); + + /* + * Calculate number of available entries. + * + * Each recoverable address range entry is (start address, len, + * recovery address), 2 cells each for start and recovery address, + * 1 cell for len, totalling 5 cells per entry. + */ + mc_recoverable_range_len = psize / (sizeof(*prop) * 5); + + /* Sanity check */ + if (!mc_recoverable_range_len) + return 1; + + /* Size required to hold all the entries. */ + size = mc_recoverable_range_len * + sizeof(struct mcheck_recoverable_range); + + /* + * Allocate a buffer to hold the MC recoverable ranges. We would be + * accessing them in real mode, hence it needs to be within + * RMO region. + */ + mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64), + ppc64_rma_size)); + memset(mc_recoverable_range, 0, size); + + for (i = 0; i < mc_recoverable_range_len; i++) { + mc_recoverable_range[i].start_addr = + of_read_number(prop + (i * 5) + 0, 2); + mc_recoverable_range[i].end_addr = + mc_recoverable_range[i].start_addr + + of_read_number(prop + (i * 5) + 2, 1); + mc_recoverable_range[i].recover_addr = + of_read_number(prop + (i * 5) + 3, 2); + + pr_debug("Machine check recoverable range: %llx..%llx: %llx\n", + mc_recoverable_range[i].start_addr, + mc_recoverable_range[i].end_addr, + mc_recoverable_range[i].recover_addr); + } return 1; } @@ -88,14 +188,10 @@ static int __init opal_register_exception_handlers(void) if (!(powerpc_firmware_features & FW_FEATURE_OPAL)) return -ENODEV; - /* Hookup some exception handlers. We use the fwnmi area at 0x7000 - * to provide the glue space to OPAL + /* Hookup some exception handlers except machine check. We use the + * fwnmi area at 0x7000 to provide the glue space to OPAL */ glue = 0x7000; - opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER, - __pa(opal_mc_secondary_handler[0]), - glue); - glue += 128; opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER, 0, glue); glue += 128; @@ -118,6 +214,20 @@ int opal_notifier_register(struct notifier_block *nb) atomic_notifier_chain_register(&opal_notifier_head, nb); return 0; } +EXPORT_SYMBOL_GPL(opal_notifier_register); + +int opal_notifier_unregister(struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + + atomic_notifier_chain_unregister(&opal_notifier_head, nb); + return 0; +} +EXPORT_SYMBOL_GPL(opal_notifier_unregister); static void opal_do_notifier(uint64_t events) { @@ -154,14 +264,14 @@ void opal_notifier_update_evt(uint64_t evt_mask, void opal_notifier_enable(void) { int64_t rc; - uint64_t evt = 0; + __be64 evt = 0; atomic_set(&opal_notifier_hold, 0); /* Process pending events */ rc = opal_poll_events(&evt); if (rc == OPAL_SUCCESS && evt) - opal_do_notifier(evt); + opal_do_notifier(be64_to_cpu(evt)); } void opal_notifier_disable(void) @@ -169,6 +279,97 @@ void opal_notifier_disable(void) atomic_set(&opal_notifier_hold, 1); } +/* + * Opal message notifier based on message type. Allow subscribers to get + * notified for specific messgae type. + */ +int opal_message_notifier_register(enum OpalMessageType msg_type, + struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + if (msg_type > OPAL_MSG_TYPE_MAX) { + pr_warning("%s: Invalid message type argument (%d)\n", + __func__, msg_type); + return -EINVAL; + } + return atomic_notifier_chain_register( + &opal_msg_notifier_head[msg_type], nb); +} + +static void opal_message_do_notify(uint32_t msg_type, void *msg) +{ + /* notify subscribers */ + atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type], + msg_type, msg); +} + +static void opal_handle_message(void) +{ + s64 ret; + /* + * TODO: pre-allocate a message buffer depending on opal-msg-size + * value in /proc/device-tree. + */ + static struct opal_msg msg; + u32 type; + + ret = opal_get_msg(__pa(&msg), sizeof(msg)); + /* No opal message pending. */ + if (ret == OPAL_RESOURCE) + return; + + /* check for errors. */ + if (ret) { + pr_warning("%s: Failed to retrive opal message, err=%lld\n", + __func__, ret); + return; + } + + type = be32_to_cpu(msg.msg_type); + + /* Sanity check */ + if (type > OPAL_MSG_TYPE_MAX) { + pr_warning("%s: Unknown message type: %u\n", __func__, type); + return; + } + opal_message_do_notify(type, (void *)&msg); +} + +static int opal_message_notify(struct notifier_block *nb, + unsigned long events, void *change) +{ + if (events & OPAL_EVENT_MSG_PENDING) + opal_handle_message(); + return 0; +} + +static struct notifier_block opal_message_nb = { + .notifier_call = opal_message_notify, + .next = NULL, + .priority = 0, +}; + +static int __init opal_message_init(void) +{ + int ret, i; + + for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) + ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); + + ret = opal_notifier_register(&opal_message_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + return 0; +} +early_initcall(opal_message_init); + int opal_get_chars(uint32_t vtermno, char *buf, int count) { s64 rc; @@ -180,7 +381,7 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count) if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0) return 0; len = cpu_to_be64(count); - rc = opal_console_read(vtermno, &len, buf); + rc = opal_console_read(vtermno, &len, buf); if (rc == OPAL_SUCCESS) return be64_to_cpu(len); return 0; @@ -254,119 +455,94 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) return written; } +static int opal_recover_mce(struct pt_regs *regs, + struct machine_check_event *evt) +{ + int recovered = 0; + uint64_t ea = get_mce_fault_addr(evt); + + if (!(regs->msr & MSR_RI)) { + /* If MSR_RI isn't set, we cannot recover */ + recovered = 0; + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { + /* Platform corrected itself */ + recovered = 1; + } else if (ea && !is_kernel_addr(ea)) { + /* + * Faulting address is not in kernel text. We should be fine. + * We need to find which process uses this address. + * For now, kill the task if we have received exception when + * in userspace. + * + * TODO: Queue up this address for hwpoisioning later. + */ + if (user_mode(regs) && !is_global_init(current)) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else + recovered = 0; + } else if (user_mode(regs) && !is_global_init(current) && + evt->severity == MCE_SEV_ERROR_SYNC) { + /* + * If we have received a synchronous error when in userspace + * kill the task. + */ + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } + return recovered; +} + int opal_machine_check(struct pt_regs *regs) { - struct opal_machine_check_event *opal_evt = get_paca()->opal_mc_evt; - struct opal_machine_check_event evt; - const char *level, *sevstr, *subtype; - static const char *opal_mc_ue_types[] = { - "Indeterminate", - "Instruction fetch", - "Page table walk ifetch", - "Load/Store", - "Page table walk Load/Store", - }; - static const char *opal_mc_slb_types[] = { - "Indeterminate", - "Parity", - "Multihit", - }; - static const char *opal_mc_erat_types[] = { - "Indeterminate", - "Parity", - "Multihit", - }; - static const char *opal_mc_tlb_types[] = { - "Indeterminate", - "Parity", - "Multihit", - }; - - /* Copy the event structure and release the original */ - evt = *opal_evt; - opal_evt->in_use = 0; + struct machine_check_event evt; + + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return 0; /* Print things out */ - if (evt.version != OpalMCE_V1) { + if (evt.version != MCE_V1) { pr_err("Machine Check Exception, Unknown event version %d !\n", evt.version); return 0; } - switch(evt.severity) { - case OpalMCE_SEV_NO_ERROR: - level = KERN_INFO; - sevstr = "Harmless"; - break; - case OpalMCE_SEV_WARNING: - level = KERN_WARNING; - sevstr = ""; - break; - case OpalMCE_SEV_ERROR_SYNC: - level = KERN_ERR; - sevstr = "Severe"; - break; - case OpalMCE_SEV_FATAL: - default: - level = KERN_ERR; - sevstr = "Fatal"; - break; - } + machine_check_print_event_info(&evt); - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - evt.disposition == OpalMCE_DISPOSITION_RECOVERED ? - "Recovered" : "[Not recovered"); - printk("%s Initiator: %s\n", level, - evt.initiator == OpalMCE_INITIATOR_CPU ? "CPU" : "Unknown"); - switch(evt.error_type) { - case OpalMCE_ERROR_TYPE_UE: - subtype = evt.u.ue_error.ue_error_type < - ARRAY_SIZE(opal_mc_ue_types) ? - opal_mc_ue_types[evt.u.ue_error.ue_error_type] - : "Unknown"; - printk("%s Error type: UE [%s]\n", level, subtype); - if (evt.u.ue_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.ue_error.effective_address); - if (evt.u.ue_error.physical_address_provided) - printk("%s Physial address: %016llx\n", - level, evt.u.ue_error.physical_address); - break; - case OpalMCE_ERROR_TYPE_SLB: - subtype = evt.u.slb_error.slb_error_type < - ARRAY_SIZE(opal_mc_slb_types) ? - opal_mc_slb_types[evt.u.slb_error.slb_error_type] - : "Unknown"; - printk("%s Error type: SLB [%s]\n", level, subtype); - if (evt.u.slb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.slb_error.effective_address); - break; - case OpalMCE_ERROR_TYPE_ERAT: - subtype = evt.u.erat_error.erat_error_type < - ARRAY_SIZE(opal_mc_erat_types) ? - opal_mc_erat_types[evt.u.erat_error.erat_error_type] - : "Unknown"; - printk("%s Error type: ERAT [%s]\n", level, subtype); - if (evt.u.erat_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.erat_error.effective_address); - break; - case OpalMCE_ERROR_TYPE_TLB: - subtype = evt.u.tlb_error.tlb_error_type < - ARRAY_SIZE(opal_mc_tlb_types) ? - opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type] - : "Unknown"; - printk("%s Error type: TLB [%s]\n", level, subtype); - if (evt.u.tlb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt.u.tlb_error.effective_address); - break; - default: - case OpalMCE_ERROR_TYPE_UNKNOWN: - printk("%s Error type: Unknown\n", level); - break; - } - return evt.severity == OpalMCE_SEV_FATAL ? 0 : 1; + if (opal_recover_mce(regs, &evt)) + return 1; + return 0; +} + +static uint64_t find_recovery_address(uint64_t nip) +{ + int i; + + for (i = 0; i < mc_recoverable_range_len; i++) + if ((nip >= mc_recoverable_range[i].start_addr) && + (nip < mc_recoverable_range[i].end_addr)) + return mc_recoverable_range[i].recover_addr; + return 0; +} + +bool opal_mce_check_early_recovery(struct pt_regs *regs) +{ + uint64_t recover_addr = 0; + + if (!opal.base || !opal.size) + goto out; + + if ((regs->nip >= opal.base) && + (regs->nip <= (opal.base + opal.size))) + recover_addr = find_recovery_address(regs->nip); + + /* + * Setup regs->nip to rfi into fixup address. + */ + if (recover_addr) + regs->nip = recover_addr; + +out: + return !!recover_addr; } static irqreturn_t opal_interrupt(int irq, void *data) @@ -375,7 +551,7 @@ static irqreturn_t opal_interrupt(int irq, void *data) opal_handle_interrupt(virq_to_hw(irq), &events); - opal_do_notifier(events); + opal_do_notifier(be64_to_cpu(events)); return IRQ_HANDLED; } @@ -440,8 +616,16 @@ static int __init opal_init(void) /* Create "opal" kobject under /sys/firmware */ rc = opal_sysfs_init(); if (rc == 0) { + /* Setup error log interface */ + rc = opal_elog_init(); /* Setup code update interface */ opal_flash_init(); + /* Setup platform dump extract interface */ + opal_platform_dump_init(); + /* Setup system parameters interface */ + opal_sys_param_init(); + /* Setup message log interface. */ + opal_msglog_init(); } return 0; @@ -451,10 +635,91 @@ subsys_initcall(opal_init); void opal_shutdown(void) { unsigned int i; + long rc = OPAL_BUSY; + /* First free interrupts, which will also mask them */ for (i = 0; i < opal_irq_count; i++) { if (opal_irqs[i]) free_irq(opal_irqs[i], NULL); opal_irqs[i] = 0; } + + /* + * Then sync with OPAL which ensure anything that can + * potentially write to our memory has completed such + * as an ongoing dump retrieval + */ + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_sync_host_reboot(); + if (rc == OPAL_BUSY) + opal_poll_events(NULL); + else + mdelay(10); + } +} + +/* Export this so that test modules can use it */ +EXPORT_SYMBOL_GPL(opal_invalid_call); + +/* Convert a region of vmalloc memory to an opal sg list */ +struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, + unsigned long vmalloc_size) +{ + struct opal_sg_list *sg, *first = NULL; + unsigned long i = 0; + + sg = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!sg) + goto nomem; + + first = sg; + + while (vmalloc_size > 0) { + uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT; + uint64_t length = min(vmalloc_size, PAGE_SIZE); + + sg->entry[i].data = cpu_to_be64(data); + sg->entry[i].length = cpu_to_be64(length); + i++; + + if (i >= SG_ENTRIES_PER_NODE) { + struct opal_sg_list *next; + + next = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!next) + goto nomem; + + sg->length = cpu_to_be64( + i * sizeof(struct opal_sg_entry) + 16); + i = 0; + sg->next = cpu_to_be64(__pa(next)); + sg = next; + } + + vmalloc_addr += length; + vmalloc_size -= length; + } + + sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16); + + return first; + +nomem: + pr_err("%s : Failed to allocate memory\n", __func__); + opal_free_sg_list(first); + return NULL; +} + +void opal_free_sg_list(struct opal_sg_list *sg) +{ + while (sg) { + uint64_t next = be64_to_cpu(sg->next); + + kfree(sg); + + if (next) + sg = __va(next); + else + sg = NULL; + } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2c6d173842b2..de19edeaa7a7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/pci.h> +#include <linux/crash_dump.h> #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/string.h> @@ -21,6 +22,7 @@ #include <linux/irq.h> #include <linux/io.h> #include <linux/msi.h> +#include <linux/memblock.h> #include <asm/sections.h> #include <asm/io.h> @@ -342,7 +344,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) pci_name(dev)); continue; } - pci_dev_get(dev); pdn->pcidev = dev; pdn->pe_number = pe->pe_number; pe->dma_weight += pnv_ioda_dma_weight(dev); @@ -460,15 +461,45 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev return; pe = &phb->ioda.pe_array[pdn->pe_number]; + WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); set_iommu_table_base(&pdev->dev, &pe->tce32_table); } +static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, + struct pci_dev *pdev, u64 dma_mask) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *pe; + uint64_t top; + bool bypass = false; + + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return -ENODEV;; + + pe = &phb->ioda.pe_array[pdn->pe_number]; + if (pe->tce_bypass_enabled) { + top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + bypass = (dma_mask >= top); + } + + if (bypass) { + dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); + set_dma_ops(&pdev->dev, &dma_direct_ops); + set_dma_offset(&pdev->dev, pe->tce_bypass_base); + } else { + dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); + set_dma_ops(&pdev->dev, &dma_iommu_ops); + set_iommu_table_base(&pdev->dev, &pe->tce32_table); + } + return 0; +} + static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, &pe->tce32_table); + set_iommu_table_base_and_group(&dev->dev, &pe->tce32_table); if (dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate); } @@ -633,18 +664,18 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, * errors, and on the first pass the data will be a relative * bus number, print that out instead. */ - tbl->it_busno = 0; pe->tce_inval_reg_phys = be64_to_cpup(swinvp); tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, 8); - tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE | - TCE_PCI_SWINV_PAIR; + tbl->it_type |= (TCE_PCI_SWINV_CREATE | + TCE_PCI_SWINV_FREE | + TCE_PCI_SWINV_PAIR); } iommu_init_table(tbl, phb->hose->node); - iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number); + iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); if (pe->pdev) - set_iommu_table_base(&pe->pdev->dev, tbl); + set_iommu_table_base_and_group(&pe->pdev->dev, tbl); else pnv_ioda_setup_bus_dma(pe, pe->pbus); @@ -657,6 +688,56 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } +static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +{ + struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, + tce32_table); + uint16_t window_id = (pe->pe_number << 1 ) + 1; + int64_t rc; + + pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); + if (enable) { + phys_addr_t top = memblock_end_of_DRAM(); + + top = roundup_pow_of_two(top); + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + window_id, + pe->tce_bypass_base, + top); + } else { + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + window_id, + pe->tce_bypass_base, + 0); + + /* + * We might want to reset the DMA ops of all devices on + * this PE. However in theory, that shouldn't be necessary + * as this is used for VFIO/KVM pass-through and the device + * hasn't yet been returned to its kernel driver + */ + } + if (rc) + pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); + else + pe->tce_bypass_enabled = enable; +} + +static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + /* TVE #1 is selected by PCI address bit 59 */ + pe->tce_bypass_base = 1ull << 59; + + /* Install set_bypass callback for VFIO */ + pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; + + /* Enable bypass by default */ + pnv_pci_ioda2_set_bypass(&pe->tce32_table, true); +} + static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { @@ -713,20 +794,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, * errors, and on the first pass the data will be a relative * bus number, print that out instead. */ - tbl->it_busno = 0; pe->tce_inval_reg_phys = be64_to_cpup(swinvp); tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, 8); - tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE; + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); } iommu_init_table(tbl, phb->hose->node); - iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number); + iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); if (pe->pdev) - set_iommu_table_base(&pe->pdev->dev, tbl); + set_iommu_table_base_and_group(&pe->pdev->dev, tbl); else pnv_ioda_setup_bus_dma(pe, pe->pbus); + /* Also create a bypass window */ + pnv_pci_ioda2_setup_bypass_pe(phb, pe); return; fail: if (pe->tce32_seg >= 0) @@ -1144,7 +1226,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, { struct pci_controller *hose; struct pnv_phb *phb; - unsigned long size, m32map_off, iomap_off, pemap_off; + unsigned long size, m32map_off, pemap_off, iomap_off = 0; const __be64 *prop64; const __be32 *prop32; int len; @@ -1231,7 +1313,6 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); m32map_off = size; size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); - iomap_off = size; if (phb->type == PNV_PHB_IODA1) { iomap_off = size; size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); @@ -1287,6 +1368,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Setup TCEs */ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; + phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; /* Setup shutdown function for kexec */ phb->shutdown = pnv_pci_ioda_shutdown; @@ -1304,12 +1386,24 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np, ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; ppc_md.pcibios_window_alignment = pnv_pci_window_alignment; + ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus; pci_add_flags(PCI_REASSIGN_ALL_RSRC); /* Reset IODA tables to a clean state */ rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); if (rc) pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); + + /* If we're running in kdump kerenl, the previous kerenl never + * shutdown PCI devices correctly. We already got IODA table + * cleaned out. So we have to issue PHB reset to stop all PCI + * transactions from previous kerenl. + */ + if (is_kdump_kernel()) { + pr_info(" Issue PHB reset ...\n"); + ioda_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); + ioda_eeh_phb_reset(hose, OPAL_DEASSERT_RESET); + } } void __init pnv_pci_init_ioda2_phb(struct device_node *np) diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index f8b4bd8afb2e..e3807d69393e 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -92,7 +92,7 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, pci_domain_nr(phb->hose->bus), phb->opal_id); } - set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table); + set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table); } static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 4eb33a9ed532..f91a4e5d872e 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -124,77 +124,195 @@ static void pnv_teardown_msi_irqs(struct pci_dev *pdev) } #endif /* CONFIG_PCI_MSI */ -static void pnv_pci_dump_p7ioc_diag_data(struct pnv_phb *phb) +static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) { - struct OpalIoP7IOCPhbErrorData *data = &phb->diag.p7ioc; + struct OpalIoP7IOCPhbErrorData *data; int i; - pr_info("PHB %d diagnostic data:\n", phb->hose->global_number); - - pr_info(" brdgCtl = 0x%08x\n", data->brdgCtl); - - pr_info(" portStatusReg = 0x%08x\n", data->portStatusReg); - pr_info(" rootCmplxStatus = 0x%08x\n", data->rootCmplxStatus); - pr_info(" busAgentStatus = 0x%08x\n", data->busAgentStatus); - - pr_info(" deviceStatus = 0x%08x\n", data->deviceStatus); - pr_info(" slotStatus = 0x%08x\n", data->slotStatus); - pr_info(" linkStatus = 0x%08x\n", data->linkStatus); - pr_info(" devCmdStatus = 0x%08x\n", data->devCmdStatus); - pr_info(" devSecStatus = 0x%08x\n", data->devSecStatus); - - pr_info(" rootErrorStatus = 0x%08x\n", data->rootErrorStatus); - pr_info(" uncorrErrorStatus = 0x%08x\n", data->uncorrErrorStatus); - pr_info(" corrErrorStatus = 0x%08x\n", data->corrErrorStatus); - pr_info(" tlpHdr1 = 0x%08x\n", data->tlpHdr1); - pr_info(" tlpHdr2 = 0x%08x\n", data->tlpHdr2); - pr_info(" tlpHdr3 = 0x%08x\n", data->tlpHdr3); - pr_info(" tlpHdr4 = 0x%08x\n", data->tlpHdr4); - pr_info(" sourceId = 0x%08x\n", data->sourceId); - - pr_info(" errorClass = 0x%016llx\n", data->errorClass); - pr_info(" correlator = 0x%016llx\n", data->correlator); - - pr_info(" p7iocPlssr = 0x%016llx\n", data->p7iocPlssr); - pr_info(" p7iocCsr = 0x%016llx\n", data->p7iocCsr); - pr_info(" lemFir = 0x%016llx\n", data->lemFir); - pr_info(" lemErrorMask = 0x%016llx\n", data->lemErrorMask); - pr_info(" lemWOF = 0x%016llx\n", data->lemWOF); - pr_info(" phbErrorStatus = 0x%016llx\n", data->phbErrorStatus); - pr_info(" phbFirstErrorStatus = 0x%016llx\n", data->phbFirstErrorStatus); - pr_info(" phbErrorLog0 = 0x%016llx\n", data->phbErrorLog0); - pr_info(" phbErrorLog1 = 0x%016llx\n", data->phbErrorLog1); - pr_info(" mmioErrorStatus = 0x%016llx\n", data->mmioErrorStatus); - pr_info(" mmioFirstErrorStatus = 0x%016llx\n", data->mmioFirstErrorStatus); - pr_info(" mmioErrorLog0 = 0x%016llx\n", data->mmioErrorLog0); - pr_info(" mmioErrorLog1 = 0x%016llx\n", data->mmioErrorLog1); - pr_info(" dma0ErrorStatus = 0x%016llx\n", data->dma0ErrorStatus); - pr_info(" dma0FirstErrorStatus = 0x%016llx\n", data->dma0FirstErrorStatus); - pr_info(" dma0ErrorLog0 = 0x%016llx\n", data->dma0ErrorLog0); - pr_info(" dma0ErrorLog1 = 0x%016llx\n", data->dma0ErrorLog1); - pr_info(" dma1ErrorStatus = 0x%016llx\n", data->dma1ErrorStatus); - pr_info(" dma1FirstErrorStatus = 0x%016llx\n", data->dma1FirstErrorStatus); - pr_info(" dma1ErrorLog0 = 0x%016llx\n", data->dma1ErrorLog0); - pr_info(" dma1ErrorLog1 = 0x%016llx\n", data->dma1ErrorLog1); + data = (struct OpalIoP7IOCPhbErrorData *)common; + pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n", + hose->global_number, common->version); + + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + data->brdgCtl); + if (data->portStatusReg || data->rootCmplxStatus || + data->busAgentStatus) + pr_info("UtlSts: %08x %08x %08x\n", + data->portStatusReg, data->rootCmplxStatus, + data->busAgentStatus); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + data->deviceStatus, data->slotStatus, + data->linkStatus, data->devCmdStatus, + data->devSecStatus); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + data->rootErrorStatus, data->uncorrErrorStatus, + data->corrErrorStatus); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + data->tlpHdr1, data->tlpHdr2, + data->tlpHdr3, data->tlpHdr4); + if (data->sourceId || data->errorClass || + data->correlator) + pr_info("RootErrLog1: %08x %016llx %016llx\n", + data->sourceId, data->errorClass, + data->correlator); + if (data->p7iocPlssr || data->p7iocCsr) + pr_info("PhbSts: %016llx %016llx\n", + data->p7iocPlssr, data->p7iocCsr); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + data->lemFir, data->lemErrorMask, + data->lemWOF); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + data->phbErrorStatus, data->phbFirstErrorStatus, + data->phbErrorLog0, data->phbErrorLog1); + if (data->mmioErrorStatus) + pr_info("OutErr: %016llx %016llx %016llx %016llx\n", + data->mmioErrorStatus, data->mmioFirstErrorStatus, + data->mmioErrorLog0, data->mmioErrorLog1); + if (data->dma0ErrorStatus) + pr_info("InAErr: %016llx %016llx %016llx %016llx\n", + data->dma0ErrorStatus, data->dma0FirstErrorStatus, + data->dma0ErrorLog0, data->dma0ErrorLog1); + if (data->dma1ErrorStatus) + pr_info("InBErr: %016llx %016llx %016llx %016llx\n", + data->dma1ErrorStatus, data->dma1FirstErrorStatus, + data->dma1ErrorLog0, data->dma1ErrorLog1); for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { if ((data->pestA[i] >> 63) == 0 && (data->pestB[i] >> 63) == 0) continue; - pr_info(" PE[%3d] PESTA = 0x%016llx\n", i, data->pestA[i]); - pr_info(" PESTB = 0x%016llx\n", data->pestB[i]); + + pr_info("PE[%3d] A/B: %016llx %016llx\n", + i, data->pestA[i], data->pestB[i]); + } +} + +static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoPhb3ErrorData *data; + int i; + + data = (struct OpalIoPhb3ErrorData*)common; + pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n", + hose->global_number, be32_to_cpu(common->version)); + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + be32_to_cpu(data->brdgCtl)); + if (data->portStatusReg || data->rootCmplxStatus || + data->busAgentStatus) + pr_info("UtlSts: %08x %08x %08x\n", + be32_to_cpu(data->portStatusReg), + be32_to_cpu(data->rootCmplxStatus), + be32_to_cpu(data->busAgentStatus)); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + be32_to_cpu(data->deviceStatus), + be32_to_cpu(data->slotStatus), + be32_to_cpu(data->linkStatus), + be32_to_cpu(data->devCmdStatus), + be32_to_cpu(data->devSecStatus)); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + be32_to_cpu(data->rootErrorStatus), + be32_to_cpu(data->uncorrErrorStatus), + be32_to_cpu(data->corrErrorStatus)); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + be32_to_cpu(data->tlpHdr1), + be32_to_cpu(data->tlpHdr2), + be32_to_cpu(data->tlpHdr3), + be32_to_cpu(data->tlpHdr4)); + if (data->sourceId || data->errorClass || + data->correlator) + pr_info("RootErrLog1: %08x %016llx %016llx\n", + be32_to_cpu(data->sourceId), + be64_to_cpu(data->errorClass), + be64_to_cpu(data->correlator)); + if (data->nFir) + pr_info("nFir: %016llx %016llx %016llx\n", + be64_to_cpu(data->nFir), + be64_to_cpu(data->nFirMask), + be64_to_cpu(data->nFirWOF)); + if (data->phbPlssr || data->phbCsr) + pr_info("PhbSts: %016llx %016llx\n", + be64_to_cpu(data->phbPlssr), + be64_to_cpu(data->phbCsr)); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrorMask), + be64_to_cpu(data->lemWOF)); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbErrorStatus), + be64_to_cpu(data->phbFirstErrorStatus), + be64_to_cpu(data->phbErrorLog0), + be64_to_cpu(data->phbErrorLog1)); + if (data->mmioErrorStatus) + pr_info("OutErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->mmioErrorStatus), + be64_to_cpu(data->mmioFirstErrorStatus), + be64_to_cpu(data->mmioErrorLog0), + be64_to_cpu(data->mmioErrorLog1)); + if (data->dma0ErrorStatus) + pr_info("InAErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma0ErrorStatus), + be64_to_cpu(data->dma0FirstErrorStatus), + be64_to_cpu(data->dma0ErrorLog0), + be64_to_cpu(data->dma0ErrorLog1)); + if (data->dma1ErrorStatus) + pr_info("InBErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma1ErrorStatus), + be64_to_cpu(data->dma1FirstErrorStatus), + be64_to_cpu(data->dma1ErrorLog0), + be64_to_cpu(data->dma1ErrorLog1)); + + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && + (be64_to_cpu(data->pestB[i]) >> 63) == 0) + continue; + + pr_info("PE[%3d] A/B: %016llx %016llx\n", + i, be64_to_cpu(data->pestA[i]), + be64_to_cpu(data->pestB[i])); } } -static void pnv_pci_dump_phb_diag_data(struct pnv_phb *phb) +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, + unsigned char *log_buff) { - switch(phb->model) { - case PNV_PHB_MODEL_P7IOC: - pnv_pci_dump_p7ioc_diag_data(phb); + struct OpalIoPhbErrorCommon *common; + + if (!hose || !log_buff) + return; + + common = (struct OpalIoPhbErrorCommon *)log_buff; + switch (be32_to_cpu(common->ioType)) { + case OPAL_PHB_ERROR_DATA_TYPE_P7IOC: + pnv_pci_dump_p7ioc_diag_data(hose, common); + break; + case OPAL_PHB_ERROR_DATA_TYPE_PHB3: + pnv_pci_dump_phb3_diag_data(hose, common); break; default: - pr_warning("PCI %d: Can't decode this PHB diag data\n", - phb->hose->global_number); + pr_warn("%s: Unrecognized ioType %d\n", + __func__, be32_to_cpu(common->ioType)); } } @@ -222,7 +340,7 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) * with the normal errors generated when probing empty slots */ if (has_diag) - pnv_pci_dump_phb_diag_data(phb); + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); else pr_warning("PCI %d: No diag data available\n", phb->hose->global_number); @@ -274,9 +392,6 @@ int pnv_pci_cfg_read(struct device_node *dn, struct pci_dn *pdn = PCI_DN(dn); struct pnv_phb *phb = pdn->phb->private_data; u32 bdfn = (pdn->busno << 8) | pdn->devfn; -#ifdef CONFIG_EEH - struct eeh_pe *phb_pe = NULL; -#endif s64 rc; switch (size) { @@ -302,31 +417,9 @@ int pnv_pci_cfg_read(struct device_node *dn, default: return PCIBIOS_FUNC_NOT_SUPPORTED; } + cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", __func__, pdn->busno, pdn->devfn, where, size, *val); - - /* - * Check if the specified PE has been put into frozen - * state. On the other hand, we needn't do that while - * the PHB has been put into frozen state because of - * PHB-fatal errors. - */ -#ifdef CONFIG_EEH - phb_pe = eeh_phb_pe_get(pdn->phb); - if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED)) - return PCIBIOS_SUCCESSFUL; - - if (phb->eeh_state & PNV_EEH_STATE_ENABLED) { - if (*val == EEH_IO_ERROR_VALUE(size) && - eeh_dev_check_failure(of_node_to_eeh_dev(dn))) - return PCIBIOS_DEVICE_NOT_FOUND; - } else { - pnv_pci_config_check_eeh(phb, dn); - } -#else - pnv_pci_config_check_eeh(phb, dn); -#endif - return PCIBIOS_SUCCESSFUL; } @@ -353,33 +446,74 @@ int pnv_pci_cfg_write(struct device_node *dn, return PCIBIOS_FUNC_NOT_SUPPORTED; } - /* Check if the PHB got frozen due to an error (no response) */ -#ifdef CONFIG_EEH - if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED)) - pnv_pci_config_check_eeh(phb, dn); -#else - pnv_pci_config_check_eeh(phb, dn); -#endif - return PCIBIOS_SUCCESSFUL; } +#if CONFIG_EEH +static bool pnv_pci_cfg_check(struct pci_controller *hose, + struct device_node *dn) +{ + struct eeh_dev *edev = NULL; + struct pnv_phb *phb = hose->private_data; + + /* EEH not enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) + return true; + + /* PE reset or device removed ? */ + edev = of_node_to_eeh_dev(dn); + if (edev) { + if (edev->pe && + (edev->pe->state & EEH_PE_RESET)) + return false; + + if (edev->mode & EEH_DEV_REMOVED) + return false; + } + + return true; +} +#else +static inline pnv_pci_cfg_check(struct pci_controller *hose, + struct device_node *dn) +{ + return true; +} +#endif /* CONFIG_EEH */ + static int pnv_pci_read_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); struct pci_dn *pdn; + struct pnv_phb *phb; + bool found = false; + int ret; + *val = 0xFFFFFFFF; for (dn = busdn->child; dn; dn = dn->sibling) { pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn) - return pnv_pci_cfg_read(dn, where, size, val); + if (pdn && pdn->devfn == devfn) { + phb = pdn->phb->private_data; + found = true; + break; + } } - *val = 0xFFFFFFFF; - return PCIBIOS_DEVICE_NOT_FOUND; + if (!found || !pnv_pci_cfg_check(pdn->phb, dn)) + return PCIBIOS_DEVICE_NOT_FOUND; + ret = pnv_pci_cfg_read(dn, where, size, val); + if (phb->flags & PNV_PHB_FLAG_EEH) { + if (*val == EEH_IO_ERROR_VALUE(size) && + eeh_dev_check_failure(of_node_to_eeh_dev(dn))) + return PCIBIOS_DEVICE_NOT_FOUND; + } else { + pnv_pci_config_check_eeh(phb, dn); + } + + return ret; } static int pnv_pci_write_config(struct pci_bus *bus, @@ -388,14 +522,27 @@ static int pnv_pci_write_config(struct pci_bus *bus, { struct device_node *dn, *busdn = pci_bus_to_OF_node(bus); struct pci_dn *pdn; + struct pnv_phb *phb; + bool found = false; + int ret; for (dn = busdn->child; dn; dn = dn->sibling) { pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn) - return pnv_pci_cfg_write(dn, where, size, val); + if (pdn && pdn->devfn == devfn) { + phb = pdn->phb->private_data; + found = true; + break; + } } - return PCIBIOS_DEVICE_NOT_FOUND; + if (!found || !pnv_pci_cfg_check(pdn->phb, dn)) + return PCIBIOS_DEVICE_NOT_FOUND; + + ret = pnv_pci_cfg_write(dn, where, size, val); + if (!(phb->flags & PNV_PHB_FLAG_EEH)) + pnv_pci_config_check_eeh(phb, dn); + + return ret; } struct pci_ops pnv_pci_ops = { @@ -484,7 +631,8 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl, { tbl->it_blocksize = 16; tbl->it_base = (unsigned long)tce_mem; - tbl->it_offset = dma_offset >> IOMMU_PAGE_SHIFT; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; + tbl->it_offset = dma_offset >> tbl->it_page_shift; tbl->it_index = 0; tbl->it_size = tce_size >> 3; tbl->it_busno = 0; @@ -536,7 +684,7 @@ static void pnv_pci_dma_fallback_setup(struct pci_controller *hose, pdn->iommu_table = pnv_pci_setup_bml_iommu(hose); if (!pdn->iommu_table) return; - set_iommu_table_base(&pdev->dev, pdn->iommu_table); + set_iommu_table_base_and_group(&pdev->dev, pdn->iommu_table); } static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) @@ -553,6 +701,16 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) pnv_pci_dma_fallback_setup(hose, pdev); } +int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + + if (phb && phb->dma_set_mask) + return phb->dma_set_mask(phb, pdev, dma_mask); + return __dma_set_mask(&pdev->dev, dma_mask); +} + void pnv_pci_shutdown(void) { struct pci_controller *hose; @@ -657,3 +815,32 @@ void __init pnv_pci_init(void) ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; #endif } + +static int tce_iommu_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + return iommu_add_device(dev); + case BUS_NOTIFY_DEL_DEVICE: + if (dev->iommu_group) + iommu_del_device(dev); + return 0; + default: + return 0; + } +} + +static struct notifier_block tce_iommu_bus_nb = { + .notifier_call = tce_iommu_bus_notifier, +}; + +static int __init tce_iommu_bus_notifier_init(void) +{ + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); + return 0; +} + +subsys_initcall_sync(tce_iommu_bus_notifier_init); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 1ed8d5f40f5a..676232c34328 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -54,7 +54,9 @@ struct pnv_ioda_pe { struct iommu_table tce32_table; phys_addr_t tce_inval_reg_phys; - /* XXX TODO: Add support for additional 64-bit iommus */ + /* 64-bit TCE bypass region */ + bool tce_bypass_enabled; + uint64_t tce_bypass_base; /* MSIs. MVE index is identical for for 32 and 64 bit MSI * and -1 if not supported. (It's actually identical to the @@ -79,28 +81,27 @@ struct pnv_eeh_ops { int (*configure_bridge)(struct eeh_pe *pe); int (*next_error)(struct eeh_pe **pe); }; - -#define PNV_EEH_STATE_ENABLED (1 << 0) /* EEH enabled */ -#define PNV_EEH_STATE_REMOVED (1 << 1) /* PHB removed */ - #endif /* CONFIG_EEH */ +#define PNV_PHB_FLAG_EEH (1 << 0) + struct pnv_phb { struct pci_controller *hose; enum pnv_phb_type type; enum pnv_phb_model model; u64 hub_id; u64 opal_id; + int flags; void __iomem *regs; int initialized; spinlock_t lock; #ifdef CONFIG_EEH struct pnv_eeh_ops *eeh_ops; - int eeh_state; #endif #ifdef CONFIG_DEBUG_FS + int has_dbgfs; struct dentry *dbgfs; #endif @@ -113,6 +114,8 @@ struct pnv_phb { unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg); void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); + int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev, + u64 dma_mask); void (*fixup_phb)(struct pci_controller *hose); u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); void (*shutdown)(struct pnv_phb *phb); @@ -176,6 +179,7 @@ struct pnv_phb { union { unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; struct OpalIoP7IOCPhbErrorData p7ioc; + struct OpalIoPhb3ErrorData phb3; struct OpalIoP7IOCErrorData hub_diag; } diag; @@ -186,6 +190,8 @@ extern struct pci_ops pnv_pci_ops; extern struct pnv_eeh_ops ioda_eeh_ops; #endif +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, + unsigned char *log_buff); int pnv_pci_cfg_read(struct device_node *dn, int where, int size, u32 *val); int pnv_pci_cfg_write(struct device_node *dn, @@ -198,5 +204,7 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, __be64 *startp, __be64 *endp, bool rm); +extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); +extern int ioda_eeh_phb_reset(struct pci_controller *hose, int option); #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index de6819be1f95..75501bfede7f 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -7,14 +7,24 @@ extern void pnv_smp_init(void); static inline void pnv_smp_init(void) { } #endif +struct pci_dev; + #ifdef CONFIG_PCI extern void pnv_pci_init(void); extern void pnv_pci_shutdown(void); +extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask); #else static inline void pnv_pci_init(void) { } static inline void pnv_pci_shutdown(void) { } + +static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ + return -ENODEV; +} #endif extern void pnv_lpc_init(void); +bool cpu_core_split_required(void); + #endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 19884b2a51b4..d9b88fa7c5a3 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -26,6 +26,8 @@ #include <linux/of_fdt.h> #include <linux/interrupt.h> #include <linux/bug.h> +#include <linux/pci.h> +#include <linux/cpufreq.h> #include <asm/machdep.h> #include <asm/firmware.h> @@ -33,11 +35,14 @@ #include <asm/rtas.h> #include <asm/opal.h> #include <asm/kexec.h> +#include <asm/smp.h> #include "powernv.h" static void __init pnv_setup_arch(void) { + set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + /* Initialize SMP */ pnv_smp_init(); @@ -97,11 +102,32 @@ static void pnv_show_cpuinfo(struct seq_file *m) of_node_put(root); } +static void pnv_prepare_going_down(void) +{ + /* + * Disable all notifiers from OPAL, we can't + * service interrupts anymore anyway + */ + opal_notifier_disable(); + + /* Soft disable interrupts */ + local_irq_disable(); + + /* + * Return secondary CPUs to firwmare if a flash update + * is pending otherwise we will get all sort of error + * messages about CPU being stuck etc.. This will also + * have the side effect of hard disabling interrupts so + * past this point, the kernel is effectively dead. + */ + opal_flash_term_callback(); +} + static void __noreturn pnv_restart(char *cmd) { long rc = OPAL_BUSY; - opal_notifier_disable(); + pnv_prepare_going_down(); while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_cec_reboot(); @@ -118,7 +144,7 @@ static void __noreturn pnv_power_off(void) { long rc = OPAL_BUSY; - opal_notifier_disable(); + pnv_prepare_going_down(); while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_cec_power_down(0); @@ -140,34 +166,94 @@ static void pnv_progress(char *s, unsigned short hex) { } +static int pnv_dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (dev_is_pci(dev)) + return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask); + return __dma_set_mask(dev, dma_mask); +} + static void pnv_shutdown(void) { /* Let the PCI code clear up IODA tables */ pnv_pci_shutdown(); - /* And unregister all OPAL interrupts so they don't fire - * up while we kexec + /* + * Stop OPAL activity: Unregister all OPAL interrupts so they + * don't fire up while we kexec and make sure all potentially + * DMA'ing ops are complete (such as dump retrieval). */ opal_shutdown(); } #ifdef CONFIG_KEXEC +static void pnv_kexec_wait_secondaries_down(void) +{ + int my_cpu, i, notified = -1; + + my_cpu = get_cpu(); + + for_each_online_cpu(i) { + uint8_t status; + int64_t rc; + + if (i == my_cpu) + continue; + + for (;;) { + rc = opal_query_cpu_status(get_hard_smp_processor_id(i), + &status); + if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED) + break; + barrier(); + if (i != notified) { + printk(KERN_INFO "kexec: waiting for cpu %d " + "(physical %d) to enter OPAL\n", + i, paca[i].hw_cpu_id); + notified = i; + } + } + } +} + static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { xics_kexec_teardown_cpu(secondary); - /* Return secondary CPUs to firmware on OPAL v3 */ - if (firmware_has_feature(FW_FEATURE_OPALv3) && secondary) { + /* On OPAL v3, we return all CPUs to firmware */ + + if (!firmware_has_feature(FW_FEATURE_OPALv3)) + return; + + if (secondary) { + /* Return secondary CPUs to firmware on OPAL v3 */ mb(); get_paca()->kexec_state = KEXEC_STATE_REAL_MODE; mb(); /* Return the CPU to OPAL */ opal_return_cpu(); + } else if (crash_shutdown) { + /* + * On crash, we don't wait for secondaries to go + * down as they might be unreachable or hung, so + * instead we just wait a bit and move on. + */ + mdelay(1); + } else { + /* Primary waits for the secondaries to have reached OPAL */ + pnv_kexec_wait_secondaries_down(); } } #endif /* CONFIG_KEXEC */ +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +static unsigned long pnv_memory_block_size(void) +{ + return 256UL * 1024 * 1024; +} +#endif + static void __init pnv_setup_machdep_opal(void) { ppc_md.get_boot_time = opal_get_boot_time; @@ -177,6 +263,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.power_off = pnv_power_off; ppc_md.halt = pnv_halt; ppc_md.machine_check_exception = opal_machine_check; + ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; } #ifdef CONFIG_PPC_POWERNV_RTAS @@ -214,6 +301,25 @@ static int __init pnv_probe(void) return 1; } +/* + * Returns the cpu frequency for 'cpu' in Hz. This is used by + * /proc/cpuinfo + */ +unsigned long pnv_get_proc_freq(unsigned int cpu) +{ + unsigned long ret_freq; + + ret_freq = cpufreq_quick_get(cpu) * 1000ul; + + /* + * If the backend cpufreq driver does not exist, + * then fallback to old way of reporting the clockrate. + */ + if (!ret_freq) + ret_freq = ppc_proc_freq; + return ret_freq; +} + define_machine(powernv) { .name = "PowerNV", .probe = pnv_probe, @@ -221,11 +327,16 @@ define_machine(powernv) { .setup_arch = pnv_setup_arch, .init_IRQ = pnv_init_IRQ, .show_cpuinfo = pnv_show_cpuinfo, + .get_proc_freq = pnv_get_proc_freq, .progress = pnv_progress, .machine_shutdown = pnv_shutdown, .power_save = power7_idle, .calibrate_decr = generic_calibrate_decr, + .dma_set_mask = pnv_dma_set_mask, #ifdef CONFIG_KEXEC .kexec_cpu_down = pnv_kexec_cpu_down, #endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE + .memory_block_size = pnv_memory_block_size, +#endif }; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 908672bdcea6..5fcfcf44e3a9 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -30,6 +30,9 @@ #include <asm/cputhreads.h> #include <asm/xics.h> #include <asm/opal.h> +#include <asm/runlatch.h> +#include <asm/code-patching.h> +#include <asm/dbell.h> #include "powernv.h" @@ -44,13 +47,18 @@ static void pnv_smp_setup_cpu(int cpu) { if (cpu != boot_cpuid) xics_setup_cpu(); + +#ifdef CONFIG_PPC_DOORBELL + if (cpu_has_feature(CPU_FTR_DBELL)) + doorbell_setup_this_cpu(); +#endif } int pnv_smp_kick_cpu(int nr) { unsigned int pcpu = get_hard_smp_processor_id(nr); - unsigned long start_here = __pa(*((unsigned long *) - generic_secondary_smp_init)); + unsigned long start_here = + __pa(ppc_function_entry(generic_secondary_smp_init)); long rc; BUG_ON(nr < 0 || nr >= NR_CPUS); @@ -156,16 +164,20 @@ static void pnv_smp_cpu_kill_self(void) */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); while (!generic_check_cpu_restart(cpu)) { - power7_nap(); - if (!generic_check_cpu_restart(cpu)) { + ppc64_runlatch_off(); + power7_nap(1); + ppc64_runlatch_on(); + + /* Reenable IRQs briefly to clear the IPI that woke us */ + local_irq_enable(); + local_irq_disable(); + mb(); + + if (cpu_core_split_required()) + continue; + + if (!generic_check_cpu_restart(cpu)) DBG("CPU%d Unexpected exit while offline !\n", cpu); - /* We may be getting an IPI, so we re-enable - * interrupts to process it, it will be ignored - * since we aren't online (hopefully) - */ - local_irq_enable(); - local_irq_disable(); - } } mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); DBG("CPU%d coming online...\n", cpu); diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S new file mode 100644 index 000000000000..39bb24aa8f34 --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore-asm.S @@ -0,0 +1,95 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/reg.h> + +#include "subcore.h" + + +_GLOBAL(split_core_secondary_loop) + /* + * r3 = u8 *state, used throughout the routine + * r4 = temp + * r5 = temp + * .. + * r12 = MSR + */ + mfmsr r12 + + /* Disable interrupts so SRR0/1 don't get trashed */ + li r4,0 + ori r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI + andc r4,r12,r4 + sync + mtmsrd r4 + + /* Switch to real mode and leave interrupts off */ + li r5, MSR_IR|MSR_DR + andc r5, r4, r5 + + LOAD_REG_ADDR(r4, real_mode) + + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r5 + rfid + b . /* prevent speculative execution */ + +real_mode: + /* Grab values from unsplit SPRs */ + mfspr r6, SPRN_LDBAR + mfspr r7, SPRN_PMMAR + mfspr r8, SPRN_PMCR + mfspr r9, SPRN_RPR + mfspr r10, SPRN_SDR1 + + /* Order reading the SPRs vs telling the primary we are ready to split */ + sync + + /* Tell thread 0 we are in real mode */ + li r4, SYNC_STEP_REAL_MODE + stb r4, 0(r3) + + li r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest + sldi r5, r5, 48 + + /* Loop until we see the split happen in HID0 */ +1: mfspr r4, SPRN_HID0 + and. r4, r4, r5 + beq 1b + + /* + * We only need to initialise the below regs once for each subcore, + * but it's simpler and harmless to do it on each thread. + */ + + /* Make sure various SPRS have sane values */ + li r4, 0 + mtspr SPRN_LPID, r4 + mtspr SPRN_PCR, r4 + mtspr SPRN_HDEC, r4 + + /* Restore SPR values now we are split */ + mtspr SPRN_LDBAR, r6 + mtspr SPRN_PMMAR, r7 + mtspr SPRN_PMCR, r8 + mtspr SPRN_RPR, r9 + mtspr SPRN_SDR1, r10 + + LOAD_REG_ADDR(r5, virtual_mode) + + /* Get out of real mode */ + mtspr SPRN_SRR0,r5 + mtspr SPRN_SRR1,r12 + rfid + b . /* prevent speculative execution */ + +virtual_mode: + blr diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c new file mode 100644 index 000000000000..894ecb3eb596 --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -0,0 +1,392 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "powernv: " fmt + +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/device.h> +#include <linux/gfp.h> +#include <linux/smp.h> +#include <linux/stop_machine.h> + +#include <asm/cputhreads.h> +#include <asm/kvm_ppc.h> +#include <asm/machdep.h> +#include <asm/opal.h> +#include <asm/smp.h> + +#include "subcore.h" + + +/* + * Split/unsplit procedure: + * + * A core can be in one of three states, unsplit, 2-way split, and 4-way split. + * + * The mapping to subcores_per_core is simple: + * + * State | subcores_per_core + * ------------|------------------ + * Unsplit | 1 + * 2-way split | 2 + * 4-way split | 4 + * + * The core is split along thread boundaries, the mapping between subcores and + * threads is as follows: + * + * Unsplit: + * ---------------------------- + * Subcore | 0 | + * ---------------------------- + * Thread | 0 1 2 3 4 5 6 7 | + * ---------------------------- + * + * 2-way split: + * ------------------------------------- + * Subcore | 0 | 1 | + * ------------------------------------- + * Thread | 0 1 2 3 | 4 5 6 7 | + * ------------------------------------- + * + * 4-way split: + * ----------------------------------------- + * Subcore | 0 | 1 | 2 | 3 | + * ----------------------------------------- + * Thread | 0 1 | 2 3 | 4 5 | 6 7 | + * ----------------------------------------- + * + * + * Transitions + * ----------- + * + * It is not possible to transition between either of the split states, the + * core must first be unsplit. The legal transitions are: + * + * ----------- --------------- + * | | <----> | 2-way split | + * | | --------------- + * | Unsplit | + * | | --------------- + * | | <----> | 4-way split | + * ----------- --------------- + * + * Unsplitting + * ----------- + * + * Unsplitting is the simpler procedure. It requires thread 0 to request the + * unsplit while all other threads NAP. + * + * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells + * the hardware that if all threads except 0 are napping, the hardware should + * unsplit the core. + * + * Non-zero threads are sent to a NAP loop, they don't exit the loop until they + * see the core unsplit. + * + * Core 0 spins waiting for the hardware to see all the other threads napping + * and perform the unsplit. + * + * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them + * out of NAP. They will then see the core unsplit and exit the NAP loop. + * + * Splitting + * --------- + * + * The basic splitting procedure is fairly straight forward. However it is + * complicated by the fact that after the split occurs, the newly created + * subcores are not in a fully initialised state. + * + * Most notably the subcores do not have the correct value for SDR1, which + * means they must not be running in virtual mode when the split occurs. The + * subcores have separate timebases SPRs but these are pre-synchronised by + * opal. + * + * To begin with secondary threads are sent to an assembly routine. There they + * switch to real mode, so they are immune to the uninitialised SDR1 value. + * Once in real mode they indicate that they are in real mode, and spin waiting + * to see the core split. + * + * Thread 0 waits to see that all secondaries are in real mode, and then begins + * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which + * prevents the hardware from unsplitting. Then it sets the appropriate HID bit + * to request the split, and spins waiting to see that the split has happened. + * + * Concurrently the secondaries will notice the split. When they do they set up + * their SPRs, notably SDR1, and then they can return to virtual mode and exit + * the procedure. + */ + +/* Initialised at boot by subcore_init() */ +static int subcores_per_core; + +/* + * Used to communicate to offline cpus that we want them to pop out of the + * offline loop and do a split or unsplit. + * + * 0 - no split happening + * 1 - unsplit in progress + * 2 - split to 2 in progress + * 4 - split to 4 in progress + */ +static int new_split_mode; + +static cpumask_var_t cpu_offline_mask; + +struct split_state { + u8 step; + u8 master; +}; + +static DEFINE_PER_CPU(struct split_state, split_state); + +static void wait_for_sync_step(int step) +{ + int i, cpu = smp_processor_id(); + + for (i = cpu + 1; i < cpu + threads_per_core; i++) + while(per_cpu(split_state, i).step < step) + barrier(); + + /* Order the wait loop vs any subsequent loads/stores. */ + mb(); +} + +static void unsplit_core(void) +{ + u64 hid0, mask; + int i, cpu; + + mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE; + + cpu = smp_processor_id(); + if (cpu_thread_in_core(cpu) != 0) { + while (mfspr(SPRN_HID0) & mask) + power7_nap(0); + + per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; + return; + } + + hid0 = mfspr(SPRN_HID0); + hid0 &= ~HID0_POWER8_DYNLPARDIS; + mtspr(SPRN_HID0, hid0); + + while (mfspr(SPRN_HID0) & mask) + cpu_relax(); + + /* Wake secondaries out of NAP */ + for (i = cpu + 1; i < cpu + threads_per_core; i++) + smp_send_reschedule(i); + + wait_for_sync_step(SYNC_STEP_UNSPLIT); +} + +static void split_core(int new_mode) +{ + struct { u64 value; u64 mask; } split_parms[2] = { + { HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE }, + { HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE } + }; + int i, cpu; + u64 hid0; + + /* Convert new_mode (2 or 4) into an index into our parms array */ + i = (new_mode >> 1) - 1; + BUG_ON(i < 0 || i > 1); + + cpu = smp_processor_id(); + if (cpu_thread_in_core(cpu) != 0) { + split_core_secondary_loop(&per_cpu(split_state, cpu).step); + return; + } + + wait_for_sync_step(SYNC_STEP_REAL_MODE); + + /* Write new mode */ + hid0 = mfspr(SPRN_HID0); + hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value; + mtspr(SPRN_HID0, hid0); + + /* Wait for it to happen */ + while (!(mfspr(SPRN_HID0) & split_parms[i].mask)) + cpu_relax(); +} + +static void cpu_do_split(int new_mode) +{ + /* + * At boot subcores_per_core will be 0, so we will always unsplit at + * boot. In the usual case where the core is already unsplit it's a + * nop, and this just ensures the kernel's notion of the mode is + * consistent with the hardware. + */ + if (subcores_per_core != 1) + unsplit_core(); + + if (new_mode != 1) + split_core(new_mode); + + mb(); + per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED; +} + +bool cpu_core_split_required(void) +{ + smp_rmb(); + + if (!new_split_mode) + return false; + + cpu_do_split(new_split_mode); + + return true; +} + +static int cpu_update_split_mode(void *data) +{ + int cpu, new_mode = *(int *)data; + + if (this_cpu_ptr(&split_state)->master) { + new_split_mode = new_mode; + smp_wmb(); + + cpumask_andnot(cpu_offline_mask, cpu_present_mask, + cpu_online_mask); + + /* This should work even though the cpu is offline */ + for_each_cpu(cpu, cpu_offline_mask) + smp_send_reschedule(cpu); + } + + cpu_do_split(new_mode); + + if (this_cpu_ptr(&split_state)->master) { + /* Wait for all cpus to finish before we touch subcores_per_core */ + for_each_present_cpu(cpu) { + if (cpu >= setup_max_cpus) + break; + + while(per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED) + barrier(); + } + + new_split_mode = 0; + + /* Make the new mode public */ + subcores_per_core = new_mode; + threads_per_subcore = threads_per_core / subcores_per_core; + + /* Make sure the new mode is written before we exit */ + mb(); + } + + return 0; +} + +static int set_subcores_per_core(int new_mode) +{ + struct split_state *state; + int cpu; + + if (kvm_hv_mode_active()) { + pr_err("Unable to change split core mode while KVM active.\n"); + return -EBUSY; + } + + /* + * We are only called at boot, or from the sysfs write. If that ever + * changes we'll need a lock here. + */ + BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3); + + for_each_present_cpu(cpu) { + state = &per_cpu(split_state, cpu); + state->step = SYNC_STEP_INITIAL; + state->master = 0; + } + + get_online_cpus(); + + /* This cpu will update the globals before exiting stop machine */ + this_cpu_ptr(&split_state)->master = 1; + + /* Ensure state is consistent before we call the other cpus */ + mb(); + + stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask); + + put_online_cpus(); + + return 0; +} + +static ssize_t __used store_subcores_per_core(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + unsigned long val; + int rc; + + /* We are serialised by the attribute lock */ + + rc = sscanf(buf, "%lx", &val); + if (rc != 1) + return -EINVAL; + + switch (val) { + case 1: + case 2: + case 4: + if (subcores_per_core == val) + /* Nothing to do */ + goto out; + break; + default: + return -EINVAL; + } + + rc = set_subcores_per_core(val); + if (rc) + return rc; + +out: + return count; +} + +static ssize_t show_subcores_per_core(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%x\n", subcores_per_core); +} + +static DEVICE_ATTR(subcores_per_core, 0644, + show_subcores_per_core, store_subcores_per_core); + +static int subcore_init(void) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return 0; + + /* + * We need all threads in a core to be present to split/unsplit so + * continue only if max_cpus are aligned to threads_per_core. + */ + if (setup_max_cpus % threads_per_core) + return 0; + + BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL)); + + set_subcores_per_core(1); + + return device_create_file(cpu_subsys.dev_root, + &dev_attr_subcores_per_core); +} +machine_device_initcall(powernv, subcore_init); diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h new file mode 100644 index 000000000000..148abc91debf --- /dev/null +++ b/arch/powerpc/platforms/powernv/subcore.h @@ -0,0 +1,18 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* These are ordered and tested with <= */ +#define SYNC_STEP_INITIAL 0 +#define SYNC_STEP_UNSPLIT 1 /* Set by secondary when it sees unsplit */ +#define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */ +#define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */ + +#ifndef __ASSEMBLY__ +void split_core_secondary_loop(u8 *state); +#endif |