Diffstat (limited to 'arch/powerpc/platforms')
-rw-r--r--  arch/powerpc/platforms/40x/Kconfig | 7
-rw-r--r--  arch/powerpc/platforms/44x/Kconfig | 18
-rw-r--r--  arch/powerpc/platforms/4xx/Makefile | 1
-rw-r--r--  arch/powerpc/platforms/4xx/ocm.c | 390
-rw-r--r--  arch/powerpc/platforms/4xx/uic.c | 1
-rw-r--r--  arch/powerpc/platforms/85xx/Kconfig | 8
-rw-r--r--  arch/powerpc/platforms/86xx/Kconfig | 6
-rw-r--r--  arch/powerpc/platforms/8xx/Kconfig | 7
-rw-r--r--  arch/powerpc/platforms/8xx/Makefile | 2
-rw-r--r--  arch/powerpc/platforms/8xx/cpm1.c | 790
-rw-r--r--  arch/powerpc/platforms/8xx/micropatch.c | 378
-rw-r--r--  arch/powerpc/platforms/Kconfig | 3
-rw-r--r--  arch/powerpc/platforms/Kconfig.cputype | 18
-rw-r--r--  arch/powerpc/platforms/cell/iommu.c | 2
-rw-r--r--  arch/powerpc/platforms/cell/spufs/fault.c | 9
-rw-r--r--  arch/powerpc/platforms/cell/spufs/file.c | 2
-rw-r--r--  arch/powerpc/platforms/cell/spufs/inode.c | 207
-rw-r--r--  arch/powerpc/platforms/cell/spufs/run.c | 2
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c | 2
-rw-r--r--  arch/powerpc/platforms/maple/Kconfig | 2
-rw-r--r--  arch/powerpc/platforms/pasemi/iommu.c | 2
-rw-r--r--  arch/powerpc/platforms/powermac/sleep.S | 68
-rw-r--r--  arch/powerpc/platforms/powernv/Kconfig | 5
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile | 6
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-powernv.c | 101
-rw-r--r--  arch/powerpc/platforms/powernv/idle.c | 12
-rw-r--r--  arch/powerpc/platforms/powernv/memtrace.c | 23
-rw-r--r--  arch/powerpc/platforms/powernv/npu-dma.c | 672
-rw-r--r--  arch/powerpc/platforms/powernv/opal-call.c | 6
-rw-r--r--  arch/powerpc/platforms/powernv/opal-core.c | 636
-rw-r--r--  arch/powerpc/platforms/powernv/opal-fadump.c | 716
-rw-r--r--  arch/powerpc/platforms/powernv/opal-fadump.h | 146
-rw-r--r--  arch/powerpc/platforms/powernv/opal-hmi.c | 40
-rw-r--r--  arch/powerpc/platforms/powernv/opal-imc.c | 12
-rw-r--r--  arch/powerpc/platforms/powernv/opal-msglog.c | 57
-rw-r--r--  arch/powerpc/platforms/powernv/opal-prd.c | 8
-rw-r--r--  arch/powerpc/platforms/powernv/opal-xscom.c | 213
-rw-r--r--  arch/powerpc/platforms/powernv/opal.c | 65
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 38
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c | 112
-rw-r--r--  arch/powerpc/platforms/powernv/pci.c | 148
-rw-r--r--  arch/powerpc/platforms/powernv/pci.h | 8
-rw-r--r--  arch/powerpc/platforms/powernv/powernv.h | 5
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c | 9
-rw-r--r--  arch/powerpc/platforms/powernv/ultravisor.c | 69
-rw-r--r--  arch/powerpc/platforms/powernv/vas-window.c | 19
-rw-r--r--  arch/powerpc/platforms/powernv/vas.h | 20
-rw-r--r--  arch/powerpc/platforms/ps3/spu.c | 10
-rw-r--r--  arch/powerpc/platforms/ps3/system-bus.c | 11
-rw-r--r--  arch/powerpc/platforms/pseries/Kconfig | 33
-rw-r--r--  arch/powerpc/platforms/pseries/Makefile | 3
-rw-r--r--  arch/powerpc/platforms/pseries/dlpar.c | 12
-rw-r--r--  arch/powerpc/platforms/pseries/dtl.c | 23
-rw-r--r--  arch/powerpc/platforms/pseries/eeh_pseries.c | 68
-rw-r--r--  arch/powerpc/platforms/pseries/hotplug-memory.c | 29
-rw-r--r--  arch/powerpc/platforms/pseries/hvconsole.c | 2
-rw-r--r--  arch/powerpc/platforms/pseries/ibmebus.c | 4
-rw-r--r--  arch/powerpc/platforms/pseries/iommu.c | 24
-rw-r--r--  arch/powerpc/platforms/pseries/lpar.c | 623
-rw-r--r--  arch/powerpc/platforms/pseries/mobility.c | 28
-rw-r--r--  arch/powerpc/platforms/pseries/papr_scm.c | 196
-rw-r--r--  arch/powerpc/platforms/pseries/pci.c | 3
-rw-r--r--  arch/powerpc/platforms/pseries/ras.c | 460
-rw-r--r--  arch/powerpc/platforms/pseries/rtas-fadump.c | 550
-rw-r--r--  arch/powerpc/platforms/pseries/rtas-fadump.h | 114
-rw-r--r--  arch/powerpc/platforms/pseries/setup.c | 71
-rw-r--r--  arch/powerpc/platforms/pseries/smp.c | 3
-rw-r--r--  arch/powerpc/platforms/pseries/svm.c | 85
-rw-r--r--  arch/powerpc/platforms/pseries/vio.c | 8
-rw-r--r--  arch/powerpc/platforms/pseries/vphn.c | 89
70 files changed, 5440 insertions, 2080 deletions
diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig
index ad2bb1408b4c..6da813b65b42 100644
--- a/arch/powerpc/platforms/40x/Kconfig
+++ b/arch/powerpc/platforms/40x/Kconfig
@@ -16,12 +16,12 @@ config EP405
This option enables support for the EP405/EP405PC boards.
config HOTFOOT
- bool "Hotfoot"
+ bool "Hotfoot"
depends on 40x
select PPC40x_SIMPLE
select FORCE_PCI
- help
- This option enables support for the ESTEEM 195E Hotfoot board.
+ help
+ This option enables support for the ESTEEM 195E Hotfoot board.
config KILAUEA
bool "Kilauea"
@@ -80,7 +80,6 @@ config OBS600
help
This option enables support for PlatHome OpenBlockS 600 server
-
config PPC40x_SIMPLE
bool "Simple PowerPC 40x board support"
depends on 40x
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 35be81fd2dc2..25ebe634a661 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -40,12 +40,12 @@ config EBONY
This option enables support for the IBM PPC440GP evaluation board.
config SAM440EP
- bool "Sam440ep"
+ bool "Sam440ep"
depends on 44x
- select 440EP
- select FORCE_PCI
- help
- This option enables support for the ACube Sam440ep board.
+ select 440EP
+ select FORCE_PCI
+ help
+ This option enables support for the ACube Sam440ep board.
config SEQUOIA
bool "Sequoia"
@@ -272,14 +272,6 @@ config PPC4xx_GPIO
help
Enable gpiolib support for ppc440 based boards
-config PPC4xx_OCM
- bool "PPC4xx On Chip Memory (OCM) support"
- depends on 4xx
- select PPC_LIB_RHEAP
- help
- Enable OCM support for PowerPC 4xx platforms with on chip memory,
- OCM provides the fast place for memory access to improve performance.
-
# 44x specific CPU modules, selected based on the board above.
config 440EP
bool
diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile
index f5ae27ca131b..d009d2e0b9e8 100644
--- a/arch/powerpc/platforms/4xx/Makefile
+++ b/arch/powerpc/platforms/4xx/Makefile
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y += uic.o machine_check.o
-obj-$(CONFIG_PPC4xx_OCM) += ocm.o
obj-$(CONFIG_4xx_SOC) += soc.o
obj-$(CONFIG_PCI) += pci.o
obj-$(CONFIG_PPC4xx_HSTA_MSI) += hsta_msi.o
diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c
deleted file mode 100644
index ba3257406ced..000000000000
--- a/arch/powerpc/platforms/4xx/ocm.c
+++ /dev/null
@@ -1,390 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * PowerPC 4xx OCM memory allocation support
- *
- * (C) Copyright 2009, Applied Micro Circuits Corporation
- * Victor Gallardo (vgallardo@amcc.com)
- *
- * See file CREDITS for list of people who contributed to this
- * project.
- */
-
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <asm/rheap.h>
-#include <asm/ppc4xx_ocm.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-
-#define OCM_DISABLED 0
-#define OCM_ENABLED 1
-
-struct ocm_block {
- struct list_head list;
- void __iomem *addr;
- int size;
- const char *owner;
-};
-
-/* non-cached or cached region */
-struct ocm_region {
- phys_addr_t phys;
- void __iomem *virt;
-
- int memtotal;
- int memfree;
-
- rh_info_t *rh;
- struct list_head list;
-};
-
-struct ocm_info {
- int index;
- int status;
- int ready;
-
- phys_addr_t phys;
-
- int alignment;
- int memtotal;
- int cache_size;
-
- struct ocm_region nc; /* non-cached region */
- struct ocm_region c; /* cached region */
-};
-
-static struct ocm_info *ocm_nodes;
-static int ocm_count;
-
-static struct ocm_info *ocm_get_node(unsigned int index)
-{
- if (index >= ocm_count) {
- printk(KERN_ERR "PPC4XX OCM: invalid index");
- return NULL;
- }
-
- return &ocm_nodes[index];
-}
-
-static int ocm_free_region(struct ocm_region *ocm_reg, const void *addr)
-{
- struct ocm_block *blk, *tmp;
- unsigned long offset;
-
- if (!ocm_reg->virt)
- return 0;
-
- list_for_each_entry_safe(blk, tmp, &ocm_reg->list, list) {
- if (blk->addr == addr) {
- offset = addr - ocm_reg->virt;
- ocm_reg->memfree += blk->size;
- rh_free(ocm_reg->rh, offset);
- list_del(&blk->list);
- kfree(blk);
- return 1;
- }
- }
-
- return 0;
-}
-
-static void __init ocm_init_node(int count, struct device_node *node)
-{
- struct ocm_info *ocm;
-
- const unsigned int *cell_index;
- const unsigned int *cache_size;
- int len;
-
- struct resource rsrc;
-
- ocm = ocm_get_node(count);
-
- cell_index = of_get_property(node, "cell-index", &len);
- if (!cell_index) {
- printk(KERN_ERR "PPC4XX OCM: missing cell-index property");
- return;
- }
- ocm->index = *cell_index;
-
- if (of_device_is_available(node))
- ocm->status = OCM_ENABLED;
-
- cache_size = of_get_property(node, "cached-region-size", &len);
- if (cache_size)
- ocm->cache_size = *cache_size;
-
- if (of_address_to_resource(node, 0, &rsrc)) {
- printk(KERN_ERR "PPC4XX OCM%d: could not get resource address\n",
- ocm->index);
- return;
- }
-
- ocm->phys = rsrc.start;
- ocm->memtotal = (rsrc.end - rsrc.start + 1);
-
- printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (%s)\n",
- ocm->index, ocm->memtotal,
- (ocm->status == OCM_DISABLED) ? "disabled" : "enabled");
-
- if (ocm->status == OCM_DISABLED)
- return;
-
- /* request region */
-
- if (!request_mem_region(ocm->phys, ocm->memtotal, "ppc4xx_ocm")) {
- printk(KERN_ERR "PPC4XX OCM%d: could not request region\n",
- ocm->index);
- return;
- }
-
- /* Configure non-cached and cached regions */
-
- ocm->nc.phys = ocm->phys;
- ocm->nc.memtotal = ocm->memtotal - ocm->cache_size;
- ocm->nc.memfree = ocm->nc.memtotal;
-
- ocm->c.phys = ocm->phys + ocm->nc.memtotal;
- ocm->c.memtotal = ocm->cache_size;
- ocm->c.memfree = ocm->c.memtotal;
-
- if (ocm->nc.memtotal == 0)
- ocm->nc.phys = 0;
-
- if (ocm->c.memtotal == 0)
- ocm->c.phys = 0;
-
- printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (non-cached)\n",
- ocm->index, ocm->nc.memtotal);
-
- printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (cached)\n",
- ocm->index, ocm->c.memtotal);
-
- /* ioremap the non-cached region */
- if (ocm->nc.memtotal) {
- ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal,
- _PAGE_EXEC | pgprot_val(PAGE_KERNEL_NCG));
-
- if (!ocm->nc.virt) {
- printk(KERN_ERR
- "PPC4XX OCM%d: failed to ioremap non-cached memory\n",
- ocm->index);
- ocm->nc.memfree = 0;
- return;
- }
- }
-
- /* ioremap the cached region */
-
- if (ocm->c.memtotal) {
- ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal,
- _PAGE_EXEC | pgprot_val(PAGE_KERNEL));
-
- if (!ocm->c.virt) {
- printk(KERN_ERR
- "PPC4XX OCM%d: failed to ioremap cached memory\n",
- ocm->index);
- ocm->c.memfree = 0;
- return;
- }
- }
-
- /* Create Remote Heaps */
-
- ocm->alignment = 4; /* default 4 byte alignment */
-
- if (ocm->nc.virt) {
- ocm->nc.rh = rh_create(ocm->alignment);
- rh_attach_region(ocm->nc.rh, 0, ocm->nc.memtotal);
- }
-
- if (ocm->c.virt) {
- ocm->c.rh = rh_create(ocm->alignment);
- rh_attach_region(ocm->c.rh, 0, ocm->c.memtotal);
- }
-
- INIT_LIST_HEAD(&ocm->nc.list);
- INIT_LIST_HEAD(&ocm->c.list);
-
- ocm->ready = 1;
-}
-
-static int ocm_debugfs_show(struct seq_file *m, void *v)
-{
- struct ocm_block *blk, *tmp;
- unsigned int i;
-
- for (i = 0; i < ocm_count; i++) {
- struct ocm_info *ocm = ocm_get_node(i);
-
- if (!ocm || !ocm->ready)
- continue;
-
- seq_printf(m, "PPC4XX OCM : %d\n", ocm->index);
- seq_printf(m, "PhysAddr : %pa\n", &(ocm->phys));
- seq_printf(m, "MemTotal : %d Bytes\n", ocm->memtotal);
- seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal);
- seq_printf(m, "MemTotal(C) : %d Bytes\n\n", ocm->c.memtotal);
-
- seq_printf(m, "NC.PhysAddr : %pa\n", &(ocm->nc.phys));
- seq_printf(m, "NC.VirtAddr : 0x%p\n", ocm->nc.virt);
- seq_printf(m, "NC.MemTotal : %d Bytes\n", ocm->nc.memtotal);
- seq_printf(m, "NC.MemFree : %d Bytes\n", ocm->nc.memfree);
-
- list_for_each_entry_safe(blk, tmp, &ocm->nc.list, list) {
- seq_printf(m, "NC.MemUsed : %d Bytes (%s)\n",
- blk->size, blk->owner);
- }
-
- seq_printf(m, "\nC.PhysAddr : %pa\n", &(ocm->c.phys));
- seq_printf(m, "C.VirtAddr : 0x%p\n", ocm->c.virt);
- seq_printf(m, "C.MemTotal : %d Bytes\n", ocm->c.memtotal);
- seq_printf(m, "C.MemFree : %d Bytes\n", ocm->c.memfree);
-
- list_for_each_entry_safe(blk, tmp, &ocm->c.list, list) {
- seq_printf(m, "C.MemUsed : %d Bytes (%s)\n",
- blk->size, blk->owner);
- }
-
- seq_putc(m, '\n');
- }
-
- return 0;
-}
-
-static int ocm_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ocm_debugfs_show, NULL);
-}
-
-static const struct file_operations ocm_debugfs_fops = {
- .open = ocm_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int ocm_debugfs_init(void)
-{
- struct dentry *junk;
-
- junk = debugfs_create_dir("ppc4xx_ocm", 0);
- if (!junk) {
- printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create dir\n");
- return -1;
- }
-
- if (debugfs_create_file("info", 0644, junk, NULL, &ocm_debugfs_fops)) {
- printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create file\n");
- return -1;
- }
-
- return 0;
-}
-
-void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
- int flags, const char *owner)
-{
- void __iomem *addr = NULL;
- unsigned long offset;
- struct ocm_info *ocm;
- struct ocm_region *ocm_reg;
- struct ocm_block *ocm_blk;
- int i;
-
- for (i = 0; i < ocm_count; i++) {
- ocm = ocm_get_node(i);
-
- if (!ocm || !ocm->ready)
- continue;
-
- if (flags == PPC4XX_OCM_NON_CACHED)
- ocm_reg = &ocm->nc;
- else
- ocm_reg = &ocm->c;
-
- if (!ocm_reg->virt)
- continue;
-
- if (align < ocm->alignment)
- align = ocm->alignment;
-
- offset = rh_alloc_align(ocm_reg->rh, size, align, NULL);
-
- if (IS_ERR_VALUE(offset))
- continue;
-
- ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL);
- if (!ocm_blk) {
- rh_free(ocm_reg->rh, offset);
- break;
- }
-
- *phys = ocm_reg->phys + offset;
- addr = ocm_reg->virt + offset;
- size = ALIGN(size, align);
-
- ocm_blk->addr = addr;
- ocm_blk->size = size;
- ocm_blk->owner = owner;
- list_add_tail(&ocm_blk->list, &ocm_reg->list);
-
- ocm_reg->memfree -= size;
-
- break;
- }
-
- return addr;
-}
-
-void ppc4xx_ocm_free(const void *addr)
-{
- int i;
-
- if (!addr)
- return;
-
- for (i = 0; i < ocm_count; i++) {
- struct ocm_info *ocm = ocm_get_node(i);
-
- if (!ocm || !ocm->ready)
- continue;
-
- if (ocm_free_region(&ocm->nc, addr) ||
- ocm_free_region(&ocm->c, addr))
- return;
- }
-}
-
-static int __init ppc4xx_ocm_init(void)
-{
- struct device_node *np;
- int count;
-
- count = 0;
- for_each_compatible_node(np, NULL, "ibm,ocm")
- count++;
-
- if (!count)
- return 0;
-
- ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL);
- if (!ocm_nodes)
- return -ENOMEM;
-
- ocm_count = count;
- count = 0;
-
- for_each_compatible_node(np, NULL, "ibm,ocm") {
- ocm_init_node(count, np);
- count++;
- }
-
- ocm_debugfs_init();
-
- return 0;
-}
-
-arch_initcall(ppc4xx_ocm_init);
diff --git a/arch/powerpc/platforms/4xx/uic.c b/arch/powerpc/platforms/4xx/uic.c
index 31f12ad37a98..36fb66ce54cf 100644
--- a/arch/powerpc/platforms/4xx/uic.c
+++ b/arch/powerpc/platforms/4xx/uic.c
@@ -154,6 +154,7 @@ static int uic_set_irq_type(struct irq_data *d, unsigned int flow_type)
mtdcr(uic->dcrbase + UIC_PR, pr);
mtdcr(uic->dcrbase + UIC_TR, tr);
+ mtdcr(uic->dcrbase + UIC_SR, ~mask);
raw_spin_unlock_irqrestore(&uic->lock, flags);
diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index d1af0ee2f8c8..fa3d29dcb57e 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -147,10 +147,10 @@ config SOCRATES
This option enables support for the Socrates board.
config KSI8560
- bool "Emerson KSI8560"
- select DEFAULT_UIMAGE
- help
- This option enables support for the Emerson KSI8560 board
+ bool "Emerson KSI8560"
+ select DEFAULT_UIMAGE
+ help
+ This option enables support for the Emerson KSI8560 board
config XES_MPC85xx
bool "X-ES single-board computer"
diff --git a/arch/powerpc/platforms/86xx/Kconfig b/arch/powerpc/platforms/86xx/Kconfig
index 0a610114bc38..07a9d60c618a 100644
--- a/arch/powerpc/platforms/86xx/Kconfig
+++ b/arch/powerpc/platforms/86xx/Kconfig
@@ -62,9 +62,9 @@ config GEF_SBC610
This option enables support for the GE SBC610.
config MVME7100
- bool "Artesyn MVME7100"
- help
- This option enables support for the Emerson/Artesyn MVME7100 board.
+ bool "Artesyn MVME7100"
+ help
+ This option enables support for the Emerson/Artesyn MVME7100 board.
endif
diff --git a/arch/powerpc/platforms/8xx/Kconfig b/arch/powerpc/platforms/8xx/Kconfig
index d408162d5af4..e0fe670f06f6 100644
--- a/arch/powerpc/platforms/8xx/Kconfig
+++ b/arch/powerpc/platforms/8xx/Kconfig
@@ -157,6 +157,13 @@ config I2C_SPI_SMC1_UCODE_PATCH
help
Help not implemented yet, coming soon.
+config SMC_UCODE_PATCH
+ bool "SMC relocation patch"
+ help
+ This microcode relocates SMC1 and SMC2 parameter RAMs at
+ offset 0x1ec0 and 0x1fc0 to allow extended parameter RAM
+ for SCC3 and SCC4.
+
endchoice
config UCODE_PATCH
diff --git a/arch/powerpc/platforms/8xx/Makefile b/arch/powerpc/platforms/8xx/Makefile
index 708ab099e886..27a7c6f828e0 100644
--- a/arch/powerpc/platforms/8xx/Makefile
+++ b/arch/powerpc/platforms/8xx/Makefile
@@ -3,6 +3,8 @@
# Makefile for the PowerPC 8xx linux kernel.
#
obj-y += m8xx_setup.o machine_check.o pic.o
+obj-$(CONFIG_CPM1) += cpm1.o
+obj-$(CONFIG_UCODE_PATCH) += micropatch.o
obj-$(CONFIG_MPC885ADS) += mpc885ads_setup.o
obj-$(CONFIG_MPC86XADS) += mpc86xads_setup.o
obj-$(CONFIG_PPC_EP88XC) += ep88xc.o
diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c
new file mode 100644
index 000000000000..0f65c51271db
--- /dev/null
+++ b/arch/powerpc/platforms/8xx/cpm1.c
@@ -0,0 +1,790 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * General Purpose functions for the global management of the
+ * Communication Processor Module.
+ * Copyright (c) 1997 Dan error_act (dmalek@jlc.net)
+ *
+ * In addition to the individual control of the communication
+ * channels, there are a few functions that globally affect the
+ * communication processor.
+ *
+ * Buffer descriptors must be allocated from the dual ported memory
+ * space. The allocator for that is here. When the communication
+ * process is reset, we reclaim the memory available. There is
+ * currently no deallocator for this memory.
+ * The amount of space available is platform dependent. On the
+ * MBX, the EPPC software loads additional microcode into the
+ * communication processor, and uses some of the DP ram for this
+ * purpose. Current, the first 512 bytes and the last 256 bytes of
+ * memory are used. Right now I am conservative and only use the
+ * memory that can never be used for microcode. If there are
+ * applications that require more DP ram, we can expand the boundaries
+ * but then we have to be careful of any downloaded microcode.
+ */
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/8xx_immap.h>
+#include <asm/cpm1.h>
+#include <asm/io.h>
+#include <asm/rheap.h>
+#include <asm/prom.h>
+#include <asm/cpm.h>
+
+#include <asm/fs_pd.h>
+
+#ifdef CONFIG_8xx_GPIO
+#include <linux/of_gpio.h>
+#endif
+
+#define CPM_MAP_SIZE (0x4000)
+
+cpm8xx_t __iomem *cpmp; /* Pointer to comm processor space */
+immap_t __iomem *mpc8xx_immr;
+static cpic8xx_t __iomem *cpic_reg;
+
+static struct irq_domain *cpm_pic_host;
+
+static void cpm_mask_irq(struct irq_data *d)
+{
+ unsigned int cpm_vec = (unsigned int)irqd_to_hwirq(d);
+
+ clrbits32(&cpic_reg->cpic_cimr, (1 << cpm_vec));
+}
+
+static void cpm_unmask_irq(struct irq_data *d)
+{
+ unsigned int cpm_vec = (unsigned int)irqd_to_hwirq(d);
+
+ setbits32(&cpic_reg->cpic_cimr, (1 << cpm_vec));
+}
+
+static void cpm_end_irq(struct irq_data *d)
+{
+ unsigned int cpm_vec = (unsigned int)irqd_to_hwirq(d);
+
+ out_be32(&cpic_reg->cpic_cisr, (1 << cpm_vec));
+}
+
+static struct irq_chip cpm_pic = {
+ .name = "CPM PIC",
+ .irq_mask = cpm_mask_irq,
+ .irq_unmask = cpm_unmask_irq,
+ .irq_eoi = cpm_end_irq,
+};
+
+int cpm_get_irq(void)
+{
+ int cpm_vec;
+
+ /*
+ * Get the vector by setting the ACK bit and then reading
+ * the register.
+ */
+ out_be16(&cpic_reg->cpic_civr, 1);
+ cpm_vec = in_be16(&cpic_reg->cpic_civr);
+ cpm_vec >>= 11;
+
+ return irq_linear_revmap(cpm_pic_host, cpm_vec);
+}
+
+static int cpm_pic_host_map(struct irq_domain *h, unsigned int virq,
+ irq_hw_number_t hw)
+{
+ pr_debug("cpm_pic_host_map(%d, 0x%lx)\n", virq, hw);
+
+ irq_set_status_flags(virq, IRQ_LEVEL);
+ irq_set_chip_and_handler(virq, &cpm_pic, handle_fasteoi_irq);
+ return 0;
+}
+
+/*
+ * The CPM can generate the error interrupt when there is a race condition
+ * between generating and masking interrupts. All we have to do is ACK it
+ * and return. This is a no-op function so we don't need any special
+ * tests in the interrupt handler.
+ */
+static irqreturn_t cpm_error_interrupt(int irq, void *dev)
+{
+ return IRQ_HANDLED;
+}
+
+static struct irqaction cpm_error_irqaction = {
+ .handler = cpm_error_interrupt,
+ .flags = IRQF_NO_THREAD,
+ .name = "error",
+};
+
+static const struct irq_domain_ops cpm_pic_host_ops = {
+ .map = cpm_pic_host_map,
+};
+
+unsigned int cpm_pic_init(void)
+{
+ struct device_node *np = NULL;
+ struct resource res;
+ unsigned int sirq = 0, hwirq, eirq;
+ int ret;
+
+ pr_debug("cpm_pic_init\n");
+
+ np = of_find_compatible_node(NULL, NULL, "fsl,cpm1-pic");
+ if (np == NULL)
+ np = of_find_compatible_node(NULL, "cpm-pic", "CPM");
+ if (np == NULL) {
+ printk(KERN_ERR "CPM PIC init: can not find cpm-pic node\n");
+ return sirq;
+ }
+
+ ret = of_address_to_resource(np, 0, &res);
+ if (ret)
+ goto end;
+
+ cpic_reg = ioremap(res.start, resource_size(&res));
+ if (cpic_reg == NULL)
+ goto end;
+
+ sirq = irq_of_parse_and_map(np, 0);
+ if (!sirq)
+ goto end;
+
+ /* Initialize the CPM interrupt controller. */
+ hwirq = (unsigned int)virq_to_hw(sirq);
+ out_be32(&cpic_reg->cpic_cicr,
+ (CICR_SCD_SCC4 | CICR_SCC_SCC3 | CICR_SCB_SCC2 | CICR_SCA_SCC1) |
+ ((hwirq/2) << 13) | CICR_HP_MASK);
+
+ out_be32(&cpic_reg->cpic_cimr, 0);
+
+ cpm_pic_host = irq_domain_add_linear(np, 64, &cpm_pic_host_ops, NULL);
+ if (cpm_pic_host == NULL) {
+ printk(KERN_ERR "CPM2 PIC: failed to allocate irq host!\n");
+ sirq = 0;
+ goto end;
+ }
+
+ /* Install our own error handler. */
+ np = of_find_compatible_node(NULL, NULL, "fsl,cpm1");
+ if (np == NULL)
+ np = of_find_node_by_type(NULL, "cpm");
+ if (np == NULL) {
+ printk(KERN_ERR "CPM PIC init: can not find cpm node\n");
+ goto end;
+ }
+
+ eirq = irq_of_parse_and_map(np, 0);
+ if (!eirq)
+ goto end;
+
+ if (setup_irq(eirq, &cpm_error_irqaction))
+ printk(KERN_ERR "Could not allocate CPM error IRQ!");
+
+ setbits32(&cpic_reg->cpic_cicr, CICR_IEN);
+
+end:
+ of_node_put(np);
+ return sirq;
+}
+
+void __init cpm_reset(void)
+{
+ sysconf8xx_t __iomem *siu_conf;
+
+ mpc8xx_immr = ioremap(get_immrbase(), 0x4000);
+ if (!mpc8xx_immr) {
+ printk(KERN_CRIT "Could not map IMMR\n");
+ return;
+ }
+
+ cpmp = &mpc8xx_immr->im_cpm;
+
+#ifndef CONFIG_PPC_EARLY_DEBUG_CPM
+ /* Perform a reset. */
+ out_be16(&cpmp->cp_cpcr, CPM_CR_RST | CPM_CR_FLG);
+
+ /* Wait for it. */
+ while (in_be16(&cpmp->cp_cpcr) & CPM_CR_FLG);
+#endif
+
+#ifdef CONFIG_UCODE_PATCH
+ cpm_load_patch(cpmp);
+#endif
+
+ /*
+ * Set SDMA Bus Request priority 5.
+ * On 860T, this also enables FEC priority 6. I am not sure
+ * this is what we really want for some applications, but the
+ * manual recommends it.
+ * Bit 25, FAM can also be set to use FEC aggressive mode (860T).
+ */
+ siu_conf = immr_map(im_siu_conf);
+ if ((mfspr(SPRN_IMMR) & 0xffff) == 0x0900) /* MPC885 */
+ out_be32(&siu_conf->sc_sdcr, 0x40);
+ else
+ out_be32(&siu_conf->sc_sdcr, 1);
+ immr_unmap(siu_conf);
+}
+
+static DEFINE_SPINLOCK(cmd_lock);
+
+#define MAX_CR_CMD_LOOPS 10000
+
+int cpm_command(u32 command, u8 opcode)
+{
+ int i, ret;
+ unsigned long flags;
+
+ if (command & 0xffffff0f)
+ return -EINVAL;
+
+ spin_lock_irqsave(&cmd_lock, flags);
+
+ ret = 0;
+ out_be16(&cpmp->cp_cpcr, command | CPM_CR_FLG | (opcode << 8));
+ for (i = 0; i < MAX_CR_CMD_LOOPS; i++)
+ if ((in_be16(&cpmp->cp_cpcr) & CPM_CR_FLG) == 0)
+ goto out;
+
+ printk(KERN_ERR "%s(): Not able to issue CPM command\n", __func__);
+ ret = -EIO;
+out:
+ spin_unlock_irqrestore(&cmd_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(cpm_command);
+
+/*
+ * Set a baud rate generator. This needs lots of work. There are
+ * four BRGs, any of which can be wired to any channel.
+ * The internal baud rate clock is the system clock divided by 16.
+ * This assumes the baudrate is 16x oversampled by the uart.
+ */
+#define BRG_INT_CLK (get_brgfreq())
+#define BRG_UART_CLK (BRG_INT_CLK/16)
+#define BRG_UART_CLK_DIV16 (BRG_UART_CLK/16)
+
+void
+cpm_setbrg(uint brg, uint rate)
+{
+ u32 __iomem *bp;
+
+ /* This is good enough to get SMCs running..... */
+ bp = &cpmp->cp_brgc1;
+ bp += brg;
+ /*
+ * The BRG has a 12-bit counter. For really slow baud rates (or
+ * really fast processors), we may have to further divide by 16.
+ */
+ if (((BRG_UART_CLK / rate) - 1) < 4096)
+ out_be32(bp, (((BRG_UART_CLK / rate) - 1) << 1) | CPM_BRG_EN);
+ else
+ out_be32(bp, (((BRG_UART_CLK_DIV16 / rate) - 1) << 1) |
+ CPM_BRG_EN | CPM_BRG_DIV16);
+}
+
+struct cpm_ioport16 {
+ __be16 dir, par, odr_sor, dat, intr;
+ __be16 res[3];
+};
+
+struct cpm_ioport32b {
+ __be32 dir, par, odr, dat;
+};
+
+struct cpm_ioport32e {
+ __be32 dir, par, sor, odr, dat;
+};
+
+static void cpm1_set_pin32(int port, int pin, int flags)
+{
+ struct cpm_ioport32e __iomem *iop;
+ pin = 1 << (31 - pin);
+
+ if (port == CPM_PORTB)
+ iop = (struct cpm_ioport32e __iomem *)
+ &mpc8xx_immr->im_cpm.cp_pbdir;
+ else
+ iop = (struct cpm_ioport32e __iomem *)
+ &mpc8xx_immr->im_cpm.cp_pedir;
+
+ if (flags & CPM_PIN_OUTPUT)
+ setbits32(&iop->dir, pin);
+ else
+ clrbits32(&iop->dir, pin);
+
+ if (!(flags & CPM_PIN_GPIO))
+ setbits32(&iop->par, pin);
+ else
+ clrbits32(&iop->par, pin);
+
+ if (port == CPM_PORTB) {
+ if (flags & CPM_PIN_OPENDRAIN)
+ setbits16(&mpc8xx_immr->im_cpm.cp_pbodr, pin);
+ else
+ clrbits16(&mpc8xx_immr->im_cpm.cp_pbodr, pin);
+ }
+
+ if (port == CPM_PORTE) {
+ if (flags & CPM_PIN_SECONDARY)
+ setbits32(&iop->sor, pin);
+ else
+ clrbits32(&iop->sor, pin);
+
+ if (flags & CPM_PIN_OPENDRAIN)
+ setbits32(&mpc8xx_immr->im_cpm.cp_peodr, pin);
+ else
+ clrbits32(&mpc8xx_immr->im_cpm.cp_peodr, pin);
+ }
+}
+
+static void cpm1_set_pin16(int port, int pin, int flags)
+{
+ struct cpm_ioport16 __iomem *iop =
+ (struct cpm_ioport16 __iomem *)&mpc8xx_immr->im_ioport;
+
+ pin = 1 << (15 - pin);
+
+ if (port != 0)
+ iop += port - 1;
+
+ if (flags & CPM_PIN_OUTPUT)
+ setbits16(&iop->dir, pin);
+ else
+ clrbits16(&iop->dir, pin);
+
+ if (!(flags & CPM_PIN_GPIO))
+ setbits16(&iop->par, pin);
+ else
+ clrbits16(&iop->par, pin);
+
+ if (port == CPM_PORTA) {
+ if (flags & CPM_PIN_OPENDRAIN)
+ setbits16(&iop->odr_sor, pin);
+ else
+ clrbits16(&iop->odr_sor, pin);
+ }
+ if (port == CPM_PORTC) {
+ if (flags & CPM_PIN_SECONDARY)
+ setbits16(&iop->odr_sor, pin);
+ else
+ clrbits16(&iop->odr_sor, pin);
+ if (flags & CPM_PIN_FALLEDGE)
+ setbits16(&iop->intr, pin);
+ else
+ clrbits16(&iop->intr, pin);
+ }
+}
+
+void cpm1_set_pin(enum cpm_port port, int pin, int flags)
+{
+ if (port == CPM_PORTB || port == CPM_PORTE)
+ cpm1_set_pin32(port, pin, flags);
+ else
+ cpm1_set_pin16(port, pin, flags);
+}
+
+int cpm1_clk_setup(enum cpm_clk_target target, int clock, int mode)
+{
+ int shift;
+ int i, bits = 0;
+ u32 __iomem *reg;
+ u32 mask = 7;
+
+ u8 clk_map[][3] = {
+ {CPM_CLK_SCC1, CPM_BRG1, 0},
+ {CPM_CLK_SCC1, CPM_BRG2, 1},
+ {CPM_CLK_SCC1, CPM_BRG3, 2},
+ {CPM_CLK_SCC1, CPM_BRG4, 3},
+ {CPM_CLK_SCC1, CPM_CLK1, 4},
+ {CPM_CLK_SCC1, CPM_CLK2, 5},
+ {CPM_CLK_SCC1, CPM_CLK3, 6},
+ {CPM_CLK_SCC1, CPM_CLK4, 7},
+
+ {CPM_CLK_SCC2, CPM_BRG1, 0},
+ {CPM_CLK_SCC2, CPM_BRG2, 1},
+ {CPM_CLK_SCC2, CPM_BRG3, 2},
+ {CPM_CLK_SCC2, CPM_BRG4, 3},
+ {CPM_CLK_SCC2, CPM_CLK1, 4},
+ {CPM_CLK_SCC2, CPM_CLK2, 5},
+ {CPM_CLK_SCC2, CPM_CLK3, 6},
+ {CPM_CLK_SCC2, CPM_CLK4, 7},
+
+ {CPM_CLK_SCC3, CPM_BRG1, 0},
+ {CPM_CLK_SCC3, CPM_BRG2, 1},
+ {CPM_CLK_SCC3, CPM_BRG3, 2},
+ {CPM_CLK_SCC3, CPM_BRG4, 3},
+ {CPM_CLK_SCC3, CPM_CLK5, 4},
+ {CPM_CLK_SCC3, CPM_CLK6, 5},
+ {CPM_CLK_SCC3, CPM_CLK7, 6},
+ {CPM_CLK_SCC3, CPM_CLK8, 7},
+
+ {CPM_CLK_SCC4, CPM_BRG1, 0},
+ {CPM_CLK_SCC4, CPM_BRG2, 1},
+ {CPM_CLK_SCC4, CPM_BRG3, 2},
+ {CPM_CLK_SCC4, CPM_BRG4, 3},
+ {CPM_CLK_SCC4, CPM_CLK5, 4},
+ {CPM_CLK_SCC4, CPM_CLK6, 5},
+ {CPM_CLK_SCC4, CPM_CLK7, 6},
+ {CPM_CLK_SCC4, CPM_CLK8, 7},
+
+ {CPM_CLK_SMC1, CPM_BRG1, 0},
+ {CPM_CLK_SMC1, CPM_BRG2, 1},
+ {CPM_CLK_SMC1, CPM_BRG3, 2},
+ {CPM_CLK_SMC1, CPM_BRG4, 3},
+ {CPM_CLK_SMC1, CPM_CLK1, 4},
+ {CPM_CLK_SMC1, CPM_CLK2, 5},
+ {CPM_CLK_SMC1, CPM_CLK3, 6},
+ {CPM_CLK_SMC1, CPM_CLK4, 7},
+
+ {CPM_CLK_SMC2, CPM_BRG1, 0},
+ {CPM_CLK_SMC2, CPM_BRG2, 1},
+ {CPM_CLK_SMC2, CPM_BRG3, 2},
+ {CPM_CLK_SMC2, CPM_BRG4, 3},
+ {CPM_CLK_SMC2, CPM_CLK5, 4},
+ {CPM_CLK_SMC2, CPM_CLK6, 5},
+ {CPM_CLK_SMC2, CPM_CLK7, 6},
+ {CPM_CLK_SMC2, CPM_CLK8, 7},
+ };
+
+ switch (target) {
+ case CPM_CLK_SCC1:
+ reg = &mpc8xx_immr->im_cpm.cp_sicr;
+ shift = 0;
+ break;
+
+ case CPM_CLK_SCC2:
+ reg = &mpc8xx_immr->im_cpm.cp_sicr;
+ shift = 8;
+ break;
+
+ case CPM_CLK_SCC3:
+ reg = &mpc8xx_immr->im_cpm.cp_sicr;
+ shift = 16;
+ break;
+
+ case CPM_CLK_SCC4:
+ reg = &mpc8xx_immr->im_cpm.cp_sicr;
+ shift = 24;
+ break;
+
+ case CPM_CLK_SMC1:
+ reg = &mpc8xx_immr->im_cpm.cp_simode;
+ shift = 12;
+ break;
+
+ case CPM_CLK_SMC2:
+ reg = &mpc8xx_immr->im_cpm.cp_simode;
+ shift = 28;
+ break;
+
+ default:
+ printk(KERN_ERR "cpm1_clock_setup: invalid clock target\n");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(clk_map); i++) {
+ if (clk_map[i][0] == target && clk_map[i][1] == clock) {
+ bits = clk_map[i][2];
+ break;
+ }
+ }
+
+ if (i == ARRAY_SIZE(clk_map)) {
+ printk(KERN_ERR "cpm1_clock_setup: invalid clock combination\n");
+ return -EINVAL;
+ }
+
+ bits <<= shift;
+ mask <<= shift;
+
+ if (reg == &mpc8xx_immr->im_cpm.cp_sicr) {
+ if (mode == CPM_CLK_RTX) {
+ bits |= bits << 3;
+ mask |= mask << 3;
+ } else if (mode == CPM_CLK_RX) {
+ bits <<= 3;
+ mask <<= 3;
+ }
+ }
+
+ out_be32(reg, (in_be32(reg) & ~mask) | bits);
+
+ return 0;
+}
+
+/*
+ * GPIO LIB API implementation
+ */
+#ifdef CONFIG_8xx_GPIO
+
+struct cpm1_gpio16_chip {
+ struct of_mm_gpio_chip mm_gc;
+ spinlock_t lock;
+
+ /* shadowed data register to clear/set bits safely */
+ u16 cpdata;
+
+ /* IRQ associated with Pins when relevant */
+ int irq[16];
+};
+
+static void cpm1_gpio16_save_regs(struct of_mm_gpio_chip *mm_gc)
+{
+ struct cpm1_gpio16_chip *cpm1_gc =
+ container_of(mm_gc, struct cpm1_gpio16_chip, mm_gc);
+ struct cpm_ioport16 __iomem *iop = mm_gc->regs;
+
+ cpm1_gc->cpdata = in_be16(&iop->dat);
+}
+
+static int cpm1_gpio16_get(struct gpio_chip *gc, unsigned int gpio)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm_ioport16 __iomem *iop = mm_gc->regs;
+ u16 pin_mask;
+
+ pin_mask = 1 << (15 - gpio);
+
+ return !!(in_be16(&iop->dat) & pin_mask);
+}
+
+static void __cpm1_gpio16_set(struct of_mm_gpio_chip *mm_gc, u16 pin_mask,
+ int value)
+{
+ struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ struct cpm_ioport16 __iomem *iop = mm_gc->regs;
+
+ if (value)
+ cpm1_gc->cpdata |= pin_mask;
+ else
+ cpm1_gc->cpdata &= ~pin_mask;
+
+ out_be16(&iop->dat, cpm1_gc->cpdata);
+}
+
+static void cpm1_gpio16_set(struct gpio_chip *gc, unsigned int gpio, int value)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ unsigned long flags;
+ u16 pin_mask = 1 << (15 - gpio);
+
+ spin_lock_irqsave(&cpm1_gc->lock, flags);
+
+ __cpm1_gpio16_set(mm_gc, pin_mask, value);
+
+ spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+}
+
+static int cpm1_gpio16_to_irq(struct gpio_chip *gc, unsigned int gpio)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+
+ return cpm1_gc->irq[gpio] ? : -ENXIO;
+}
+
+static int cpm1_gpio16_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ struct cpm_ioport16 __iomem *iop = mm_gc->regs;
+ unsigned long flags;
+ u16 pin_mask = 1 << (15 - gpio);
+
+ spin_lock_irqsave(&cpm1_gc->lock, flags);
+
+ setbits16(&iop->dir, pin_mask);
+ __cpm1_gpio16_set(mm_gc, pin_mask, val);
+
+ spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+
+ return 0;
+}
+
+static int cpm1_gpio16_dir_in(struct gpio_chip *gc, unsigned int gpio)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ struct cpm_ioport16 __iomem *iop = mm_gc->regs;
+ unsigned long flags;
+ u16 pin_mask = 1 << (15 - gpio);
+
+ spin_lock_irqsave(&cpm1_gc->lock, flags);
+
+ clrbits16(&iop->dir, pin_mask);
+
+ spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+
+ return 0;
+}
+
+int cpm1_gpiochip_add16(struct device *dev)
+{
+ struct device_node *np = dev->of_node;
+ struct cpm1_gpio16_chip *cpm1_gc;
+ struct of_mm_gpio_chip *mm_gc;
+ struct gpio_chip *gc;
+ u16 mask;
+
+ cpm1_gc = kzalloc(sizeof(*cpm1_gc), GFP_KERNEL);
+ if (!cpm1_gc)
+ return -ENOMEM;
+
+ spin_lock_init(&cpm1_gc->lock);
+
+ if (!of_property_read_u16(np, "fsl,cpm1-gpio-irq-mask", &mask)) {
+ int i, j;
+
+ for (i = 0, j = 0; i < 16; i++)
+ if (mask & (1 << (15 - i)))
+ cpm1_gc->irq[i] = irq_of_parse_and_map(np, j++);
+ }
+
+ mm_gc = &cpm1_gc->mm_gc;
+ gc = &mm_gc->gc;
+
+ mm_gc->save_regs = cpm1_gpio16_save_regs;
+ gc->ngpio = 16;
+ gc->direction_input = cpm1_gpio16_dir_in;
+ gc->direction_output = cpm1_gpio16_dir_out;
+ gc->get = cpm1_gpio16_get;
+ gc->set = cpm1_gpio16_set;
+ gc->to_irq = cpm1_gpio16_to_irq;
+ gc->parent = dev;
+ gc->owner = THIS_MODULE;
+
+ return of_mm_gpiochip_add_data(np, mm_gc, cpm1_gc);
+}
+
+struct cpm1_gpio32_chip {
+ struct of_mm_gpio_chip mm_gc;
+ spinlock_t lock;
+
+ /* shadowed data register to clear/set bits safely */
+ u32 cpdata;
+};
+
+static void cpm1_gpio32_save_regs(struct of_mm_gpio_chip *mm_gc)
+{
+ struct cpm1_gpio32_chip *cpm1_gc =
+ container_of(mm_gc, struct cpm1_gpio32_chip, mm_gc);
+ struct cpm_ioport32b __iomem *iop = mm_gc->regs;
+
+ cpm1_gc->cpdata = in_be32(&iop->dat);
+}
+
+static int cpm1_gpio32_get(struct gpio_chip *gc, unsigned int gpio)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm_ioport32b __iomem *iop = mm_gc->regs;
+ u32 pin_mask;
+
+ pin_mask = 1 << (31 - gpio);
+
+ return !!(in_be32(&iop->dat) & pin_mask);
+}
+
+static void __cpm1_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask,
+ int value)
+{
+ struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ struct cpm_ioport32b __iomem *iop = mm_gc->regs;
+
+ if (value)
+ cpm1_gc->cpdata |= pin_mask;
+ else
+ cpm1_gc->cpdata &= ~pin_mask;
+
+ out_be32(&iop->dat, cpm1_gc->cpdata);
+}
+
+static void cpm1_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ unsigned long flags;
+ u32 pin_mask = 1 << (31 - gpio);
+
+ spin_lock_irqsave(&cpm1_gc->lock, flags);
+
+ __cpm1_gpio32_set(mm_gc, pin_mask, value);
+
+ spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+}
+
+static int cpm1_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ struct cpm_ioport32b __iomem *iop = mm_gc->regs;
+ unsigned long flags;
+ u32 pin_mask = 1 << (31 - gpio);
+
+ spin_lock_irqsave(&cpm1_gc->lock, flags);
+
+ setbits32(&iop->dir, pin_mask);
+ __cpm1_gpio32_set(mm_gc, pin_mask, val);
+
+ spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+
+ return 0;
+}
+
+static int cpm1_gpio32_dir_in(struct gpio_chip *gc, unsigned int gpio)
+{
+ struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc);
+ struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc);
+ struct cpm_ioport32b __iomem *iop = mm_gc->regs;
+ unsigned long flags;
+ u32 pin_mask = 1 << (31 - gpio);
+
+ spin_lock_irqsave(&cpm1_gc->lock, flags);
+
+ clrbits32(&iop->dir, pin_mask);
+
+ spin_unlock_irqrestore(&cpm1_gc->lock, flags);
+
+ return 0;
+}
+
+int cpm1_gpiochip_add32(struct device *dev)
+{
+ struct device_node *np = dev->of_node;
+ struct cpm1_gpio32_chip *cpm1_gc;
+ struct of_mm_gpio_chip *mm_gc;
+ struct gpio_chip *gc;
+
+ cpm1_gc = kzalloc(sizeof(*cpm1_gc), GFP_KERNEL);
+ if (!cpm1_gc)
+ return -ENOMEM;
+
+ spin_lock_init(&cpm1_gc->lock);
+
+ mm_gc = &cpm1_gc->mm_gc;
+ gc = &mm_gc->gc;
+
+ mm_gc->save_regs = cpm1_gpio32_save_regs;
+ gc->ngpio = 32;
+ gc->direction_input = cpm1_gpio32_dir_in;
+ gc->direction_output = cpm1_gpio32_dir_out;
+ gc->get = cpm1_gpio32_get;
+ gc->set = cpm1_gpio32_set;
+ gc->parent = dev;
+ gc->owner = THIS_MODULE;
+
+ return of_mm_gpiochip_add_data(np, mm_gc, cpm1_gc);
+}
+
+#endif /* CONFIG_8xx_GPIO */
diff --git a/arch/powerpc/platforms/8xx/micropatch.c b/arch/powerpc/platforms/8xx/micropatch.c
new file mode 100644
index 000000000000..c80bd7afd6c5
--- /dev/null
+++ b/arch/powerpc/platforms/8xx/micropatch.c
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Microcode patches for the CPM as supplied by Motorola.
+ * This is the one for IIC/SPI. There is a newer one that
+ * also relocates SMC2, but this would require additional changes
+ * to uart.c, so I am holding off on that for a moment.
+ */
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/8xx_immap.h>
+#include <asm/cpm.h>
+#include <asm/cpm1.h>
+
+struct patch_params {
+ ushort rccr;
+ ushort cpmcr1;
+ ushort cpmcr2;
+ ushort cpmcr3;
+ ushort cpmcr4;
+};
+
+/*
+ * I2C/SPI relocation patch arrays.
+ */
+
+#ifdef CONFIG_I2C_SPI_UCODE_PATCH
+
+static char patch_name[] __initdata = "I2C/SPI";
+
+static struct patch_params patch_params __initdata = {
+ 1, 0x802a, 0x8028, 0x802e, 0x802c,
+};
+
+static uint patch_2000[] __initdata = {
+ 0x7FFFEFD9, 0x3FFD0000, 0x7FFB49F7, 0x7FF90000,
+ 0x5FEFADF7, 0x5F89ADF7, 0x5FEFAFF7, 0x5F89AFF7,
+ 0x3A9CFBC8, 0xE7C0EDF0, 0x77C1E1BB, 0xF4DC7F1D,
+ 0xABAD932F, 0x4E08FDCF, 0x6E0FAFF8, 0x7CCF76CF,
+ 0xFD1FF9CF, 0xABF88DC6, 0xAB5679F7, 0xB0937383,
+ 0xDFCE79F7, 0xB091E6BB, 0xE5BBE74F, 0xB3FA6F0F,
+ 0x6FFB76CE, 0xEE0DF9CF, 0x2BFBEFEF, 0xCFEEF9CF,
+ 0x76CEAD24, 0x90B2DF9A, 0x7FDDD0BF, 0x4BF847FD,
+ 0x7CCF76CE, 0xCFEF7E1F, 0x7F1D7DFD, 0xF0B6EF71,
+ 0x7FC177C1, 0xFBC86079, 0xE722FBC8, 0x5FFFDFFF,
+ 0x5FB2FFFB, 0xFBC8F3C8, 0x94A67F01, 0x7F1D5F39,
+ 0xAFE85F5E, 0xFFDFDF96, 0xCB9FAF7D, 0x5FC1AFED,
+ 0x8C1C5FC1, 0xAFDD5FC3, 0xDF9A7EFD, 0xB0B25FB2,
+ 0xFFFEABAD, 0x5FB2FFFE, 0x5FCE600B, 0xE6BB600B,
+ 0x5FCEDFC6, 0x27FBEFDF, 0x5FC8CFDE, 0x3A9CE7C0,
+ 0xEDF0F3C8, 0x7F0154CD, 0x7F1D2D3D, 0x363A7570,
+ 0x7E0AF1CE, 0x37EF2E68, 0x7FEE10EC, 0xADF8EFDE,
+ 0xCFEAE52F, 0x7D0FE12B, 0xF1CE5F65, 0x7E0A4DF8,
+ 0xCFEA5F72, 0x7D0BEFEE, 0xCFEA5F74, 0xE522EFDE,
+ 0x5F74CFDA, 0x0B627385, 0xDF627E0A, 0x30D8145B,
+ 0xBFFFF3C8, 0x5FFFDFFF, 0xA7F85F5E, 0xBFFE7F7D,
+ 0x10D31450, 0x5F36BFFF, 0xAF785F5E, 0xBFFDA7F8,
+ 0x5F36BFFE, 0x77FD30C0, 0x4E08FDCF, 0xE5FF6E0F,
+ 0xAFF87E1F, 0x7E0FFD1F, 0xF1CF5F1B, 0xABF80D5E,
+ 0x5F5EFFEF, 0x79F730A2, 0xAFDD5F34, 0x47F85F34,
+ 0xAFED7FDD, 0x50B24978, 0x47FD7F1D, 0x7DFD70AD,
+ 0xEF717EC1, 0x6BA47F01, 0x2D267EFD, 0x30DE5F5E,
+ 0xFFFD5F5E, 0xFFEF5F5E, 0xFFDF0CA0, 0xAFED0A9E,
+ 0xAFDD0C3A, 0x5F3AAFBD, 0x7FBDB082, 0x5F8247F8
+};
+
+static uint patch_2f00[] __initdata = {
+ 0x3E303430, 0x34343737, 0xABF7BF9B, 0x994B4FBD,
+ 0xBD599493, 0x349FFF37, 0xFB9B177D, 0xD9936956,
+ 0xBBFDD697, 0xBDD2FD11, 0x31DB9BB3, 0x63139637,
+ 0x93733693, 0x193137F7, 0x331737AF, 0x7BB9B999,
+ 0xBB197957, 0x7FDFD3D5, 0x73B773F7, 0x37933B99,
+ 0x1D115316, 0x99315315, 0x31694BF4, 0xFBDBD359,
+ 0x31497353, 0x76956D69, 0x7B9D9693, 0x13131979,
+ 0x79376935
+};
+
+static uint patch_2e00[] __initdata = {};
+#endif
+
+/*
+ * I2C/SPI/SMC1 relocation patch arrays.
+ */
+
+#ifdef CONFIG_I2C_SPI_SMC1_UCODE_PATCH
+
+static char patch_name[] __initdata = "I2C/SPI/SMC1";
+
+static struct patch_params patch_params __initdata = {
+ 3, 0x8080, 0x808a, 0x8028, 0x802a,
+};
+
+static uint patch_2000[] __initdata = {
+ 0x3fff0000, 0x3ffd0000, 0x3ffb0000, 0x3ff90000,
+ 0x5f13eff8, 0x5eb5eff8, 0x5f88adf7, 0x5fefadf7,
+ 0x3a9cfbc8, 0x77cae1bb, 0xf4de7fad, 0xabae9330,
+ 0x4e08fdcf, 0x6e0faff8, 0x7ccf76cf, 0xfdaff9cf,
+ 0xabf88dc8, 0xab5879f7, 0xb0925d8d, 0xdfd079f7,
+ 0xb090e6bb, 0xe5bbe74f, 0x9e046f0f, 0x6ffb76ce,
+ 0xee0cf9cf, 0x2bfbefef, 0xcfeef9cf, 0x76cead23,
+ 0x90b3df99, 0x7fddd0c1, 0x4bf847fd, 0x7ccf76ce,
+ 0xcfef77ca, 0x7eaf7fad, 0x7dfdf0b7, 0xef7a7fca,
+ 0x77cafbc8, 0x6079e722, 0xfbc85fff, 0xdfff5fb3,
+ 0xfffbfbc8, 0xf3c894a5, 0xe7c9edf9, 0x7f9a7fad,
+ 0x5f36afe8, 0x5f5bffdf, 0xdf95cb9e, 0xaf7d5fc3,
+ 0xafed8c1b, 0x5fc3afdd, 0x5fc5df99, 0x7efdb0b3,
+ 0x5fb3fffe, 0xabae5fb3, 0xfffe5fd0, 0x600be6bb,
+ 0x600b5fd0, 0xdfc827fb, 0xefdf5fca, 0xcfde3a9c,
+ 0xe7c9edf9, 0xf3c87f9e, 0x54ca7fed, 0x2d3a3637,
+ 0x756f7e9a, 0xf1ce37ef, 0x2e677fee, 0x10ebadf8,
+ 0xefdecfea, 0xe52f7d9f, 0xe12bf1ce, 0x5f647e9a,
+ 0x4df8cfea, 0x5f717d9b, 0xefeecfea, 0x5f73e522,
+ 0xefde5f73, 0xcfda0b61, 0x5d8fdf61, 0xe7c9edf9,
+ 0x7e9a30d5, 0x1458bfff, 0xf3c85fff, 0xdfffa7f8,
+ 0x5f5bbffe, 0x7f7d10d0, 0x144d5f33, 0xbfffaf78,
+ 0x5f5bbffd, 0xa7f85f33, 0xbffe77fd, 0x30bd4e08,
+ 0xfdcfe5ff, 0x6e0faff8, 0x7eef7e9f, 0xfdeff1cf,
+ 0x5f17abf8, 0x0d5b5f5b, 0xffef79f7, 0x309eafdd,
+ 0x5f3147f8, 0x5f31afed, 0x7fdd50af, 0x497847fd,
+ 0x7f9e7fed, 0x7dfd70a9, 0xef7e7ece, 0x6ba07f9e,
+ 0x2d227efd, 0x30db5f5b, 0xfffd5f5b, 0xffef5f5b,
+ 0xffdf0c9c, 0xafed0a9a, 0xafdd0c37, 0x5f37afbd,
+ 0x7fbdb081, 0x5f8147f8, 0x3a11e710, 0xedf0ccdd,
+ 0xf3186d0a, 0x7f0e5f06, 0x7fedbb38, 0x3afe7468,
+ 0x7fedf4fc, 0x8ffbb951, 0xb85f77fd, 0xb0df5ddd,
+ 0xdefe7fed, 0x90e1e74d, 0x6f0dcbf7, 0xe7decfed,
+ 0xcb74cfed, 0xcfeddf6d, 0x91714f74, 0x5dd2deef,
+ 0x9e04e7df, 0xefbb6ffb, 0xe7ef7f0e, 0x9e097fed,
+ 0xebdbeffa, 0xeb54affb, 0x7fea90d7, 0x7e0cf0c3,
+ 0xbffff318, 0x5fffdfff, 0xac59efea, 0x7fce1ee5,
+ 0xe2ff5ee1, 0xaffbe2ff, 0x5ee3affb, 0xf9cc7d0f,
+ 0xaef8770f, 0x7d0fb0c6, 0xeffbbfff, 0xcfef5ede,
+ 0x7d0fbfff, 0x5ede4cf8, 0x7fddd0bf, 0x49f847fd,
+ 0x7efdf0bb, 0x7fedfffd, 0x7dfdf0b7, 0xef7e7e1e,
+ 0x5ede7f0e, 0x3a11e710, 0xedf0ccab, 0xfb18ad2e,
+ 0x1ea9bbb8, 0x74283b7e, 0x73c2e4bb, 0x2ada4fb8,
+ 0xdc21e4bb, 0xb2a1ffbf, 0x5e2c43f8, 0xfc87e1bb,
+ 0xe74ffd91, 0x6f0f4fe8, 0xc7ba32e2, 0xf396efeb,
+ 0x600b4f78, 0xe5bb760b, 0x53acaef8, 0x4ef88b0e,
+ 0xcfef9e09, 0xabf8751f, 0xefef5bac, 0x741f4fe8,
+ 0x751e760d, 0x7fdbf081, 0x741cafce, 0xefcc7fce,
+ 0x751e70ac, 0x741ce7bb, 0x3372cfed, 0xafdbefeb,
+ 0xe5bb760b, 0x53f2aef8, 0xafe8e7eb, 0x4bf8771e,
+ 0x7e247fed, 0x4fcbe2cc, 0x7fbc30a9, 0x7b0f7a0f,
+ 0x34d577fd, 0x308b5db7, 0xde553e5f, 0xaf78741f,
+ 0x741f30f0, 0xcfef5e2c, 0x741f3eac, 0xafb8771e,
+ 0x5e677fed, 0x0bd3e2cc, 0x741ccfec, 0xe5ca53cd,
+ 0x6fcb4f74, 0x5dadde4b, 0x2ab63d38, 0x4bb3de30,
+ 0x751f741c, 0x6c42effa, 0xefea7fce, 0x6ffc30be,
+ 0xefec3fca, 0x30b3de2e, 0xadf85d9e, 0xaf7daefd,
+ 0x5d9ede2e, 0x5d9eafdd, 0x761f10ac, 0x1da07efd,
+ 0x30adfffe, 0x4908fb18, 0x5fffdfff, 0xafbb709b,
+ 0x4ef85e67, 0xadf814ad, 0x7a0f70ad, 0xcfef50ad,
+ 0x7a0fde30, 0x5da0afed, 0x3c12780f, 0xefef780f,
+ 0xefef790f, 0xa7f85e0f, 0xffef790f, 0xefef790f,
+ 0x14adde2e, 0x5d9eadfd, 0x5e2dfffb, 0xe79addfd,
+ 0xeff96079, 0x607ae79a, 0xddfceff9, 0x60795dff,
+ 0x607acfef, 0xefefefdf, 0xefbfef7f, 0xeeffedff,
+ 0xebffe7ff, 0xafefafdf, 0xafbfaf7f, 0xaeffadff,
+ 0xabffa7ff, 0x6fef6fdf, 0x6fbf6f7f, 0x6eff6dff,
+ 0x6bff67ff, 0x2fef2fdf, 0x2fbf2f7f, 0x2eff2dff,
+ 0x2bff27ff, 0x4e08fd1f, 0xe5ff6e0f, 0xaff87eef,
+ 0x7e0ffdef, 0xf11f6079, 0xabf8f542, 0x7e0af11c,
+ 0x37cfae3a, 0x7fec90be, 0xadf8efdc, 0xcfeae52f,
+ 0x7d0fe12b, 0xf11c6079, 0x7e0a4df8, 0xcfea5dc4,
+ 0x7d0befec, 0xcfea5dc6, 0xe522efdc, 0x5dc6cfda,
+ 0x4e08fd1f, 0x6e0faff8, 0x7c1f761f, 0xfdeff91f,
+ 0x6079abf8, 0x761cee24, 0xf91f2bfb, 0xefefcfec,
+ 0xf91f6079, 0x761c27fb, 0xefdf5da7, 0xcfdc7fdd,
+ 0xd09c4bf8, 0x47fd7c1f, 0x761ccfcf, 0x7eef7fed,
+ 0x7dfdf093, 0xef7e7f1e, 0x771efb18, 0x6079e722,
+ 0xe6bbe5bb, 0xae0ae5bb, 0x600bae85, 0xe2bbe2bb,
+ 0xe2bbe2bb, 0xaf02e2bb, 0xe2bb2ff9, 0x6079e2bb
+};
+
+static uint patch_2f00[] __initdata = {
+ 0x30303030, 0x3e3e3434, 0xabbf9b99, 0x4b4fbdbd,
+ 0x59949334, 0x9fff37fb, 0x9b177dd9, 0x936956bb,
+ 0xfbdd697b, 0xdd2fd113, 0x1db9f7bb, 0x36313963,
+ 0x79373369, 0x3193137f, 0x7331737a, 0xf7bb9b99,
+ 0x9bb19795, 0x77fdfd3d, 0x573b773f, 0x737933f7,
+ 0xb991d115, 0x31699315, 0x31531694, 0xbf4fbdbd,
+ 0x35931497, 0x35376956, 0xbd697b9d, 0x96931313,
+ 0x19797937, 0x6935af78, 0xb9b3baa3, 0xb8788683,
+ 0x368f78f7, 0x87778733, 0x3ffffb3b, 0x8e8f78b8,
+ 0x1d118e13, 0xf3ff3f8b, 0x6bd8e173, 0xd1366856,
+ 0x68d1687b, 0x3daf78b8, 0x3a3a3f87, 0x8f81378f,
+ 0xf876f887, 0x77fd8778, 0x737de8d6, 0xbbf8bfff,
+ 0xd8df87f7, 0xfd876f7b, 0x8bfff8bd, 0x8683387d,
+ 0xb873d87b, 0x3b8fd7f8, 0xf7338883, 0xbb8ee1f8,
+ 0xef837377, 0x3337b836, 0x817d11f8, 0x7378b878,
+ 0xd3368b7d, 0xed731b7d, 0x833731f3, 0xf22f3f23
+};
+
+static uint patch_2e00[] __initdata = {
+ 0x27eeeeee, 0xeeeeeeee, 0xeeeeeeee, 0xeeeeeeee,
+ 0xee4bf4fb, 0xdbd259bb, 0x1979577f, 0xdfd2d573,
+ 0xb773f737, 0x4b4fbdbd, 0x25b9b177, 0xd2d17376,
+ 0x956bbfdd, 0x697bdd2f, 0xff9f79ff, 0xff9ff22f
+};
+#endif
+
+/*
+ * USB SOF patch arrays.
+ */
+
+#ifdef CONFIG_USB_SOF_UCODE_PATCH
+
+static char patch_name[] __initdata = "USB SOF";
+
+static struct patch_params patch_params __initdata = {
+ 9,
+};
+
+static uint patch_2000[] __initdata = {
+ 0x7fff0000, 0x7ffd0000, 0x7ffb0000, 0x49f7ba5b,
+ 0xba383ffb, 0xf9b8b46d, 0xe5ab4e07, 0xaf77bffe,
+ 0x3f7bbf79, 0xba5bba38, 0xe7676076, 0x60750000
+};
+
+static uint patch_2f00[] __initdata = {
+ 0x3030304c, 0xcab9e441, 0xa1aaf220
+};
+
+static uint patch_2e00[] __initdata = {};
+#endif
+
+/*
+ * SMC relocation patch arrays.
+ */
+
+#ifdef CONFIG_SMC_UCODE_PATCH
+
+static char patch_name[] __initdata = "SMC";
+
+static struct patch_params patch_params __initdata = {
+ 2, 0x8080, 0x8088,
+};
+
+static uint patch_2000[] __initdata = {
+ 0x3fff0000, 0x3ffd0000, 0x3ffb0000, 0x3ff90000,
+ 0x5fefeff8, 0x5f91eff8, 0x3ff30000, 0x3ff10000,
+ 0x3a11e710, 0xedf0ccb9, 0xf318ed66, 0x7f0e5fe2,
+ 0x7fedbb38, 0x3afe7468, 0x7fedf4d8, 0x8ffbb92d,
+ 0xb83b77fd, 0xb0bb5eb9, 0xdfda7fed, 0x90bde74d,
+ 0x6f0dcbd3, 0xe7decfed, 0xcb50cfed, 0xcfeddf6d,
+ 0x914d4f74, 0x5eaedfcb, 0x9ee0e7df, 0xefbb6ffb,
+ 0xe7ef7f0e, 0x9ee57fed, 0xebb7effa, 0xeb30affb,
+ 0x7fea90b3, 0x7e0cf09f, 0xbffff318, 0x5fffdfff,
+ 0xac35efea, 0x7fce1fc1, 0xe2ff5fbd, 0xaffbe2ff,
+ 0x5fbfaffb, 0xf9a87d0f, 0xaef8770f, 0x7d0fb0a2,
+ 0xeffbbfff, 0xcfef5fba, 0x7d0fbfff, 0x5fba4cf8,
+ 0x7fddd09b, 0x49f847fd, 0x7efdf097, 0x7fedfffd,
+ 0x7dfdf093, 0xef7e7e1e, 0x5fba7f0e, 0x3a11e710,
+ 0xedf0cc87, 0xfb18ad0a, 0x1f85bbb8, 0x74283b7e,
+ 0x7375e4bb, 0x2ab64fb8, 0x5c7de4bb, 0x32fdffbf,
+ 0x5f0843f8, 0x7ce3e1bb, 0xe74f7ded, 0x6f0f4fe8,
+ 0xc7ba32be, 0x73f2efeb, 0x600b4f78, 0xe5bb760b,
+ 0x5388aef8, 0x4ef80b6a, 0xcfef9ee5, 0xabf8751f,
+ 0xefef5b88, 0x741f4fe8, 0x751e760d, 0x7fdb70dd,
+ 0x741cafce, 0xefcc7fce, 0x751e7088, 0x741ce7bb,
+ 0x334ecfed, 0xafdbefeb, 0xe5bb760b, 0x53ceaef8,
+ 0xafe8e7eb, 0x4bf8771e, 0x7e007fed, 0x4fcbe2cc,
+ 0x7fbc3085, 0x7b0f7a0f, 0x34b177fd, 0xb0e75e93,
+ 0xdf313e3b, 0xaf78741f, 0x741f30cc, 0xcfef5f08,
+ 0x741f3e88, 0xafb8771e, 0x5f437fed, 0x0bafe2cc,
+ 0x741ccfec, 0xe5ca53a9, 0x6fcb4f74, 0x5e89df27,
+ 0x2a923d14, 0x4b8fdf0c, 0x751f741c, 0x6c1eeffa,
+ 0xefea7fce, 0x6ffc309a, 0xefec3fca, 0x308fdf0a,
+ 0xadf85e7a, 0xaf7daefd, 0x5e7adf0a, 0x5e7aafdd,
+ 0x761f1088, 0x1e7c7efd, 0x3089fffe, 0x4908fb18,
+ 0x5fffdfff, 0xafbbf0f7, 0x4ef85f43, 0xadf81489,
+ 0x7a0f7089, 0xcfef5089, 0x7a0fdf0c, 0x5e7cafed,
+ 0xbc6e780f, 0xefef780f, 0xefef790f, 0xa7f85eeb,
+ 0xffef790f, 0xefef790f, 0x1489df0a, 0x5e7aadfd,
+ 0x5f09fffb, 0xe79aded9, 0xeff96079, 0x607ae79a,
+ 0xded8eff9, 0x60795edb, 0x607acfef, 0xefefefdf,
+ 0xefbfef7f, 0xeeffedff, 0xebffe7ff, 0xafefafdf,
+ 0xafbfaf7f, 0xaeffadff, 0xabffa7ff, 0x6fef6fdf,
+ 0x6fbf6f7f, 0x6eff6dff, 0x6bff67ff, 0x2fef2fdf,
+ 0x2fbf2f7f, 0x2eff2dff, 0x2bff27ff, 0x4e08fd1f,
+ 0xe5ff6e0f, 0xaff87eef, 0x7e0ffdef, 0xf11f6079,
+ 0xabf8f51e, 0x7e0af11c, 0x37cfae16, 0x7fec909a,
+ 0xadf8efdc, 0xcfeae52f, 0x7d0fe12b, 0xf11c6079,
+ 0x7e0a4df8, 0xcfea5ea0, 0x7d0befec, 0xcfea5ea2,
+ 0xe522efdc, 0x5ea2cfda, 0x4e08fd1f, 0x6e0faff8,
+ 0x7c1f761f, 0xfdeff91f, 0x6079abf8, 0x761cee00,
+ 0xf91f2bfb, 0xefefcfec, 0xf91f6079, 0x761c27fb,
+ 0xefdf5e83, 0xcfdc7fdd, 0x50f84bf8, 0x47fd7c1f,
+ 0x761ccfcf, 0x7eef7fed, 0x7dfd70ef, 0xef7e7f1e,
+ 0x771efb18, 0x6079e722, 0xe6bbe5bb, 0x2e66e5bb,
+ 0x600b2ee1, 0xe2bbe2bb, 0xe2bbe2bb, 0x2f5ee2bb,
+ 0xe2bb2ff9, 0x6079e2bb,
+};
+
+static uint patch_2f00[] __initdata = {
+ 0x30303030, 0x3e3e3030, 0xaf79b9b3, 0xbaa3b979,
+ 0x9693369f, 0x79f79777, 0x97333fff, 0xfb3b9e9f,
+ 0x79b91d11, 0x9e13f3ff, 0x3f9b6bd9, 0xe173d136,
+ 0x695669d1, 0x697b3daf, 0x79b93a3a, 0x3f979f91,
+ 0x379ff976, 0xf99777fd, 0x9779737d, 0xe9d6bbf9,
+ 0xbfffd9df, 0x97f7fd97, 0x6f7b9bff, 0xf9bd9683,
+ 0x397db973, 0xd97b3b9f, 0xd7f9f733, 0x9993bb9e,
+ 0xe1f9ef93, 0x73773337, 0xb936917d, 0x11f87379,
+ 0xb979d336, 0x8b7ded73, 0x1b7d9337, 0x31f3f22f,
+ 0x3f2327ee, 0xeeeeeeee, 0xeeeeeeee, 0xeeeeeeee,
+ 0xeeeeee4b, 0xf4fbdbd2, 0x58bb1878, 0x577fdfd2,
+ 0xd573b773, 0xf7374b4f, 0xbdbd25b8, 0xb177d2d1,
+ 0x7376856b, 0xbfdd687b, 0xdd2fff8f, 0x78ffff8f,
+ 0xf22f0000,
+};
+
+static uint patch_2e00[] __initdata = {};
+#endif
+
+static void __init cpm_write_patch(cpm8xx_t *cp, int offset, uint *patch, int len)
+{
+ if (!len)
+ return;
+ memcpy_toio(cp->cp_dpmem + offset, patch, len);
+}
+
+void __init cpm_load_patch(cpm8xx_t *cp)
+{
+ out_be16(&cp->cp_rccr, 0);
+
+ cpm_write_patch(cp, 0, patch_2000, sizeof(patch_2000));
+ cpm_write_patch(cp, 0xf00, patch_2f00, sizeof(patch_2f00));
+ cpm_write_patch(cp, 0xe00, patch_2e00, sizeof(patch_2e00));
+
+ if (IS_ENABLED(CONFIG_I2C_SPI_UCODE_PATCH) ||
+ IS_ENABLED(CONFIG_I2C_SPI_SMC1_UCODE_PATCH)) {
+ u16 rpbase = 0x500;
+ iic_t *iip;
+ struct spi_pram *spp;
+
+ iip = (iic_t *)&cp->cp_dparam[PROFF_IIC];
+ out_be16(&iip->iic_rpbase, rpbase);
+
+ /* Put SPI above the IIC, also 32-byte aligned. */
+ spp = (struct spi_pram *)&cp->cp_dparam[PROFF_SPI];
+ out_be16(&spp->rpbase, (rpbase + sizeof(iic_t) + 31) & ~31);
+
+ if (IS_ENABLED(CONFIG_I2C_SPI_SMC1_UCODE_PATCH)) {
+ smc_uart_t *smp;
+
+ smp = (smc_uart_t *)&cp->cp_dparam[PROFF_SMC1];
+ out_be16(&smp->smc_rpbase, 0x1FC0);
+ }
+ }
+
+ if (IS_ENABLED(CONFIG_SMC_UCODE_PATCH)) {
+ smc_uart_t *smp;
+
+ smp = (smc_uart_t *)&cp->cp_dparam[PROFF_SMC1];
+ out_be16(&smp->smc_rpbase, 0x1ec0);
+ smp = (smc_uart_t *)&cp->cp_dparam[PROFF_SMC2];
+ out_be16(&smp->smc_rpbase, 0x1fc0);
+ }
+
+ out_be16(&cp->cp_cpmcr1, patch_params.cpmcr1);
+ out_be16(&cp->cp_cpmcr2, patch_params.cpmcr2);
+ out_be16(&cp->cp_cpmcr3, patch_params.cpmcr3);
+ out_be16(&cp->cp_cpmcr4, patch_params.cpmcr4);
+
+ out_be16(&cp->cp_rccr, patch_params.rccr);
+
+ pr_info("%s microcode patch installed\n", patch_name);
+}
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index f3fb79fccc72..d82e3664ffdf 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -197,7 +197,8 @@ endmenu
config PPC601_SYNC_FIX
bool "Workarounds for PPC601 bugs"
- depends on PPC_BOOK3S_32 && PPC_PMAC
+ depends on PPC_BOOK3S_601 && PPC_PMAC
+ default y
help
Some versions of the PPC601 (the first PowerPC chip) have bugs which
mean that extra synchronization instructions are required near
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 2794235e9d3e..12543e53fa96 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -6,6 +6,9 @@ config PPC64
This option selects whether a 32-bit or a 64-bit kernel
will be built.
+config PPC_BOOK3S_32
+ bool
+
menu "Processor support"
choice
prompt "Processor Type"
@@ -21,13 +24,20 @@ choice
If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx.
-config PPC_BOOK3S_32
- bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx"
+config PPC_BOOK3S_6xx
+ bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601"
+ select PPC_BOOK3S_32
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
+config PPC_BOOK3S_601
+ bool "PowerPC 601"
+ select PPC_BOOK3S_32
+ select PPC_FPU
+ select PPC_HAVE_KUAP
+
config PPC_85xx
bool "Freescale 85xx"
select E500
@@ -330,7 +340,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
config PPC_RADIX_MMU
bool "Radix MMU Support"
- depends on PPC_BOOK3S_64 && HUGETLB_PAGE
+ depends on PPC_BOOK3S_64
select ARCH_HAS_GIGANTIC_PAGE
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
@@ -450,8 +460,10 @@ config NOT_COHERENT_CACHE
depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \
GAMECUBE_COMMON || AMIGAONE
select ARCH_HAS_DMA_COHERENT_TO_PFN
+ select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_HAS_SYNC_DMA_FOR_CPU
+ select DMA_DIRECT_REMAP
default n if PPC_47x
default y
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 16dfee29aa41..ca9ffc1c8685 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
window->table.it_size = size >> window->table.it_page_shift;
window->table.it_ops = &cell_iommu_ops;
- iommu_init_table(&window->table, iommu->nid);
+ iommu_init_table(&window->table, iommu->nid, 0, 0);
pr_debug("\tioid %d\n", window->ioid);
pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 6dfd2cb1bce7..24adbe3c605c 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -31,22 +31,21 @@ static void spufs_handle_event(struct spu_context *ctx,
switch (type) {
case SPE_EVENT_INVALID_DMA:
- force_sig_fault(SIGBUS, BUS_OBJERR, NULL, current);
+ force_sig_fault(SIGBUS, BUS_OBJERR, NULL);
break;
case SPE_EVENT_SPE_DATA_STORAGE:
ctx->ops->restart_dma(ctx);
- force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea,
- current);
+ force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea);
break;
case SPE_EVENT_DMA_ALIGNMENT:
/* DAR isn't set for an alignment fault :( */
- force_sig_fault(SIGBUS, BUS_ADRALN, NULL, current);
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
break;
case SPE_EVENT_SPE_ERROR:
force_sig_fault(
SIGILL, ILL_ILLOPC,
(void __user *)(unsigned long)
- ctx->ops->npc_read(ctx) - 4, current);
+ ctx->ops->npc_read(ctx) - 4);
break;
}
}
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index d40253a18b1c..c0f950a3f4e1 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -446,7 +446,7 @@ static const struct file_operations spufs_cntl_fops = {
.release = spufs_cntl_release,
.read = simple_attr_read,
.write = simple_attr_write,
- .llseek = generic_file_llseek,
+ .llseek = no_llseek,
.mmap = spufs_cntl_mmap,
};
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 065ff14b76e1..1d93e55a2de1 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -10,6 +10,8 @@
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/fsnotify.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
@@ -20,7 +22,6 @@
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/slab.h>
-#include <linux/parser.h>
#include <asm/prom.h>
#include <asm/spu.h>
@@ -30,7 +31,7 @@
#include "spufs.h"
struct spufs_sb_info {
- int debug;
+ bool debug;
};
static struct kmem_cache *spufs_inode_cache;
@@ -574,16 +575,27 @@ long spufs_create(struct path *path, struct dentry *dentry,
}
/* File system initialization */
+struct spufs_fs_context {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+};
+
enum {
- Opt_uid, Opt_gid, Opt_mode, Opt_debug, Opt_err,
+ Opt_uid, Opt_gid, Opt_mode, Opt_debug,
+};
+
+static const struct fs_parameter_spec spufs_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("debug", Opt_debug),
+ {}
};
-static const match_table_t spufs_tokens = {
- { Opt_uid, "uid=%d" },
- { Opt_gid, "gid=%d" },
- { Opt_mode, "mode=%o" },
- { Opt_debug, "debug" },
- { Opt_err, NULL },
+static const struct fs_parameter_description spufs_fs_parameters = {
+ .name = "spufs",
+ .specs = spufs_param_specs,
};
static int spufs_show_options(struct seq_file *m, struct dentry *root)
@@ -604,47 +616,41 @@ static int spufs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static int
-spufs_parse_options(struct super_block *sb, char *options, struct inode *root)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
-
- if (!*p)
- continue;
-
- token = match_token(p, spufs_tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return 0;
- root->i_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(root->i_uid))
- return 0;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- root->i_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(root->i_gid))
- return 0;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return 0;
- root->i_mode = option | S_IFDIR;
- break;
- case Opt_debug:
- spufs_get_sb_info(sb)->debug = 1;
- break;
- default:
- return 0;
- }
+static int spufs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct spufs_fs_context *ctx = fc->fs_private;
+ struct spufs_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
+ int opt;
+
+ opt = fs_parse(fc, &spufs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return invalf(fc, "Unknown uid");
+ ctx->uid = uid;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return invalf(fc, "Unknown gid");
+ ctx->gid = gid;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_debug:
+ sbi->debug = true;
+ break;
}
- return 1;
+
+ return 0;
}
static void spufs_exit_isolated_loader(void)
@@ -678,79 +684,98 @@ spufs_init_isolated_loader(void)
printk(KERN_INFO "spufs: SPU isolation mode enabled\n");
}
-static int
-spufs_create_root(struct super_block *sb, void *data)
+static int spufs_create_root(struct super_block *sb, struct fs_context *fc)
{
+ struct spufs_fs_context *ctx = fc->fs_private;
struct inode *inode;
- int ret;
- ret = -ENODEV;
if (!spu_management_ops)
- goto out;
+ return -ENODEV;
- ret = -ENOMEM;
- inode = spufs_new_inode(sb, S_IFDIR | 0775);
+ inode = spufs_new_inode(sb, S_IFDIR | ctx->mode);
if (!inode)
- goto out;
+ return -ENOMEM;
+ inode->i_uid = ctx->uid;
+ inode->i_gid = ctx->gid;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
SPUFS_I(inode)->i_ctx = NULL;
inc_nlink(inode);
- ret = -EINVAL;
- if (!spufs_parse_options(sb, data, inode))
- goto out_iput;
-
- ret = -ENOMEM;
sb->s_root = d_make_root(inode);
if (!sb->s_root)
- goto out;
-
+ return -ENOMEM;
return 0;
-out_iput:
- iput(inode);
-out:
- return ret;
}
-static int
-spufs_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct spufs_sb_info *info;
- static const struct super_operations s_ops = {
- .alloc_inode = spufs_alloc_inode,
- .free_inode = spufs_free_inode,
- .statfs = simple_statfs,
- .evict_inode = spufs_evict_inode,
- .show_options = spufs_show_options,
- };
-
- info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
- return -ENOMEM;
+static const struct super_operations spufs_ops = {
+ .alloc_inode = spufs_alloc_inode,
+ .free_inode = spufs_free_inode,
+ .statfs = simple_statfs,
+ .evict_inode = spufs_evict_inode,
+ .show_options = spufs_show_options,
+};
+static int spufs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = SPUFS_MAGIC;
- sb->s_op = &s_ops;
- sb->s_fs_info = info;
+ sb->s_op = &spufs_ops;
- return spufs_create_root(sb, data);
+ return spufs_create_root(sb, fc);
+}
+
+static int spufs_get_tree(struct fs_context *fc)
+{
+ return get_tree_single(fc, spufs_fill_super);
}
-static struct dentry *
-spufs_mount(struct file_system_type *fstype, int flags,
- const char *name, void *data)
+static void spufs_free_fc(struct fs_context *fc)
{
- return mount_single(fstype, flags, data, spufs_fill_super);
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations spufs_context_ops = {
+ .free = spufs_free_fc,
+ .parse_param = spufs_parse_param,
+ .get_tree = spufs_get_tree,
+};
+
+static int spufs_init_fs_context(struct fs_context *fc)
+{
+ struct spufs_fs_context *ctx;
+ struct spufs_sb_info *sbi;
+
+ ctx = kzalloc(sizeof(struct spufs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ goto nomem;
+
+ sbi = kzalloc(sizeof(struct spufs_sb_info), GFP_KERNEL);
+ if (!sbi)
+ goto nomem_ctx;
+
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->mode = 0755;
+
+ fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
+ fc->ops = &spufs_context_ops;
+ return 0;
+
+nomem_ctx:
+ kfree(ctx);
+nomem:
+ return -ENOMEM;
}
static struct file_system_type spufs_type = {
.owner = THIS_MODULE,
.name = "spufs",
- .mount = spufs_mount,
+ .init_fs_context = spufs_init_fs_context,
+ .parameters = &spufs_fs_parameters,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("spufs");
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index 07f82d7395ff..3f2380f40f99 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -443,7 +443,7 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
else if (unlikely((status & SPU_STATUS_STOPPED_BY_STOP)
&& (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff)) {
- force_sig(SIGTRAP, current);
+ force_sig(SIGTRAP);
ret = -ERESTARTSYS;
}
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e56b553de27b..f18d5067cd0f 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
* runqueue. The context will be rescheduled on the proper node
* if it is timesliced or preempted.
*/
- cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
+ cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
/* Save the current cpu id for spu interrupt routing. */
ctx->last_ran = raw_smp_processor_id();
diff --git a/arch/powerpc/platforms/maple/Kconfig b/arch/powerpc/platforms/maple/Kconfig
index 08d530a2a8b1..86ae210bee9a 100644
--- a/arch/powerpc/platforms/maple/Kconfig
+++ b/arch/powerpc/platforms/maple/Kconfig
@@ -14,5 +14,5 @@ config PPC_MAPLE
select MMIO_NVRAM
select ATA_NONSTANDARD if ATA
help
- This option enables support for the Maple 970FX Evaluation Board.
+ This option enables support for the Maple 970FX Evaluation Board.
For more information, refer to <http://www.970eval.com>
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 77fee09104f8..b500a6e47e6b 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void)
*/
iommu_table_iobmap.it_blocksize = 4;
iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
- iommu_init_table(&iommu_table_iobmap, 0);
+ iommu_init_table(&iommu_table_iobmap, 0, 0, 0);
pr_debug(" <- %s\n", __func__);
}
diff --git a/arch/powerpc/platforms/powermac/sleep.S b/arch/powerpc/platforms/powermac/sleep.S
index 6bbcbec97712..bd6085b470b7 100644
--- a/arch/powerpc/platforms/powermac/sleep.S
+++ b/arch/powerpc/platforms/powermac/sleep.S
@@ -33,10 +33,18 @@
#define SL_IBAT2 0x48
#define SL_DBAT3 0x50
#define SL_IBAT3 0x58
-#define SL_TB 0x60
-#define SL_R2 0x68
-#define SL_CR 0x6c
-#define SL_R12 0x70 /* r12 to r31 */
+#define SL_DBAT4 0x60
+#define SL_IBAT4 0x68
+#define SL_DBAT5 0x70
+#define SL_IBAT5 0x78
+#define SL_DBAT6 0x80
+#define SL_IBAT6 0x88
+#define SL_DBAT7 0x90
+#define SL_IBAT7 0x98
+#define SL_TB 0xa0
+#define SL_R2 0xa8
+#define SL_CR 0xac
+#define SL_R12 0xb0 /* r12 to r31 */
#define SL_SIZE (SL_R12 + 80)
.section .text
@@ -121,6 +129,41 @@ _GLOBAL(low_sleep_handler)
mfibatl r4,3
stw r4,SL_IBAT3+4(r1)
+BEGIN_MMU_FTR_SECTION
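+ /* Save the high BATs (DBAT4-7/IBAT4-7) on CPUs that implement them */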
+ mfspr r4,SPRN_DBAT4U
+ stw r4,SL_DBAT4(r1)
+ mfspr r4,SPRN_DBAT4L
+ stw r4,SL_DBAT4+4(r1)
+ mfspr r4,SPRN_DBAT5U
+ stw r4,SL_DBAT5(r1)
+ mfspr r4,SPRN_DBAT5L
+ stw r4,SL_DBAT5+4(r1)
+ mfspr r4,SPRN_DBAT6U
+ stw r4,SL_DBAT6(r1)
+ mfspr r4,SPRN_DBAT6L
+ stw r4,SL_DBAT6+4(r1)
+ mfspr r4,SPRN_DBAT7U
+ stw r4,SL_DBAT7(r1)
+ mfspr r4,SPRN_DBAT7L
+ stw r4,SL_DBAT7+4(r1)
+ mfspr r4,SPRN_IBAT4U
+ stw r4,SL_IBAT4(r1)
+ mfspr r4,SPRN_IBAT4L
+ stw r4,SL_IBAT4+4(r1)
+ mfspr r4,SPRN_IBAT5U
+ stw r4,SL_IBAT5(r1)
+ mfspr r4,SPRN_IBAT5L
+ stw r4,SL_IBAT5+4(r1)
+ mfspr r4,SPRN_IBAT6U
+ stw r4,SL_IBAT6(r1)
+ mfspr r4,SPRN_IBAT6L
+ stw r4,SL_IBAT6+4(r1)
+ mfspr r4,SPRN_IBAT7U
+ stw r4,SL_IBAT7(r1)
+ mfspr r4,SPRN_IBAT7L
+ stw r4,SL_IBAT7+4(r1)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+
/* Backup various CPU config stuffs */
bl __save_cpu_setup
@@ -321,22 +364,37 @@ grackle_wake_up:
mtibatl 3,r4
BEGIN_MMU_FTR_SECTION
- li r4,0
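+ /* Restore the high BATs from the values saved at sleep time */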
+ lwz r4,SL_DBAT4(r1)
mtspr SPRN_DBAT4U,r4
+ lwz r4,SL_DBAT4+4(r1)
mtspr SPRN_DBAT4L,r4
+ lwz r4,SL_DBAT5(r1)
mtspr SPRN_DBAT5U,r4
+ lwz r4,SL_DBAT5+4(r1)
mtspr SPRN_DBAT5L,r4
+ lwz r4,SL_DBAT6(r1)
mtspr SPRN_DBAT6U,r4
+ lwz r4,SL_DBAT6+4(r1)
mtspr SPRN_DBAT6L,r4
+ lwz r4,SL_DBAT7(r1)
mtspr SPRN_DBAT7U,r4
+ lwz r4,SL_DBAT7+4(r1)
mtspr SPRN_DBAT7L,r4
+ lwz r4,SL_IBAT4(r1)
mtspr SPRN_IBAT4U,r4
+ lwz r4,SL_IBAT4+4(r1)
mtspr SPRN_IBAT4L,r4
+ lwz r4,SL_IBAT5(r1)
mtspr SPRN_IBAT5U,r4
+ lwz r4,SL_IBAT5+4(r1)
mtspr SPRN_IBAT5L,r4
+ lwz r4,SL_IBAT6(r1)
mtspr SPRN_IBAT6U,r4
+ lwz r4,SL_IBAT6+4(r1)
mtspr SPRN_IBAT6L,r4
+ lwz r4,SL_IBAT7(r1)
mtspr SPRN_IBAT7U,r4
+ lwz r4,SL_IBAT7+4(r1)
mtspr SPRN_IBAT7L,r4
END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 850eee860cf2..938803eab0ad 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -12,7 +12,6 @@ config PPC_POWERNV
select EPAPR_BOOT
select PPC_INDIRECT_PIO
select PPC_UDBG_16550
- select PPC_SCOM
select ARCH_RANDOM
select CPU_FREQ
select PPC_DOORBELL
@@ -47,3 +46,7 @@ config PPC_VAS
VAS adapters are found in POWER9 based systems.
If unsure, say N.
+
+config SCOM_DEBUGFS
+ bool "Expose SCOM controllers via debugfs"
+ depends on DEBUG_FS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index da2e99efbd04..a3ac9646119d 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,15 +4,19 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+obj-y += ultravisor.o
obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_OPAL_CORE) += opal-core.o
obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
obj-$(CONFIG_CXL_BASE) += pci-cxl.o
obj-$(CONFIG_EEH) += eeh-powernv.o
-obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
obj-$(CONFIG_OPAL_PRD) += opal-prd.o
obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
obj-$(CONFIG_OCXL_BASE) += ocxl.o
+obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 9ade4489f415..6bc24a47e9ef 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * The file intends to implement the platform dependent EEH operations on
- * powernv platform. Actually, the powernv was created in order to fully
- * hypervisor support.
+ * PowerNV Platform dependent EEH operations
*
* Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
*/
@@ -36,6 +34,7 @@
#include "powernv.h"
#include "pci.h"
+#include "../../../../drivers/pci/pci.h"
static int eeh_event_irq = -EINVAL;
@@ -43,13 +42,10 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
{
struct pci_dn *pdn = pci_get_pdn(pdev);
- if (!pdev->is_virtfn)
+ if (eeh_has_flag(EEH_FORCE_DISABLED))
return;
- /*
- * The following operations will fail if VF's sysfs files
- * aren't created or its resources aren't finalized.
- */
+ dev_dbg(&pdev->dev, "EEH: Setting up device\n");
eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
eeh_sysfs_add_device(pdev);
@@ -201,6 +197,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
#endif /* CONFIG_DEBUG_FS */
+void pnv_eeh_enable_phbs(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+ /*
+ * If EEH is enabled, we're going to rely on that.
+ * Otherwise, we restore to conventional mechanism
+ * to clear frozen PE during PCI config access.
+ */
+ if (eeh_enabled())
+ phb->flags |= PNV_PHB_FLAG_EEH;
+ else
+ phb->flags &= ~PNV_PHB_FLAG_EEH;
+ }
+}
+
/**
* pnv_eeh_post_init - EEH platform dependent post initialization
*
@@ -215,9 +230,7 @@ int pnv_eeh_post_init(void)
struct pnv_phb *phb;
int ret = 0;
- /* Probe devices & build address cache */
- eeh_probe_devices();
- eeh_addr_cache_build();
+ eeh_show_enabled();
/* Register OPAL event notifier */
eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
@@ -239,19 +252,11 @@ int pnv_eeh_post_init(void)
if (!eeh_enabled())
disable_irq(eeh_event_irq);
+ pnv_eeh_enable_phbs();
+
list_for_each_entry(hose, &hose_list, list_node) {
phb = hose->private_data;
- /*
- * If EEH is enabled, we're going to rely on that.
- * Otherwise, we restore to conventional mechanism
- * to clear frozen PE during PCI config access.
- */
- if (eeh_enabled())
- phb->flags |= PNV_PHB_FLAG_EEH;
- else
- phb->flags &= ~PNV_PHB_FLAG_EEH;
-
/* Create debugfs entries */
#ifdef CONFIG_DEBUG_FS
if (phb->has_dbgfs || !phb->dbgfs)
@@ -379,6 +384,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
+ eeh_edev_dbg(edev, "Probing device\n");
+
/* Initialize eeh device */
edev->class_code = pdn->class_code;
edev->mode &= 0xFFFFFF00;
@@ -404,9 +411,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
/* Create PE */
ret = eeh_add_to_parent_pe(edev);
if (ret) {
- pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n",
- __func__, hose->global_number, pdn->busno,
- PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
+ eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
return NULL;
}
@@ -455,11 +460,17 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
* Enable EEH explicitly so that we will do EEH check
* while accessing I/O stuff
*/
- eeh_add_flag(EEH_ENABLED);
+ if (!eeh_has_flag(EEH_ENABLED)) {
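+ /* The event IRQ may have been left disabled by pnv_eeh_post_init(); turn it back on */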
+ enable_irq(eeh_event_irq);
+ pnv_eeh_enable_phbs();
+ eeh_add_flag(EEH_ENABLED);
+ }
/* Save memory bars */
eeh_save_bars(edev);
+ eeh_edev_dbg(edev, "EEH enabled on device\n");
+
return NULL;
}
@@ -839,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
int aer = edev ? edev->aer_cap : 0;
u32 ctrl;
- pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+ pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
__func__, pci_domain_nr(dev->bus),
dev->bus->number, option);
@@ -897,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
return __pnv_eeh_bridge_reset(pdev, option);
+ pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
+ __func__, pci_domain_nr(pdev->bus),
+ pdev->bus->number, option);
+
switch (option) {
case EEH_RESET_FUNDAMENTAL:
scope = OPAL_RESET_PCI_FUNDAMENTAL;
@@ -1115,17 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
+ if (pci_is_root_bus(bus))
+ return pnv_eeh_root_reset(hose, option);
+
/*
- * If dealing with the root bus (or the bus underneath the
- * root port), we reset the bus underneath the root port.
+ * For hot resets, try to use the generic PCI error recovery reset
+ * functions. These correctly handle the case where the secondary
+ * bus is behind a hotplug slot and will use the slot-provided
+ * reset methods to prevent spurious hotplug events during the reset.
*
- * The cxl driver depends on this behaviour for bi-modal card
- * switching.
+ * Fundamental resets need to be handled internally to EEH since the
+ * PCI core doesn't really have a concept of a fundamental reset,
+ * mainly because there's no standard way to generate one. Only a
+ * few devices require an FRESET so it should be fine.
*/
- if (pci_is_root_bus(bus) ||
- pci_is_root_bus(bus->parent))
- return pnv_eeh_root_reset(hose, option);
+ if (option != EEH_RESET_FUNDAMENTAL) {
+ /*
+ * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
+ * de-assert step. It's like the OPAL reset API was
+ * poorly designed or something...
+ */
+ if (option == EEH_RESET_DEACTIVATE)
+ return 0;
+ rc = pci_bus_error_reset(bus->self);
+ if (!rc)
+ return 0;
+ }
+
+ /* otherwise, use the generic bridge reset. this might call into FW */
+ if (pci_is_root_bus(bus->parent))
+ return pnv_eeh_root_reset(hose, option);
return pnv_eeh_bridge_reset(bus->self, option);
}
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 2f4479b94ac3..78599bca66c2 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
sprs.ptcr = mfspr(SPRN_PTCR);
sprs.rpr = mfspr(SPRN_RPR);
sprs.tscr = mfspr(SPRN_TSCR);
- sprs.ldbar = mfspr(SPRN_LDBAR);
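+ /* LDBAR is ultravisor-privileged when an ultravisor is present, so skip it here */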
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ sprs.ldbar = mfspr(SPRN_LDBAR);
sprs_saved = true;
@@ -716,7 +717,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
* to reload MMCR0 (see mmcr0 comment above).
*/
if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
- asm volatile(PPC_INVALIDATE_ERAT);
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT);
mtspr(SPRN_MMCR0, mmcr0);
}
@@ -758,7 +759,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
mtspr(SPRN_PTCR, sprs.ptcr);
mtspr(SPRN_RPR, sprs.rpr);
mtspr(SPRN_TSCR, sprs.tscr);
- mtspr(SPRN_LDBAR, sprs.ldbar);
if (pls >= pnv_first_tb_loss_level) {
/* TB loss */
@@ -790,6 +790,8 @@ core_woken:
mtspr(SPRN_MMCR0, sprs.mmcr0);
mtspr(SPRN_MMCR1, sprs.mmcr1);
mtspr(SPRN_MMCR2, sprs.mmcr2);
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ mtspr(SPRN_LDBAR, sprs.ldbar);
mtspr(SPRN_SPRG3, local_paca->sprg_vdso);
@@ -1155,10 +1157,10 @@ static void __init pnv_power9_idle_init(void)
pnv_deepest_stop_psscr_mask);
}
- pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%lld\n",
+ pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
pnv_first_spr_loss_level);
- pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%lld\n",
+ pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
pnv_first_tb_loss_level);
}
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 5e53c1392d3b..eb2e75dac369 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -70,23 +70,23 @@ static int change_memblock_state(struct memory_block *mem, void *arg)
/* called with device_hotplug_lock held */
static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
{
- u64 end_pfn = start_pfn + nr_pages - 1;
+ const unsigned long start = PFN_PHYS(start_pfn);
+ const unsigned long size = PFN_PHYS(nr_pages);
- if (walk_memory_range(start_pfn, end_pfn, NULL,
- check_memblock_online))
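+ /* walk_memory_blocks() takes a physical address range (start/size in bytes), not PFNs */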
+ if (walk_memory_blocks(start, size, NULL, check_memblock_online))
return false;
- walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
- change_memblock_state);
+ walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
+ change_memblock_state);
if (offline_pages(start_pfn, nr_pages)) {
- walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
- change_memblock_state);
+ walk_memory_blocks(start, size, (void *)MEM_ONLINE,
+ change_memblock_state);
return false;
}
- walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
- change_memblock_state);
+ walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
+ change_memblock_state);
return true;
@@ -242,9 +242,8 @@ static int memtrace_online(void)
*/
if (!memhp_auto_online) {
lock_device_hotplug();
- walk_memory_range(PFN_DOWN(ent->start),
- PFN_UP(ent->start + ent->size - 1),
- NULL, online_mem_block);
+ walk_memory_blocks(ent->start, ent->size, NULL,
+ online_mem_block);
unlock_device_hotplug();
}
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index c321fdbc2200..b95b9e3c4c98 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -19,18 +19,25 @@
#include "pci.h"
-/*
- * spinlock to protect initialisation of an npu_context for a particular
- * mm_struct.
- */
-static DEFINE_SPINLOCK(npu_context_lock);
-
static struct pci_dev *get_pci_dev(struct device_node *dn)
{
struct pci_dn *pdn = PCI_DN(dn);
+ struct pci_dev *pdev;
- return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
+ pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
pdn->busno, pdn->devfn);
+
+ /*
+ * pci_get_domain_bus_and_slot() increases the reference count of
+ * the PCI device, but callers don't actually need that reference as
+ * the PE already holds one to the device. Since callers aren't
+ * aware of the reference count change, call pci_dev_put() now to
+ * avoid leaks.
+ */
+ if (pdev)
+ pci_dev_put(pdev);
+
+ return pdev;
}
/* Given a NPU device get the associated PCI device. */
@@ -82,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
}
EXPORT_SYMBOL(pnv_pci_get_npu_dev);
+#ifdef CONFIG_IOMMU_API
/*
 * Returns the PE associated with the PCI device of the given
* NPU. Returns the linked pci device if pci_dev != NULL.
@@ -185,106 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
return 0;
}
-/*
- * Enables 32 bit DMA on NPU.
- */
-static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
-{
- struct pci_dev *gpdev;
- struct pnv_ioda_pe *gpe;
- int64_t rc;
-
- /*
- * Find the assoicated PCI devices and get the dma window
- * information from there.
- */
- if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
- return;
-
- gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (!gpe)
- return;
-
- rc = pnv_npu_set_window(&npe->table_group, 0,
- gpe->table_group.tables[0]);
-
- /*
- * NVLink devices use the same TCE table configuration as
- * their parent device so drivers shouldn't be doing DMA
- * operations directly on these devices.
- */
- set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
-}
-
-/*
- * Enables bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicitly enabled or
- * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
- * active at the same time.
- */
-static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
-{
- struct pnv_phb *phb = npe->phb;
- int64_t rc = 0;
- phys_addr_t top = memblock_end_of_DRAM();
-
- if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
- return -EINVAL;
-
- rc = pnv_npu_unset_window(&npe->table_group, 0);
- if (rc != OPAL_SUCCESS)
- return rc;
-
- /* Enable the bypass window */
-
- top = roundup_pow_of_two(top);
- dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
- npe->pe_number);
- rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
- npe->pe_number, npe->pe_number,
- 0 /* bypass base */, top);
-
- if (rc == OPAL_SUCCESS)
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- return rc;
-}
-
-void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
-{
- int i;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- struct pnv_ioda_pe *npe;
- struct pci_dev *npdev;
-
- for (i = 0; ; ++i) {
- npdev = pnv_pci_get_npu_dev(gpdev, i);
-
- if (!npdev)
- break;
-
- pdn = pci_get_pdn(npdev);
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return;
-
- phb = pci_bus_to_host(npdev->bus)->private_data;
-
- /* We only do bypass if it's enabled on the linked device */
- npe = &phb->ioda.pe_array[pdn->pe_number];
-
- if (bypass) {
- dev_info(&npdev->dev,
- "Using 64-bit DMA iommu bypass\n");
- pnv_npu_dma_set_bypass(npe);
- } else {
- dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
- pnv_npu_dma_set_32(npe);
- }
- }
-}
-
-#ifdef CONFIG_IOMMU_API
/* Switch ownership from platform code to external user (e.g. VFIO) */
static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
{
@@ -359,15 +267,6 @@ struct npu_comp {
/* An NPU descriptor, valid for POWER9 only */
struct npu {
int index;
- __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
- unsigned int mmio_atsd_count;
-
- /* Bitmask for MMIO register usage */
- unsigned long mmio_atsd_usage;
-
- /* Do we need to explicitly flush the nest mmu? */
- bool nmmu_flush;
-
struct npu_comp npucomp;
};
@@ -624,534 +523,8 @@ struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
}
#endif /* CONFIG_IOMMU_API */
-/* Maximum number of nvlinks per npu */
-#define NV_MAX_LINKS 6
-
-/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
-static int max_npu2_index;
-
-struct npu_context {
- struct mm_struct *mm;
- struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
- struct mmu_notifier mn;
- struct kref kref;
- bool nmmu_flush;
-
- /* Callback to stop translation requests on a given GPU */
- void (*release_cb)(struct npu_context *context, void *priv);
-
- /*
- * Private pointer passed to the above callback for usage by
- * device drivers.
- */
- void *priv;
-};
-
-struct mmio_atsd_reg {
- struct npu *npu;
- int reg;
-};
-
-/*
- * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
- * if none are available.
- */
-static int get_mmio_atsd_reg(struct npu *npu)
-{
- int i;
-
- for (i = 0; i < npu->mmio_atsd_count; i++) {
- if (!test_bit(i, &npu->mmio_atsd_usage))
- if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
- return i;
- }
-
- return -ENOSPC;
-}
-
-static void put_mmio_atsd_reg(struct npu *npu, int reg)
-{
- clear_bit_unlock(reg, &npu->mmio_atsd_usage);
-}
-
-/* MMIO ATSD register offsets */
-#define XTS_ATSD_LAUNCH 0
-#define XTS_ATSD_AVA 1
-#define XTS_ATSD_STAT 2
-
-static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
-{
- unsigned long launch = 0;
-
- if (psize == MMU_PAGE_COUNT) {
- /* IS set to invalidate entire matching PID */
- launch |= PPC_BIT(12);
- } else {
- /* AP set to invalidate region of psize */
- launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
- }
-
- /* PRS set to process-scoped */
- launch |= PPC_BIT(13);
-
- /* PID */
- launch |= pid << PPC_BITLSHIFT(38);
-
- /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
-
- return launch;
-}
-
-static void mmio_atsd_regs_write(struct mmio_atsd_reg
- mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
- unsigned long val)
-{
- struct npu *npu;
- int i, reg;
-
- for (i = 0; i <= max_npu2_index; i++) {
- reg = mmio_atsd_reg[i].reg;
- if (reg < 0)
- continue;
-
- npu = mmio_atsd_reg[i].npu;
- __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
- }
-}
-
-static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
- unsigned long pid)
-{
- unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
-
- /* Invalidating the entire process doesn't use a va */
- mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
-}
-
-static void mmio_invalidate_range(struct mmio_atsd_reg
- mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
- unsigned long start, unsigned long psize)
-{
- unsigned long launch = get_atsd_launch_val(pid, psize);
-
- /* Write all VAs first */
- mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
-
- /* Issue one barrier for all address writes */
- eieio();
-
- /* Launch */
- mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
-}
-
-#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
-
-static void mmio_invalidate_wait(
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
- struct npu *npu;
- int i, reg;
-
- /* Wait for all invalidations to complete */
- for (i = 0; i <= max_npu2_index; i++) {
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- /* Wait for completion */
- npu = mmio_atsd_reg[i].npu;
- reg = mmio_atsd_reg[i].reg;
- while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
- cpu_relax();
- }
-}
-
-/*
- * Acquires all the address translation shootdown (ATSD) registers required to
- * launch an ATSD on all links this npu_context is active on.
- */
-static void acquire_atsd_reg(struct npu_context *npu_context,
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
- int i, j;
- struct npu *npu;
- struct pci_dev *npdev;
-
- for (i = 0; i <= max_npu2_index; i++) {
- mmio_atsd_reg[i].reg = -1;
- for (j = 0; j < NV_MAX_LINKS; j++) {
- /*
- * There are no ordering requirements with respect to
- * the setup of struct npu_context, but to ensure
- * consistent behaviour we need to ensure npdev[][] is
- * only read once.
- */
- npdev = READ_ONCE(npu_context->npdev[i][j]);
- if (!npdev)
- continue;
-
- npu = pci_bus_to_host(npdev->bus)->npu;
- if (!npu)
- continue;
-
- mmio_atsd_reg[i].npu = npu;
- mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
- while (mmio_atsd_reg[i].reg < 0) {
- mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
- cpu_relax();
- }
- break;
- }
- }
-}
-
-/*
- * Release previously acquired ATSD registers. To avoid deadlocks the registers
- * must be released in the same order they were acquired above in
- * acquire_atsd_reg.
- */
-static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
- int i;
-
- for (i = 0; i <= max_npu2_index; i++) {
- /*
- * We can't rely on npu_context->npdev[][] being the same here
- * as when acquire_atsd_reg() was called, hence we use the
- * values stored in mmio_atsd_reg during the acquire phase
- * rather than re-reading npdev[][].
- */
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
- }
-}
-
-/*
- * Invalidate a virtual address range
- */
-static void mmio_invalidate(struct npu_context *npu_context,
- unsigned long start, unsigned long size)
-{
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
- unsigned long pid = npu_context->mm->context.id;
- unsigned long atsd_start = 0;
- unsigned long end = start + size - 1;
- int atsd_psize = MMU_PAGE_COUNT;
-
- /*
- * Convert the input range into one of the supported sizes. If the range
- * doesn't fit, use the next larger supported size. Invalidation latency
- * is high, so over-invalidation is preferred to issuing multiple
- * invalidates.
- *
- * A 4K page size isn't supported by NPU/GPU ATS, so that case is
- * ignored.
- */
- if (size == SZ_64K) {
- atsd_start = start;
- atsd_psize = MMU_PAGE_64K;
- } else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
- atsd_start = ALIGN_DOWN(start, SZ_2M);
- atsd_psize = MMU_PAGE_2M;
- } else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
- atsd_start = ALIGN_DOWN(start, SZ_1G);
- atsd_psize = MMU_PAGE_1G;
- }
-
- if (npu_context->nmmu_flush)
- /*
- * Unfortunately the nest mmu does not support flushing specific
- * addresses so we have to flush the whole mm once before
- * shooting down the GPU translation.
- */
- flush_all_mm(npu_context->mm);
-
- /*
- * Loop over all the NPUs this process is active on and launch
- * an invalidate.
- */
- acquire_atsd_reg(npu_context, mmio_atsd_reg);
-
- if (atsd_psize == MMU_PAGE_COUNT)
- mmio_invalidate_pid(mmio_atsd_reg, pid);
- else
- mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
- atsd_psize);
-
- mmio_invalidate_wait(mmio_atsd_reg);
-
- /*
- * The GPU requires two flush ATSDs to ensure all entries have been
- * flushed. We use PID 0 as it will never be used for a process on the
- * GPU.
- */
- mmio_invalidate_pid(mmio_atsd_reg, 0);
- mmio_invalidate_wait(mmio_atsd_reg);
- mmio_invalidate_pid(mmio_atsd_reg, 0);
- mmio_invalidate_wait(mmio_atsd_reg);
-
- release_atsd_reg(mmio_atsd_reg);
-}
-
-static void pnv_npu2_mn_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
-
- /* Call into device driver to stop requests to the NMMU */
- if (npu_context->release_cb)
- npu_context->release_cb(npu_context, npu_context->priv);
-
- /*
- * There should be no more translation requests for this PID, but we
- * need to ensure any entries for it are removed from the TLB.
- */
- mmio_invalidate(npu_context, 0, ~0UL);
-}
-
-static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
- mmio_invalidate(npu_context, start, end - start);
-}
-
-static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
- .release = pnv_npu2_mn_release,
- .invalidate_range = pnv_npu2_mn_invalidate_range,
-};
-
-/*
- * Call into OPAL to setup the nmmu context for the current task in
- * the NPU. This must be called to setup the context tables before the
- * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
- *
- * A release callback should be registered to allow a device driver to
- * be notified that it should not launch any new translation requests
- * as the final TLB invalidate is about to occur.
- *
- * Returns an error if there no contexts are currently available or a
- * npu_context which should be passed to pnv_npu2_handle_fault().
- *
- * mmap_sem must be held in write mode and must not be called from interrupt
- * context.
- */
-struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
- unsigned long flags,
- void (*cb)(struct npu_context *, void *),
- void *priv)
-{
- int rc;
- u32 nvlink_index;
- struct device_node *nvlink_dn;
- struct mm_struct *mm = current->mm;
- struct npu *npu;
- struct npu_context *npu_context;
- struct pci_controller *hose;
-
- /*
- * At present we don't support GPUs connected to multiple NPUs and I'm
- * not sure the hardware does either.
- */
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-
- if (!npdev)
- /* No nvlink associated with this GPU device */
- return ERR_PTR(-ENODEV);
-
- /* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */
- if (flags & ~(MSR_DR | MSR_PR | MSR_HV))
- return ERR_PTR(-EINVAL);
-
- nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
- if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
- &nvlink_index)))
- return ERR_PTR(-ENODEV);
-
- if (!mm || mm->context.id == 0) {
- /*
- * Kernel thread contexts are not supported and context id 0 is
- * reserved on the GPU.
- */
- return ERR_PTR(-EINVAL);
- }
-
- hose = pci_bus_to_host(npdev->bus);
- npu = hose->npu;
- if (!npu)
- return ERR_PTR(-ENODEV);
-
- /*
- * We store the npu pci device so we can more easily get at the
- * associated npus.
- */
- spin_lock(&npu_context_lock);
- npu_context = mm->context.npu_context;
- if (npu_context) {
- if (npu_context->release_cb != cb ||
- npu_context->priv != priv) {
- spin_unlock(&npu_context_lock);
- return ERR_PTR(-EINVAL);
- }
-
- WARN_ON(!kref_get_unless_zero(&npu_context->kref));
- }
- spin_unlock(&npu_context_lock);
-
- if (!npu_context) {
- /*
- * We can set up these fields without holding the
- * npu_context_lock as the npu_context hasn't been returned to
- * the caller meaning it can't be destroyed. Parallel allocation
- * is protected against by mmap_sem.
- */
- rc = -ENOMEM;
- npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
- if (npu_context) {
- kref_init(&npu_context->kref);
- npu_context->mm = mm;
- npu_context->mn.ops = &nv_nmmu_notifier_ops;
- rc = __mmu_notifier_register(&npu_context->mn, mm);
- }
-
- if (rc) {
- kfree(npu_context);
- return ERR_PTR(rc);
- }
-
- mm->context.npu_context = npu_context;
- }
-
- npu_context->release_cb = cb;
- npu_context->priv = priv;
-
- /*
- * npdev is a pci_dev pointer setup by the PCI code. We assign it to
- * npdev[][] to indicate to the mmu notifiers that an invalidation
- * should also be sent over this nvlink. The notifiers don't use any
- * other fields in npu_context, so we just need to ensure that when they
- * deference npu_context->npdev[][] it is either a valid pointer or
- * NULL.
- */
- WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
-
- if (!npu->nmmu_flush) {
- /*
- * If we're not explicitly flushing ourselves we need to mark
- * the thread for global flushes
- */
- npu_context->nmmu_flush = false;
- mm_context_add_copro(mm);
- } else
- npu_context->nmmu_flush = true;
-
- return npu_context;
-}
-EXPORT_SYMBOL(pnv_npu2_init_context);
-
-static void pnv_npu2_release_context(struct kref *kref)
-{
- struct npu_context *npu_context =
- container_of(kref, struct npu_context, kref);
-
- if (!npu_context->nmmu_flush)
- mm_context_remove_copro(npu_context->mm);
-
- npu_context->mm->context.npu_context = NULL;
-}
-
-/*
- * Destroy a context on the given GPU. May free the npu_context if it is no
- * longer active on any GPUs. Must not be called from interrupt context.
- */
-void pnv_npu2_destroy_context(struct npu_context *npu_context,
- struct pci_dev *gpdev)
-{
- int removed;
- struct npu *npu;
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
- struct device_node *nvlink_dn;
- u32 nvlink_index;
- struct pci_controller *hose;
-
- if (WARN_ON(!npdev))
- return;
-
- hose = pci_bus_to_host(npdev->bus);
- npu = hose->npu;
- if (!npu)
- return;
- nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
- if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
- &nvlink_index)))
- return;
- WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
- spin_lock(&npu_context_lock);
- removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
- spin_unlock(&npu_context_lock);
-
- /*
- * We need to do this outside of pnv_npu2_release_context so that it is
- * outside the spinlock as mmu_notifier_destroy uses SRCU.
- */
- if (removed) {
- mmu_notifier_unregister(&npu_context->mn,
- npu_context->mm);
-
- kfree(npu_context);
- }
-
-}
-EXPORT_SYMBOL(pnv_npu2_destroy_context);
-
-/*
- * Assumes mmap_sem is held for the contexts associated mm.
- */
-int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
- unsigned long *flags, unsigned long *status, int count)
-{
- u64 rc = 0, result = 0;
- int i, is_write;
- struct page *page[1];
- const char __user *u;
- char c;
-
- /* mmap_sem should be held so the struct_mm must be present */
- struct mm_struct *mm = context->mm;
-
- WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
-
- for (i = 0; i < count; i++) {
- is_write = flags[i] & NPU2_WRITE;
- rc = get_user_pages_remote(NULL, mm, ea[i], 1,
- is_write ? FOLL_WRITE : 0,
- page, NULL, NULL);
-
- if (rc != 1) {
- status[i] = rc;
- result = -EFAULT;
- continue;
- }
-
- /* Make sure partition scoped tree gets a pte */
- u = page_address(page[0]);
- if (__get_user(c, u))
- result = -EFAULT;
-
- status[i] = 0;
- put_page(page[0]);
- }
-
- return result;
-}
-EXPORT_SYMBOL(pnv_npu2_handle_fault);
-
int pnv_npu2_init(struct pci_controller *hose)
{
- unsigned int i;
- u64 mmio_atsd;
static int npu_index;
struct npu *npu;
int ret;
@@ -1160,33 +533,18 @@ int pnv_npu2_init(struct pci_controller *hose)
if (!npu)
return -ENOMEM;
- npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush");
-
- for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) &&
- !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
- i, &mmio_atsd); i++)
- npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
-
- pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i);
- npu->mmio_atsd_count = i;
- npu->mmio_atsd_usage = 0;
npu_index++;
if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
ret = -ENOSPC;
goto fail_exit;
}
- max_npu2_index = npu_index;
npu->index = npu_index;
hose->npu = npu;
return 0;
fail_exit:
- for (i = 0; i < npu->mmio_atsd_count; ++i)
- iounmap(npu->mmio_atsd_regs[i]);
-
kfree(npu);
-
return ret;
}
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index 36c8fa3647a2..a2aa5e433ac8 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
-OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ);
OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
@@ -273,7 +273,6 @@ OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
-OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P);
OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
@@ -288,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
+OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE);
+OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG);
+OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG);
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 000000000000..ed895d82c048
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,636 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interface for exporting the OPAL ELF core.
+ * Heavily inspired by fs/proc/vmcore.c
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal core: " fmt
+
+#include <linux/memblock.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/crash_core.h>
+#include <linux/of.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+#define MAX_PT_LOAD_CNT 8
+
+/* NT_AUXV note related info */
+#define AUXV_CNT 1
+#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
+
+struct opalcore_config {
+ u32 num_cpus;
+ /* PIR value of crashing CPU */
+ u32 crashing_cpu;
+
+ /* CPU state data info from F/W */
+ u64 cpu_state_destination_vaddr;
+ u64 cpu_state_data_size;
+ u64 cpu_state_entry_size;
+
+ /* OPAL memory to be exported as PT_LOAD segments */
+ u64 ptload_addr[MAX_PT_LOAD_CNT];
+ u64 ptload_size[MAX_PT_LOAD_CNT];
+ u64 ptload_cnt;
+
+ /* Pointer to the first PT_LOAD in the ELF core file */
+ Elf64_Phdr *ptload_phdr;
+
+ /* Total size of opalcore file. */
+ size_t opalcore_size;
+
+ /* Buffer for all the ELF core headers and the PT_NOTE */
+ size_t opalcorebuf_sz;
+ char *opalcorebuf;
+
+ /* NT_AUXV buffer */
+ char auxv_buf[AUXV_DESC_SZ];
+};
+
+struct opalcore {
+ struct list_head list;
+ u64 paddr;
+ size_t size;
+ loff_t offset;
+};
+
+static LIST_HEAD(opalcore_list);
+static struct opalcore_config *oc_conf;
+static const struct opal_mpipl_fadump *opalc_metadata;
+static const struct opal_mpipl_fadump *opalc_cpu_metadata;
+
+/*
+ * Set the crashing CPU's signal to SIGUSR1 if the crash is triggered
+ * by the kernel, SIGTERM otherwise.
+ */
+bool kernel_initiated;
+
+static struct opalcore * __init get_new_element(void)
+{
+ return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
+}
+
+static inline int is_opalcore_usable(void)
+{
+ return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
+}
+
+static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
+ u32 type, void *data,
+ size_t data_len)
+{
+ Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
+ Elf64_Word namesz = strlen(name) + 1;
+
+ note->n_namesz = cpu_to_be32(namesz);
+ note->n_descsz = cpu_to_be32(data_len);
+ note->n_type = cpu_to_be32(type);
+ buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
+ memcpy(buf, name, namesz);
+ buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
+ memcpy(buf, data, data_len);
+ buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
+
+ return buf;
+}
+
+static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
+ struct pt_regs *regs)
+{
+ memset(prstatus, 0, sizeof(struct elf_prstatus));
+ elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs);
+
+ /*
+ * Overload PID with PIR value.
+ * As a PIR value could also be '0', add an offset of '100'
+ * to every PIR to avoid misinterpretations in GDB.
+ */
+ prstatus->pr_pid = cpu_to_be32(100 + pir);
+ prstatus->pr_ppid = cpu_to_be32(1);
+
+ /*
+ * Indicate SIGUSR1 for crash initiated from kernel.
+ * SIGTERM otherwise.
+ */
+ if (pir == oc_conf->crashing_cpu) {
+ short sig;
+
+ sig = kernel_initiated ? SIGUSR1 : SIGTERM;
+ prstatus->pr_cursig = cpu_to_be16(sig);
+ }
+}
+
+static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf,
+ u64 opal_boot_entry)
+{
+ Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
+ int idx = 0;
+
+ memset(bufp, 0, AUXV_DESC_SZ);
+
+ /* Entry point of OPAL */
+ bufp[idx++] = cpu_to_be64(AT_ENTRY);
+ bufp[idx++] = cpu_to_be64(opal_boot_entry);
+
+ /* end of vector */
+ bufp[idx++] = cpu_to_be64(AT_NULL);
+
+ buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
+ oc_conf->auxv_buf, AUXV_DESC_SZ);
+ return buf;
+}
+
+/*
+ * Read from the ELF header and then the crash dump.
+ * Returns number of bytes read on success, -errno on failure.
+ */
+static ssize_t read_opalcore(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ struct opalcore *m;
+ ssize_t tsz, avail;
+ loff_t tpos = pos;
+
+ if (pos >= oc_conf->opalcore_size)
+ return 0;
+
+ /* Adjust count if it goes beyond opalcore size */
+ avail = oc_conf->opalcore_size - pos;
+ if (count > avail)
+ count = avail;
+
+ if (count == 0)
+ return 0;
+
+ /* Read ELF core header and/or PT_NOTE segment */
+ if (tpos < oc_conf->opalcorebuf_sz) {
+ tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count);
+ memcpy(to, oc_conf->opalcorebuf + tpos, tsz);
+ to += tsz;
+ tpos += tsz;
+ count -= tsz;
+ }
+
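+ /* Copy the remainder from the OPAL memory regions backing the PT_LOAD segments */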
+ list_for_each_entry(m, &opalcore_list, list) {
+ /* nothing more to read here */
+ if (count == 0)
+ break;
+
+ if (tpos < m->offset + m->size) {
+ void *addr;
+
+ tsz = min_t(size_t, m->offset + m->size - tpos, count);
+ addr = (void *)(m->paddr + tpos - m->offset);
+ memcpy(to, __va(addr), tsz);
+ to += tsz;
+ tpos += tsz;
+ count -= tsz;
+ }
+ }
+
+ return (tpos - pos);
+}
+
+static struct bin_attribute opal_core_attr = {
+ .attr = {.name = "core", .mode = 0400},
+ .read = read_opalcore
+};
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ *
+ * Each register entry is 16 bytes: a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes, and the register value in the
+ * next 8 bytes. For more details refer to the F/W documentation.
+ */
+static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
+{
+ u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+ struct hdat_fadump_thread_hdr *thdr;
+ struct elf_prstatus prstatus;
+ Elf64_Word *first_cpu_note;
+ struct pt_regs regs;
+ char *bufp;
+ int i;
+
+ size_per_thread = oc_conf->cpu_state_entry_size;
+ bufp = __va(oc_conf->cpu_state_destination_vaddr);
+
+ /*
+ * The register entries offset, entry size and register count are
+ * duplicated in every thread header, in keeping with the HDAT format.
+ * Use these values from the first thread header.
+ */
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+ be32_to_cpu(thdr->offset));
+ reg_esize = be32_to_cpu(thdr->esize);
+ regs_cnt = be32_to_cpu(thdr->ecnt);
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("NumCpus : %u\n", oc_conf->num_cpus);
+ pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+ regs_offset, reg_esize, regs_cnt);
+
+ /*
+ * Skip past the first CPU note. Fill this note with the
+ * crashing CPU's prstatus.
+ */
+ first_cpu_note = buf;
+ buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+
+ for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ thread_pir = be32_to_cpu(thdr->pir);
+
+ pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+ i, thread_pir, thdr->core_state);
+
+ /*
+ * Register state data of MAX cores is provided by firmware,
+ * but some of these cores may not be active. So, while
+ * processing register state data, check core state and
+ * skip threads that belong to inactive cores.
+ */
+ if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+ continue;
+
+ opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+ reg_esize, false, &regs);
+
+ pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir,
+ be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip));
+ fill_prstatus(&prstatus, thread_pir, &regs);
+
+ if (thread_pir != oc_conf->crashing_cpu) {
+ buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
+ NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ } else {
+ /*
+ * Add crashing CPU as the first NT_PRSTATUS note for
+ * GDB to process the core file appropriately.
+ */
+ append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
+ NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ }
+ }
+
+ return buf;
+}
+
+static int __init create_opalcore(void)
+{
+ u64 opal_boot_entry, opal_base_addr, paddr;
+ u32 hdr_size, cpu_notes_size, count;
+ struct device_node *dn;
+ struct opalcore *new;
+ loff_t opalcore_off;
+ struct page *page;
+ Elf64_Phdr *phdr;
+ Elf64_Ehdr *elf;
+ int i, ret;
+ char *bufp;
+
+ /* Get size of header & CPU notes for OPAL core */
+ hdr_size = (sizeof(Elf64_Ehdr) +
+ ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr)));
+ cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES +
+ CRASH_CORE_NOTE_NAME_BYTES +
+ CRASH_CORE_NOTE_DESC_BYTES)) +
+ (CRASH_CORE_NOTE_HEAD_BYTES +
+ CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ));
+
+ /* Allocate buffer to setup OPAL core */
+ oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size);
+ oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!oc_conf->opalcorebuf) {
+ pr_err("Not enough memory to setup OPAL core (size: %lu)\n",
+ oc_conf->opalcorebuf_sz);
+ oc_conf->opalcorebuf_sz = 0;
+ return -ENOMEM;
+ }
+ count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
+ page = virt_to_page(oc_conf->opalcorebuf);
+ for (i = 0; i < count; i++)
+ mark_page_reserved(page + i);
+
+ pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
+
+ /* Read OPAL related device-tree entries */
+ dn = of_find_node_by_name(NULL, "ibm,opal");
+ if (dn) {
+ ret = of_property_read_u64(dn, "opal-base-address",
+ &opal_base_addr);
+ pr_debug("opal-base-address: %llx\n", opal_base_addr);
+ ret |= of_property_read_u64(dn, "opal-boot-address",
+ &opal_boot_entry);
+ pr_debug("opal-boot-address: %llx\n", opal_boot_entry);
+ }
+ if (!dn || ret)
+ pr_warn("WARNING: Failed to read OPAL base & entry values\n");
+
+ /* Use count to keep track of the program headers */
+ count = 0;
+
+ bufp = oc_conf->opalcorebuf;
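+ /* ELF header: multi-byte fields are stored big-endian (cpu_to_be*) */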
+ elf = (Elf64_Ehdr *)bufp;
+ bufp += sizeof(Elf64_Ehdr);
+ memcpy(elf->e_ident, ELFMAG, SELFMAG);
+ elf->e_ident[EI_CLASS] = ELF_CLASS;
+ elf->e_ident[EI_DATA] = ELFDATA2MSB;
+ elf->e_ident[EI_VERSION] = EV_CURRENT;
+ elf->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+ elf->e_type = cpu_to_be16(ET_CORE);
+ elf->e_machine = cpu_to_be16(ELF_ARCH);
+ elf->e_version = cpu_to_be32(EV_CURRENT);
+ elf->e_entry = 0;
+ elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr));
+ elf->e_shoff = 0;
+ elf->e_flags = 0;
+
+ elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr));
+ elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr));
+ elf->e_phnum = 0;
+ elf->e_shentsize = 0;
+ elf->e_shnum = 0;
+ elf->e_shstrndx = 0;
+
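+ /* First program header: the PT_NOTE segment carrying the CPU state and AUXV notes */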
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = cpu_to_be32(PT_NOTE);
+ phdr->p_flags = 0;
+ phdr->p_align = 0;
+ phdr->p_paddr = phdr->p_vaddr = 0;
+ phdr->p_offset = cpu_to_be64(hdr_size);
+ phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size);
+ count++;
+
+ opalcore_off = oc_conf->opalcorebuf_sz;
+ oc_conf->ptload_phdr = (Elf64_Phdr *)bufp;
+ paddr = 0;
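+ /* One PT_LOAD program header per preserved OPAL memory region */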
+ for (i = 0; i < oc_conf->ptload_cnt; i++) {
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = cpu_to_be32(PT_LOAD);
+ phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X);
+ phdr->p_align = 0;
+
+ new = get_new_element();
+ if (!new)
+ return -ENOMEM;
+ new->paddr = oc_conf->ptload_addr[i];
+ new->size = oc_conf->ptload_size[i];
+ new->offset = opalcore_off;
+ list_add_tail(&new->list, &opalcore_list);
+
+ phdr->p_paddr = cpu_to_be64(paddr);
+ phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr);
+ phdr->p_filesz = phdr->p_memsz =
+ cpu_to_be64(oc_conf->ptload_size[i]);
+ phdr->p_offset = cpu_to_be64(opalcore_off);
+
+ count++;
+ opalcore_off += oc_conf->ptload_size[i];
+ paddr += oc_conf->ptload_size[i];
+ }
+
+ elf->e_phnum = cpu_to_be16(count);
+
+ bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp);
+ bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry);
+
+ oc_conf->opalcore_size = opalcore_off;
+ return 0;
+}
+
+static void opalcore_cleanup(void)
+{
+ if (oc_conf == NULL)
+ return;
+
+ /* Remove OPAL core sysfs file */
+ sysfs_remove_bin_file(opal_kobj, &opal_core_attr);
+ oc_conf->ptload_phdr = NULL;
+ oc_conf->ptload_cnt = 0;
+
+ /* free the buffer used for setting up OPAL core */
+ if (oc_conf->opalcorebuf) {
+ void *end = (void *)((u64)oc_conf->opalcorebuf +
+ oc_conf->opalcorebuf_sz);
+
+ free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+ oc_conf->opalcorebuf = NULL;
+ oc_conf->opalcorebuf_sz = 0;
+ }
+
+ kfree(oc_conf);
+ oc_conf = NULL;
+}
+__exitcall(opalcore_cleanup);
+
+static void __init opalcore_config_init(void)
+{
+ u32 idx, cpu_data_version;
+ struct device_node *np;
+ const __be32 *prop;
+ u64 addr = 0;
+ int i, ret;
+
+ np = of_find_node_by_path("/ibm,opal/dump");
+ if (np == NULL)
+ return;
+
+ if (!of_device_is_compatible(np, "ibm,opal-dump")) {
+ pr_warn("Support missing for this f/w version!\n");
+ return;
+ }
+
+ /* Check if dump has been initiated on last reboot */
+ prop = of_get_property(np, "mpipl-boot", NULL);
+ if (!prop) {
+ of_node_put(np);
+ return;
+ }
+
+ /* Get OPAL metadata */
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get OPAL metadata (%d)\n", ret);
+ goto error_out;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("OPAL metadata addr: %llx\n", addr);
+ opalc_metadata = __va(addr);
+
+ /* Get OPAL CPU metadata */
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get OPAL CPU metadata (%d)\n", ret);
+ goto error_out;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("CPU metadata addr: %llx\n", addr);
+ opalc_cpu_metadata = __va(addr);
+
+ /* Allocate memory for config buffer */
+ oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL);
+ if (oc_conf == NULL)
+ goto error_out;
+
+ /* Parse OPAL metadata */
+ if (opalc_metadata->version != OPAL_MPIPL_VERSION) {
+ pr_warn("Supported OPAL metadata version: %u, found: %u!\n",
+ OPAL_MPIPL_VERSION, opalc_metadata->version);
+ pr_warn("WARNING: F/W using newer OPAL metadata format!!\n");
+ }
+
+ oc_conf->ptload_cnt = 0;
+ idx = be32_to_cpu(opalc_metadata->region_cnt);
+ if (idx > MAX_PT_LOAD_CNT) {
+ pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)\n",
+ idx, MAX_PT_LOAD_CNT);
+ idx = MAX_PT_LOAD_CNT;
+ }
+ for (i = 0; i < idx; i++) {
+ oc_conf->ptload_addr[oc_conf->ptload_cnt] =
+ be64_to_cpu(opalc_metadata->region[i].dest);
+ oc_conf->ptload_size[oc_conf->ptload_cnt++] =
+ be64_to_cpu(opalc_metadata->region[i].size);
+ }
+ oc_conf->ptload_cnt = i;
+ oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir);
+
+ if (!oc_conf->ptload_cnt) {
+ pr_err("OPAL memory regions not found\n");
+ goto error_out;
+ }
+
+ /* Parse OPAL CPU metadata */
+ cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version);
+ if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+ pr_warn("Supported CPU data version: %u, found: %u!\n",
+ HDAT_FADUMP_CPU_DATA_VER, cpu_data_version);
+ pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+ }
+
+ addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest);
+ if (!addr) {
+ pr_err("CPU state data not found!\n");
+ goto error_out;
+ }
+ oc_conf->cpu_state_destination_vaddr = (u64)__va(addr);
+
+ oc_conf->cpu_state_data_size =
+ be64_to_cpu(opalc_cpu_metadata->region[0].size);
+ oc_conf->cpu_state_entry_size =
+ be32_to_cpu(opalc_cpu_metadata->cpu_data_size);
+
+ if ((oc_conf->cpu_state_entry_size == 0) ||
+ (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) {
+ pr_err("CPU state data is invalid.\n");
+ goto error_out;
+ }
+ oc_conf->num_cpus = (oc_conf->cpu_state_data_size /
+ oc_conf->cpu_state_entry_size);
+
+ of_node_put(np);
+ return;
+
+error_out:
+ pr_err("Could not export /sys/firmware/opal/core\n");
+ opalcore_cleanup();
+ of_node_put(np);
+}
+
+static ssize_t fadump_release_opalcore_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int input = -1;
+
+ if (kstrtoint(buf, 0, &input))
+ return -EINVAL;
+
+ if (input == 1) {
+ if (oc_conf == NULL) {
+ pr_err("'/sys/firmware/opal/core' file not accessible!\n");
+ return -EPERM;
+ }
+
+ /*
+ * Take away '/sys/firmware/opal/core' and release all memory
+ * used for exporting this file.
+ */
+ opalcore_cleanup();
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore,
+ 0200, NULL,
+ fadump_release_opalcore_store);
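+
+/*
+ * The attribute is registered under kernel_kobj in opalcore_init() below,
+ * so it appears as /sys/kernel/fadump_release_opalcore. Writing '1' to it
+ * removes /sys/firmware/opal/core and frees the memory backing it via
+ * opalcore_cleanup().
+ */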
+
+static int __init opalcore_init(void)
+{
+ int rc = -1;
+
+ opalcore_config_init();
+
+ if (oc_conf == NULL)
+ return rc;
+
+ create_opalcore();
+
+ /*
+ * Bail out if the OPAL core buffer (oc_conf->opalcorebuf) could not
+ * be set up, as there is nothing to export in that case.
+ */
+ if (!(is_opalcore_usable())) {
+ pr_err("Failed to export /sys/firmware/opal/core\n");
+ opalcore_cleanup();
+ return rc;
+ }
+
+ /* Set OPAL core file size */
+ opal_core_attr.size = oc_conf->opalcore_size;
+
+ /* Export OPAL core sysfs file */
+ rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr);
+ if (rc != 0) {
+ pr_err("Failed to export /sys/firmware/opal/core\n");
+ opalcore_cleanup();
+ return rc;
+ }
+
+ rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr);
+ if (rc) {
+ pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n",
+ rc);
+ }
+
+ return 0;
+}
+fs_initcall(opalcore_init);
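+
+/*
+ * The exported /sys/firmware/opal/core is a regular ELF64 core: its PT_LOAD
+ * segments cover the preserved OPAL memory regions and its ELF notes carry
+ * the CPU state and auxv data built above, so the file can be parsed with
+ * standard ELF tools such as readelf.
+ */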
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
new file mode 100644
index 000000000000..d361d37d975f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+
+#ifdef CONFIG_PRESERVE_FA_DUMP
+/*
+ * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
+ * ensure crash data is preserved in hope that the subsequent memory
+ * preserving kernel boot is going to process this crash data.
+ */
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ const struct opal_fadump_mem_struct *opal_fdm_active;
+ const __be32 *prop;
+ unsigned long dn;
+ u64 addr = 0;
+ s64 ret;
+
+ dn = of_get_flat_dt_subnode_by_name(node, "dump");
+ if (dn == -FDT_ERR_NOTFOUND)
+ return;
+
+ /*
+ * Check whether a dump was initiated on the last reboot.
+ */
+ prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+ if (!prop)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_debug("Could not get Kernel metadata (%lld)\n", ret);
+ return;
+ }
+
+ /*
+ * Preserve memory only if kernel memory regions are registered
+ * with f/w for MPIPL.
+ */
+ addr = be64_to_cpu(addr);
+ pr_debug("Kernel metadata addr: %llx\n", addr);
+ opal_fdm_active = (void *)addr;
+ if (opal_fdm_active->registered_regions == 0)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get boot memory tag (%lld)\n", ret);
+ return;
+ }
+
+ /*
+ * Memory below this address can be used for booting a
+ * capture kernel or petitboot kernel. Preserve everything
+ * above this address for processing crashdump.
+ */
+ fadump_conf->boot_mem_top = be64_to_cpu(addr);
+ pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top);
+
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+}
+
+#else /* CONFIG_PRESERVE_FA_DUMP */
+static const struct opal_fadump_mem_struct *opal_fdm_active;
+static const struct opal_mpipl_fadump *opal_cpu_metadata;
+static struct opal_fadump_mem_struct *opal_fdm;
+
+#ifdef CONFIG_OPAL_CORE
+extern bool kernel_initiated;
+#endif
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf);
+
+static void opal_fadump_update_config(struct fw_dump *fadump_conf,
+ const struct opal_fadump_mem_struct *fdm)
+{
+ pr_debug("Boot memory regions count: %d\n", fdm->region_cnt);
+
+ /*
+ * The destination address of the first boot memory region marks the
+ * start of the area the boot memory regions are copied to.
+ */
+ fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest;
+ pr_debug("Destination address of boot memory regions: %#016llx\n",
+ fadump_conf->boot_mem_dest_addr);
+
+ fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
+}
+
+/*
+ * This function is called in the capture kernel to get configuration details
+ * from metadata set up by the first kernel.
+ */
+static void opal_fadump_get_config(struct fw_dump *fadump_conf,
+ const struct opal_fadump_mem_struct *fdm)
+{
+ unsigned long base, size, last_end, hole_size;
+ int i;
+
+ if (!fadump_conf->dump_active)
+ return;
+
+ last_end = 0;
+ hole_size = 0;
+ fadump_conf->boot_memory_size = 0;
+
+ pr_debug("Boot memory regions:\n");
+ for (i = 0; i < fdm->region_cnt; i++) {
+ base = fdm->rgn[i].src;
+ size = fdm->rgn[i].size;
+ pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
+
+ fadump_conf->boot_mem_addr[i] = base;
+ fadump_conf->boot_mem_sz[i] = size;
+ fadump_conf->boot_memory_size += size;
+ hole_size += (base - last_end);
+
+ last_end = base + size;
+ }
+
+ /*
+ * Start address of reserve dump area (permanent reservation) for
+ * re-registering FADump after dump capture.
+ */
+ fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest;
+
+ /*
+ * Rarely, the system may crash before all boot memory regions are
+ * registered for MPIPL. In such cases, warn that the vmcore may not
+ * be accurate and proceed anyway, as that is the best bet: free
+ * pages, cache pages, user pages, etc. are usually filtered out
+ * while saving the vmcore, so hopefully the memory that could not
+ * be preserved only contains pages that get filtered out anyway.
+ */
+ if (fdm->region_cnt > fdm->registered_regions) {
+ pr_warn("Not all memory regions were saved!!!\n");
+ pr_warn(" Unsaved memory regions:\n");
+ i = fdm->registered_regions;
+ while (i < fdm->region_cnt) {
+ pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n",
+ i, fdm->rgn[i].src, fdm->rgn[i].size);
+ i++;
+ }
+
+ pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n");
+ pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n");
+ }
+
+ fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
+ fadump_conf->boot_mem_regs_cnt = fdm->region_cnt;
+ opal_fadump_update_config(fadump_conf, fdm);
+}
+
+/* Initialize kernel metadata */
+static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
+{
+ fdm->version = OPAL_FADUMP_VERSION;
+ fdm->region_cnt = 0;
+ fdm->registered_regions = 0;
+ fdm->fadumphdr_addr = 0;
+}
+
+static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+ u64 addr = fadump_conf->reserve_dump_area_start;
+ int i;
+
+ opal_fdm = __va(fadump_conf->kernel_metadata);
+ opal_fadump_init_metadata(opal_fdm);
+
+ /* Boot memory regions */
+ for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
+ opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i];
+ opal_fdm->rgn[i].dest = addr;
+ opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i];
+
+ opal_fdm->region_cnt++;
+ addr += fadump_conf->boot_mem_sz[i];
+ }
+
+ /*
+ * Kernel metadata is passed to f/w and retrieved in the capture kernel.
+ * So, use it to save the fadump header address instead of calculating it.
+ */
+ opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest +
+ fadump_conf->boot_memory_size);
+
+ opal_fadump_update_config(fadump_conf, opal_fdm);
+
+ return addr;
+}
+
+static u64 opal_fadump_get_metadata_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct));
+}
+
+static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
+{
+ int err = 0;
+ s64 ret;
+
+ /*
+ * Use the last page(s) in FADump memory reservation for
+ * kernel metadata.
+ */
+ fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start +
+ fadump_conf->reserve_dump_area_size -
+ opal_fadump_get_metadata_size());
+ pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata);
+
+ /* Initialize kernel metadata before registering the address with f/w */
+ opal_fdm = __va(fadump_conf->kernel_metadata);
+ opal_fadump_init_metadata(opal_fdm);
+
+ /*
+ * Register metadata address with f/w. Can be retrieved in
+ * the capture kernel.
+ */
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL,
+ fadump_conf->kernel_metadata);
+ if (ret != OPAL_SUCCESS) {
+ pr_err("Failed to set kernel metadata tag!\n");
+ err = -EPERM;
+ }
+
+ /*
+ * Register boot memory top address with f/w. Should be retrieved
+ * by a kernel that intends to preserve the crashed kernel's memory.
+ */
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
+ fadump_conf->boot_mem_top);
+ if (ret != OPAL_SUCCESS) {
+ pr_err("Failed to set boot memory tag!\n");
+ err = -EPERM;
+ }
+
+ return err;
+}
+
+static u64 opal_fadump_get_bootmem_min(void)
+{
+ return OPAL_FADUMP_MIN_BOOT_MEM;
+}
+
+static int opal_fadump_register(struct fw_dump *fadump_conf)
+{
+ s64 rc = OPAL_PARAMETER;
+ int i, err = -EIO;
+
+ for (i = 0; i < opal_fdm->region_cnt; i++) {
+ rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE,
+ opal_fdm->rgn[i].src,
+ opal_fdm->rgn[i].dest,
+ opal_fdm->rgn[i].size);
+ if (rc != OPAL_SUCCESS)
+ break;
+
+ opal_fdm->registered_regions++;
+ }
+
+ switch (rc) {
+ case OPAL_SUCCESS:
+ pr_info("Registration is successful!\n");
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case OPAL_RESOURCE:
+ /* If MAX regions limit in f/w is hit, warn and proceed. */
+ pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n",
+ (opal_fdm->region_cnt - opal_fdm->registered_regions));
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case OPAL_PARAMETER:
+ pr_err("Failed to register. Parameter Error(%lld).\n", rc);
+ break;
+ case OPAL_HARDWARE:
+ pr_err("Support not available.\n");
+ fadump_conf->fadump_supported = 0;
+ fadump_conf->fadump_enabled = 0;
+ break;
+ default:
+ pr_err("Failed to register. Unknown Error(%lld).\n", rc);
+ break;
+ }
+
+ /*
+ * If some regions were registered before the OPAL_MPIPL_ADD_RANGE
+ * OPAL call failed, unregister all regions.
+ */
+ if ((err < 0) && (opal_fdm->registered_regions > 0))
+ opal_fadump_unregister(fadump_conf);
+
+ return err;
+}
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf)
+{
+ s64 rc;
+
+ rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0);
+ if (rc) {
+ pr_err("Failed to un-register - unexpected Error(%lld).\n", rc);
+ return -EIO;
+ }
+
+ opal_fdm->registered_regions = 0;
+ fadump_conf->dump_registered = 0;
+ return 0;
+}
+
+static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+ s64 rc;
+
+ rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0);
+ if (rc) {
+ pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc);
+ return -EIO;
+ }
+
+ fadump_conf->dump_active = 0;
+ opal_fdm_active = NULL;
+ return 0;
+}
+
+static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
+{
+ s64 ret;
+
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0);
+ if (ret != OPAL_SUCCESS)
+ pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
+}
+
+/*
+ * Verify if CPU state data is available. If available, do a bit of sanity
+ * checking before processing this data.
+ */
+static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf)
+{
+ if (!opal_cpu_metadata)
+ return false;
+
+ fadump_conf->cpu_state_data_version =
+ be32_to_cpu(opal_cpu_metadata->cpu_data_version);
+ fadump_conf->cpu_state_entry_size =
+ be32_to_cpu(opal_cpu_metadata->cpu_data_size);
+ fadump_conf->cpu_state_dest_vaddr =
+ (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest));
+ fadump_conf->cpu_state_data_size =
+ be64_to_cpu(opal_cpu_metadata->region[0].size);
+
+ if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+ pr_warn("Supported CPU state data version: %u, found: %d!\n",
+ HDAT_FADUMP_CPU_DATA_VER,
+ fadump_conf->cpu_state_data_version);
+ pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+ }
+
+ if ((fadump_conf->cpu_state_dest_vaddr == 0) ||
+ (fadump_conf->cpu_state_entry_size == 0) ||
+ (fadump_conf->cpu_state_entry_size >
+ fadump_conf->cpu_state_data_size)) {
+ pr_err("CPU state data is invalid. Ignoring!\n");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Convert CPU state data saved at the time of crash into ELF notes.
+ *
+ * While the crashing CPU's register data is saved by the kernel, CPU state
+ * data for all CPUs is saved by f/w. In CPU state data provided by f/w,
+ * each register entry is 16 bytes: a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details, refer to the F/W documentation. If this data
+ * is missing or in an unsupported format, append the crashing CPU's
+ * register data saved by the kernel to the PT_NOTE, to have something to
+ * work with in the vmcore file.
+ */
+static int __init
+opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
+ struct fadump_crash_info_header *fdh)
+{
+ u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+ struct hdat_fadump_thread_hdr *thdr;
+ bool is_cpu_data_valid = false;
+ u32 num_cpus = 1, *note_buf;
+ struct pt_regs regs;
+ char *bufp;
+ int rc, i;
+
+ if (is_opal_fadump_cpu_data_valid(fadump_conf)) {
+ size_per_thread = fadump_conf->cpu_state_entry_size;
+ num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread);
+ bufp = __va(fadump_conf->cpu_state_dest_vaddr);
+ is_cpu_data_valid = true;
+ }
+
+ rc = fadump_setup_cpu_notes_buf(num_cpus);
+ if (rc != 0)
+ return rc;
+
+ note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+ if (!is_cpu_data_valid)
+ goto out;
+
+ /*
+ * The offset for register entries, the entry size and the register count
+ * are duplicated in every thread header, in keeping with the HDAT format.
+ * Use these values from the first thread header.
+ */
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+ be32_to_cpu(thdr->offset));
+ reg_esize = be32_to_cpu(thdr->esize);
+ regs_cnt = be32_to_cpu(thdr->ecnt);
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("NumCpus : %u\n", num_cpus);
+ pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+ regs_offset, reg_esize, regs_cnt);
+
+ for (i = 0; i < num_cpus; i++, bufp += size_per_thread) {
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+
+ thread_pir = be32_to_cpu(thdr->pir);
+ pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+ i, thread_pir, thdr->core_state);
+
+ /*
+ * If this is a kernel-initiated crash, crashing_cpu is set
+ * appropriately and the crashing CPU's register data was saved by
+ * the crashing kernel. Add that saved register data to the ELF
+ * notes and populate the pt_regs for the remaining CPUs from the
+ * register state data provided by firmware.
+ */
+ if (fdh->crashing_cpu == thread_pir) {
+ note_buf = fadump_regs_to_elf_notes(note_buf,
+ &fdh->regs);
+ pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+ fdh->crashing_cpu, fdh->regs.gpr[1],
+ fdh->regs.nip);
+ continue;
+ }
+
+ /*
+ * Register state data for the maximum number of cores is provided
+ * by firmware, but some of these cores may not be active. So,
+ * while processing register state data, check the core state and
+ * skip threads that belong to inactive cores.
+ */
+ if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+ continue;
+
+ opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+ reg_esize, true, &regs);
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+ thread_pir, regs.gpr[1], regs.nip);
+ }
+
+out:
+ /*
+ * CPU state data is invalid/unsupported. Try appending crashing CPU's
+ * register data, if it is saved by the kernel.
+ */
+ if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) {
+ if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) {
+ fadump_free_cpu_notes_buf();
+ return -ENODEV;
+ }
+
+ pr_warn("WARNING: appending only crashing CPU's register data\n");
+ note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+ }
+
+ final_note(note_buf);
+
+ pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+ fdh->elfcorehdr_addr);
+ fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+ return 0;
+}
+
+static int __init opal_fadump_process(struct fw_dump *fadump_conf)
+{
+ struct fadump_crash_info_header *fdh;
+ int rc = -EINVAL;
+
+ if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
+ return rc;
+
+ /* Validate the fadump crash info header */
+ fdh = __va(fadump_conf->fadumphdr_addr);
+ if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+ pr_err("Crash info header is not valid.\n");
+ return rc;
+ }
+
+#ifdef CONFIG_OPAL_CORE
+ /*
+ * If this is a kernel-initiated crash, crashing_cpu is set
+ * appropriately and the crashing CPU's register data was saved by
+ * the crashing kernel. Add that saved register data to the ELF
+ * notes and populate the pt_regs for the remaining CPUs from the
+ * register state data provided by firmware.
+ */
+ if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)
+ kernel_initiated = true;
+#endif
+
+ rc = opal_fadump_build_cpu_notes(fadump_conf, fdh);
+ if (rc)
+ return rc;
+
+ /*
+ * We are done validating dump info and the elfcore header is now
+ * ready to be exported. Set elfcorehdr_addr so that the vmcore
+ * module will export the elfcore header through '/proc/vmcore'.
+ */
+ elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+ return rc;
+}
+
+static void opal_fadump_region_show(struct fw_dump *fadump_conf,
+ struct seq_file *m)
+{
+ const struct opal_fadump_mem_struct *fdm_ptr;
+ u64 dumped_bytes = 0;
+ int i;
+
+ if (fadump_conf->dump_active)
+ fdm_ptr = opal_fdm_active;
+ else
+ fdm_ptr = opal_fdm;
+
+ for (i = 0; i < fdm_ptr->region_cnt; i++) {
+ /*
+ * Only regions that are registered for MPIPL
+ * would have dump data.
+ */
+ if ((fadump_conf->dump_active) &&
+ (i < fdm_ptr->registered_regions))
+ dumped_bytes = fdm_ptr->rgn[i].size;
+
+ seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+ fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest);
+ seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
+ fdm_ptr->rgn[i].size, dumped_bytes);
+ }
+
+ /* Dump is active. Show reserved area start address. */
+ if (fadump_conf->dump_active) {
+ seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+ fadump_conf->reserve_dump_area_start);
+ }
+}
+
+static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
+ const char *msg)
+{
+ int rc;
+
+ /*
+ * Unlike on the pSeries platform, the logical CPU number is not
+ * provided with the architected register state data. So, store the
+ * crashing CPU's PIR instead, to plug the appropriate register data
+ * for the crashing CPU into the vmcore file.
+ */
+ fdh->crashing_cpu = (u32)mfspr(SPRN_PIR);
+
+ rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
+ if (rc == OPAL_UNSUPPORTED) {
+ pr_emerg("Reboot type %d not supported.\n",
+ OPAL_REBOOT_MPIPL);
+ } else if (rc == OPAL_HARDWARE)
+ pr_emerg("No backend support for MPIPL!\n");
+}
+
+static struct fadump_ops opal_fadump_ops = {
+ .fadump_init_mem_struct = opal_fadump_init_mem_struct,
+ .fadump_get_metadata_size = opal_fadump_get_metadata_size,
+ .fadump_setup_metadata = opal_fadump_setup_metadata,
+ .fadump_get_bootmem_min = opal_fadump_get_bootmem_min,
+ .fadump_register = opal_fadump_register,
+ .fadump_unregister = opal_fadump_unregister,
+ .fadump_invalidate = opal_fadump_invalidate,
+ .fadump_cleanup = opal_fadump_cleanup,
+ .fadump_process = opal_fadump_process,
+ .fadump_region_show = opal_fadump_region_show,
+ .fadump_trigger = opal_fadump_trigger,
+};
+
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ const __be32 *prop;
+ unsigned long dn;
+ u64 addr = 0;
+ int i, len;
+ s64 ret;
+
+ /*
+ * Check if Firmware-Assisted Dump is supported. If yes, check
+ * whether a dump was initiated on the last reboot.
+ */
+ dn = of_get_flat_dt_subnode_by_name(node, "dump");
+ if (dn == -FDT_ERR_NOTFOUND) {
+ pr_debug("FADump support is missing!\n");
+ return;
+ }
+
+ if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) {
+ pr_err("Support missing for this f/w version!\n");
+ return;
+ }
+
+ prop = of_get_flat_dt_prop(dn, "fw-load-area", &len);
+ if (prop) {
+ /*
+ * Each f/w load area is an (address,size) pair,
+ * 2 cells each, totalling 4 cells per range.
+ */
+ for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+ u64 base, end;
+
+ base = of_read_number(prop + (i * 4) + 0, 2);
+ end = base;
+ end += of_read_number(prop + (i * 4) + 2, 2);
+ if (end > OPAL_FADUMP_MIN_BOOT_MEM) {
+ pr_err("F/W load area: 0x%llx-0x%llx\n",
+ base, end);
+ pr_err("F/W version not supported!\n");
+ return;
+ }
+ }
+ }
+
+ fadump_conf->ops = &opal_fadump_ops;
+ fadump_conf->fadump_supported = 1;
+
+ /*
+ * Firmware supports only a 32-bit field for the copy size. Align it
+ * down to PAGE_SIZE and request firmware to copy multiple kernel
+ * boot memory regions when needed.
+ */
+ fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+
+ /*
+ * Check whether a dump was initiated on the last reboot.
+ */
+ prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+ if (!prop)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get Kernel metadata (%lld)\n", ret);
+ return;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("Kernel metadata addr: %llx\n", addr);
+
+ opal_fdm_active = __va(addr);
+ if (opal_fdm_active->version != OPAL_FADUMP_VERSION) {
+ pr_warn("Supported kernel metadata version: %u, found: %d!\n",
+ OPAL_FADUMP_VERSION, opal_fdm_active->version);
+ pr_warn("WARNING: Kernel metadata format mismatch identified! Core file maybe corrupted..\n");
+ }
+
+ /* Kernel regions not registered with f/w for MPIPL */
+ if (opal_fdm_active->registered_regions == 0) {
+ opal_fdm_active = NULL;
+ return;
+ }
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+ if (addr) {
+ addr = be64_to_cpu(addr);
+ pr_debug("CPU metadata addr: %llx\n", addr);
+ opal_cpu_metadata = __va(addr);
+ }
+
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+ opal_fadump_get_config(fadump_conf, opal_fdm_active);
+}
+#endif /* !CONFIG_PRESERVE_FA_DUMP */
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
new file mode 100644
index 000000000000..f1e9ecf548c5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _POWERNV_OPAL_FADUMP_H
+#define _POWERNV_OPAL_FADUMP_H
+
+#include <asm/reg.h>
+
+/*
+ * With the kernel & initrd loaded at 512MB (with a 256MB size), enforce a
+ * minimum boot memory size of 768MB to ensure that the kernel and initrd
+ * loaded by f/w don't mess with the crashed kernel's memory during MPIPL.
+ */
+#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL)
+
+/*
+ * OPAL FADump metadata structure format version
+ *
+ * The OPAL FADump kernel metadata structure stores the kernel metadata
+ * needed to register for and process a crash dump. The format version is
+ * used to track changes to the structure format. Changes, if any, are
+ * expected to be minimal and backward compatible.
+ */
+#define OPAL_FADUMP_VERSION 0x1
+
+/*
+ * OPAL FADump kernel metadata
+ *
+ * The address of this structure will be registered with f/w for retrieving
+ * and processing during crash dump.
+ */
+struct opal_fadump_mem_struct {
+ u8 version;
+ u8 reserved[3];
+ u16 region_cnt; /* number of regions */
+ u16 registered_regions; /* Regions registered for MPIPL */
+ u64 fadumphdr_addr;
+ struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS];
+} __packed;
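+
+/*
+ * The address of this structure is registered with f/w via
+ * opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, ...) in
+ * opal_fadump_setup_metadata() and looked up again with
+ * opal_mpipl_query_tag() in opal_fadump_dt_scan() in the capture kernel.
+ */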
+
+/*
+ * CPU state data
+ *
+ * CPU state data is provided by f/w. The format for this data is defined
+ * in the HDAT spec. The version is used to track changes to this CPU state
+ * data format. Changes to this format are unlikely, but if there are any,
+ * please refer to the latest HDAT specification.
+ */
+#define HDAT_FADUMP_CPU_DATA_VER 1
+
+#define HDAT_FADUMP_CORE_INACTIVE (0x0F)
+
+/* HDAT thread header for register entries */
+struct hdat_fadump_thread_hdr {
+ __be32 pir;
+ /* 0x00 - 0x0F - The corresponding stop state of the core */
+ u8 core_state;
+ u8 reserved[3];
+
+ __be32 offset; /* Offset to Register Entries array */
+ __be32 ecnt; /* Number of entries */
+ __be32 esize; /* Alloc size of each array entry in bytes */
+ __be32 eactsz; /* Actual size of each array entry in bytes */
+} __packed;
+
+/* Register types populated by f/w */
+#define HDAT_FADUMP_REG_TYPE_GPR 0x01
+#define HDAT_FADUMP_REG_TYPE_SPR 0x02
+
+/* ID numbers used by f/w while populating certain registers */
+#define HDAT_FADUMP_REG_ID_NIP 0x7D0
+#define HDAT_FADUMP_REG_ID_MSR 0x7D1
+#define HDAT_FADUMP_REG_ID_CCR 0x7D2
+
+/* HDAT register entry. */
+struct hdat_fadump_reg_entry {
+ __be32 reg_type;
+ __be32 reg_num;
+ __be64 reg_val;
+} __packed;
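+
+/*
+ * Each entry is 16 bytes: reg_type and reg_num in the first 8 bytes and the
+ * register value in the last 8. GPR entries carry the GPR number in reg_num,
+ * while other entries carry either an SPR number (SPRN_LR, SPRN_CTR, ...) or
+ * one of the HDAT_FADUMP_REG_ID_* values above (NIP/MSR/CCR), as decoded by
+ * opal_fadump_set_regval_regnum() below.
+ */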
+
+static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
+ u32 reg_type, u32 reg_num,
+ u64 reg_val)
+{
+ if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
+ if (reg_num < 32)
+ regs->gpr[reg_num] = reg_val;
+ return;
+ }
+
+ switch (reg_num) {
+ case SPRN_CTR:
+ regs->ctr = reg_val;
+ break;
+ case SPRN_LR:
+ regs->link = reg_val;
+ break;
+ case SPRN_XER:
+ regs->xer = reg_val;
+ break;
+ case SPRN_DAR:
+ regs->dar = reg_val;
+ break;
+ case SPRN_DSISR:
+ regs->dsisr = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_NIP:
+ regs->nip = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_MSR:
+ regs->msr = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_CCR:
+ regs->ccr = reg_val;
+ break;
+ }
+}
+
+static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
+ unsigned int reg_entry_size,
+ bool cpu_endian,
+ struct pt_regs *regs)
+{
+ struct hdat_fadump_reg_entry *reg_entry;
+ u64 val;
+ int i;
+
+ memset(regs, 0, sizeof(struct pt_regs));
+
+ for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+ reg_entry = (struct hdat_fadump_reg_entry *)bufp;
+ val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) :
+ reg_entry->reg_val);
+ opal_fadump_set_regval_regnum(regs,
+ be32_to_cpu(reg_entry->reg_type),
+ be32_to_cpu(reg_entry->reg_num),
+ val);
+ }
+}
+
+#endif /* _POWERNV_OPAL_FADUMP_H */
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index 5cae375525d0..3e1f064a18db 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -137,6 +137,43 @@ static void print_nx_checkstop_reason(const char *level,
xstop_reason[i].description);
}
+static void print_npu_checkstop_reason(const char *level,
+ struct OpalHMIEvent *hmi_evt)
+{
+ uint8_t reason, reason_count, i;
+
+ /*
+ * We may not have a checkstop reason on some combination of
+ * hardware and/or skiboot version
+ */
+ if (!hmi_evt->u.xstop_error.xstop_reason) {
+ printk("%s NPU checkstop on chip %x\n", level,
+ be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
+ return;
+ }
+
+ /*
+ * NPU2 has 3 FIRs. Reason encoded on a byte as:
+ * 2 bits for the FIR number
+ * 6 bits for the bit number
+ * It may be possible to find several reasons.
+ *
+ * We don't display a specific message per FIR bit as there
+ * are too many and most are meaningless without the workbook
+ * and/or hw team help anyway.
+ */
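+ /* For example, a reason byte of 0x45 (0b01000101) decodes as FIR1, bit 5. */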
+ reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
+ sizeof(reason);
+ for (i = 0; i < reason_count; i++) {
+ reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
+ if (reason)
+ printk("%s NPU checkstop on chip %x: FIR%d bit %d is set\n",
+ level,
+ be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
+ reason >> 6, reason & 0x3F);
+ }
+}
+
static void print_checkstop_reason(const char *level,
struct OpalHMIEvent *hmi_evt)
{
@@ -148,6 +185,9 @@ static void print_checkstop_reason(const char *level,
case CHECKSTOP_TYPE_NX:
print_nx_checkstop_reason(level, hmi_evt);
break;
+ case CHECKSTOP_TYPE_NPU:
+ print_npu_checkstop_reason(level, hmi_evt);
+ break;
default:
printk("%s Unknown Malfunction Alert of type %d\n",
level, type);
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..e04b20625cb9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node,
struct imc_pmu *pmu_ptr)
{
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
- int chip = 0, nid;
char mode[16], cmd[16];
u32 cb_offset;
+ struct imc_mem_info *ptr = pmu_ptr->mem_info;
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
@@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node,
if (of_property_read_u32(node, "cb_offset", &cb_offset))
cb_offset = IMC_CNTL_BLK_OFFSET;
- for_each_node(nid) {
- loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
+ while (ptr->vbase != NULL) {
+ loc = (u64)(ptr->vbase) + cb_offset;
imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
- sprintf(mode, "imc_mode_%d", nid);
+ sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
imc_mode_addr))
goto err;
imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
- sprintf(cmd, "imc_cmd_%d", nid);
+ sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
imc_cmd_addr))
goto err;
- chip++;
+ ptr++;
}
return;
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index dc51d03c6370..d26da19a611f 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -29,23 +29,23 @@ struct memcons {
static struct memcons *opal_memcons = NULL;
-ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count)
{
const char *conbuf;
ssize_t ret;
size_t first_read = 0;
uint32_t out_pos, avail;
- if (!opal_memcons)
+ if (!mc)
return -ENODEV;
- out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos));
+ out_pos = be32_to_cpu(READ_ONCE(mc->out_pos));
/* Now we've read out_pos, put a barrier in before reading the new
* data it points to in conbuf. */
smp_rmb();
- conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys));
+ conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
/* When the buffer has wrapped, read from the out_pos marker to the end
* of the buffer, and then read the remaining data as in the un-wrapped
@@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
if (out_pos & MEMCONS_OUT_POS_WRAP) {
out_pos &= MEMCONS_OUT_POS_MASK;
- avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos;
+ avail = be32_to_cpu(mc->obuf_size) - out_pos;
ret = memory_read_from_buffer(to, count, &pos,
conbuf + out_pos, avail);
@@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
}
/* Sanity check. The firmware should not do this to us. */
- if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) {
+ if (out_pos > be32_to_cpu(mc->obuf_size)) {
pr_err("OPAL: memory console corruption. Aborting read.\n");
return -EINVAL;
}
@@ -86,6 +86,11 @@ out:
return ret;
}
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+{
+ return memcons_copy(opal_memcons, to, pos, count);
+}
+
static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
struct bin_attribute *bin_attr, char *to,
loff_t pos, size_t count)
@@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = {
.read = opal_msglog_read
};
-void __init opal_msglog_init(void)
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name)
{
u64 mcaddr;
struct memcons *mc;
- if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
- pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
- return;
+ if (of_property_read_u64(node, mc_prop_name, &mcaddr)) {
+ pr_warn("%s property not found, no message log\n",
+ mc_prop_name);
+ goto out_err;
}
mc = phys_to_virt(mcaddr);
if (!mc) {
- pr_warn("OPAL: memory console address is invalid\n");
- return;
+ pr_warn("memory console address is invalid\n");
+ goto out_err;
}
if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
- pr_warn("OPAL: memory console version is invalid\n");
- return;
+ pr_warn("memory console version is invalid\n");
+ goto out_err;
}
- /* Report maximum size */
- opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) +
- be32_to_cpu(mc->obuf_size);
+ return mc;
+
+out_err:
+ return NULL;
+}
+
+u32 memcons_get_size(struct memcons *mc)
+{
+ return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
+}
+
+void __init opal_msglog_init(void)
+{
+ opal_memcons = memcons_init(opal_node, "ibm,opal-memcons");
+ if (!opal_memcons) {
+ pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n");
+ return;
+ }
- opal_memcons = mc;
+ opal_msglog_attr.size = memcons_get_size(opal_memcons);
}
void __init opal_msglog_sysfs_init(void)
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index e072bf157d62..45f4223a790f 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
int msg_size, item_size;
unsigned long flags;
- if (msg_type != OPAL_MSG_PRD)
+ if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2)
return 0;
/* Calculate total size of the message and item we need to store. The
@@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev)
return rc;
}
+ rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
+ if (rc) {
+ pr_err("Couldn't register PRD2 event notifier\n");
+ return rc;
+ }
+
rc = misc_register(&opal_prd_dev);
if (rc) {
pr_err("failed to register miscdev\n");
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 66430eebe869..fd510d961b8c 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -1,7 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * PowerNV LPC bus handling.
+ * PowerNV SCOM bus debugfs interface
*
+ * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
+ * <benh@kernel.crashing.org>
+ * and David Gibson, IBM Corporation.
* Copyright 2013 IBM Corp.
*/
@@ -10,62 +13,13 @@
#include <linux/bug.h>
#include <linux/gfp.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
-#include <asm/scom.h>
-
-/*
- * We could probably fit that inside the scom_map_t
- * which is a void* after all but it's really too ugly
- * so let's kmalloc it for now
- */
-struct opal_scom_map {
- uint32_t chip;
- uint64_t addr;
-};
-
-static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
-{
- struct opal_scom_map *m;
- const __be32 *gcid;
-
- if (!of_get_property(dev, "scom-controller", NULL)) {
- pr_err("%s: device %pOF is not a SCOM controller\n",
- __func__, dev);
- return SCOM_MAP_INVALID;
- }
- gcid = of_get_property(dev, "ibm,chip-id", NULL);
- if (!gcid) {
- pr_err("%s: device %pOF has no ibm,chip-id\n",
- __func__, dev);
- return SCOM_MAP_INVALID;
- }
- m = kmalloc(sizeof(*m), GFP_KERNEL);
- if (!m)
- return NULL;
- m->chip = be32_to_cpup(gcid);
- m->addr = reg;
-
- return (scom_map_t)m;
-}
-
-static void opal_scom_unmap(scom_map_t map)
-{
- kfree(map);
-}
-
-static int opal_xscom_err_xlate(int64_t rc)
-{
- switch(rc) {
- case 0:
- return 0;
- /* Add more translations if necessary */
- default:
- return -EIO;
- }
-}
+#include <asm/debugfs.h>
+#include <asm/prom.h>
static u64 opal_scom_unmangle(u64 addr)
{
@@ -98,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr)
return addr;
}
-static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value)
{
- struct opal_scom_map *m = map;
int64_t rc;
__be64 v;
- reg = opal_scom_unmangle(m->addr + reg);
- rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+ reg = opal_scom_unmangle(addr + reg);
+ rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v));
+ if (rc) {
+ *value = 0xfffffffffffffffful;
+ return -EIO;
+ }
*value = be64_to_cpu(v);
- return opal_xscom_err_xlate(rc);
+ return 0;
}
-static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
+static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value)
{
- struct opal_scom_map *m = map;
int64_t rc;
- reg = opal_scom_unmangle(m->addr + reg);
- rc = opal_xscom_write(m->chip, reg, value);
- return opal_xscom_err_xlate(rc);
+ reg = opal_scom_unmangle(addr + reg);
+ rc = opal_xscom_write(chip, reg, value);
+ if (rc)
+ return -EIO;
+ return 0;
+}
+
+struct scom_debug_entry {
+ u32 chip;
+ struct debugfs_blob_wrapper path;
+ char name[16];
+};
+
+static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct scom_debug_entry *ent = filp->private_data;
+ u64 __user *ubuf64 = (u64 __user *)ubuf;
+ loff_t off = *ppos;
+ ssize_t done = 0;
+ u64 reg, reg_base, reg_cnt, val;
+ int rc;
+
+ if (off < 0 || (off & 7) || (count & 7))
+ return -EINVAL;
+ reg_base = off >> 3;
+ reg_cnt = count >> 3;
+
+ for (reg = 0; reg < reg_cnt; reg++) {
+ rc = opal_scom_read(ent->chip, reg_base, reg, &val);
+ if (!rc)
+ rc = put_user(val, ubuf64);
+ if (rc) {
+ if (!done)
+ done = rc;
+ break;
+ }
+ ubuf64++;
+ *ppos += 8;
+ done += 8;
+ }
+ return done;
+}
+
+static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct scom_debug_entry *ent = filp->private_data;
+ u64 __user *ubuf64 = (u64 __user *)ubuf;
+ loff_t off = *ppos;
+ ssize_t done = 0;
+ u64 reg, reg_base, reg_cnt, val;
+ int rc;
+
+ if (off < 0 || (off & 7) || (count & 7))
+ return -EINVAL;
+ reg_base = off >> 3;
+ reg_cnt = count >> 3;
+
+ for (reg = 0; reg < reg_cnt; reg++) {
+ rc = get_user(val, ubuf64);
+ if (!rc)
+ rc = opal_scom_write(ent->chip, reg_base, reg, val);
+ if (rc) {
+ if (!done)
+ done = rc;
+ break;
+ }
+ ubuf64++;
+ done += 8;
+ }
+ return done;
}
-static const struct scom_controller opal_scom_controller = {
- .map = opal_scom_map,
- .unmap = opal_scom_unmap,
- .read = opal_scom_read,
- .write = opal_scom_write
+static const struct file_operations scom_debug_fops = {
+ .read = scom_debug_read,
+ .write = scom_debug_write,
+ .open = simple_open,
+ .llseek = default_llseek,
};
-static int opal_xscom_init(void)
+static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
+ int chip)
{
- if (firmware_has_feature(FW_FEATURE_OPAL))
- scom_init(&opal_scom_controller);
+ struct scom_debug_entry *ent;
+ struct dentry *dir;
+
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->chip = chip;
+ snprintf(ent->name, 16, "%08x", chip);
+ ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
+ ent->path.size = strlen((char *)ent->path.data);
+
+ dir = debugfs_create_dir(ent->name, root);
+ if (!dir) {
+ kfree(ent->path.data);
+ kfree(ent);
+ return -1;
+ }
+
+ debugfs_create_blob("devspec", 0400, dir, &ent->path);
+ debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
+
return 0;
}
-machine_arch_initcall(powernv, opal_xscom_init);
+
+static int scom_debug_init(void)
+{
+ struct device_node *dn;
+ struct dentry *root;
+ int chip, rc;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return 0;
+
+ root = debugfs_create_dir("scom", powerpc_debugfs_root);
+ if (!root)
+ return -1;
+
+ rc = 0;
+ for_each_node_with_property(dn, "scom-controller") {
+ chip = of_get_ibm_chip_id(dn);
+ WARN_ON(chip == -1);
+ rc |= scom_debug_init_one(root, dn, chip);
+ }
+
+ return rc;
+}
+device_initcall(scom_debug_init);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 98c5d94b17fb..38e90270280b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock);
static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
static uint32_t opal_heartbeat;
static struct task_struct *kopald_tsk;
+static struct opal_msg *opal_msg;
+static u32 opal_msg_size __ro_after_init;
void opal_configure_cores(void)
{
@@ -202,16 +204,18 @@ static int __init opal_register_exception_handlers(void)
glue = 0x7000;
/*
- * Check if we are running on newer firmware that exports
- * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch
- * the HMI interrupt and we catch it directly in Linux.
+ * Only ancient OPAL firmware requires this.
+ * Specifically, firmware from FW810.00 (released June 2014)
+ * through FW810.20 (released October 2014).
*
- * For older firmware (i.e currently released POWER8 System Firmware
- * as of today <= SV810_087), we fallback to old behavior and let OPAL
- * patch the HMI vector and handle it inside OPAL firmware.
+ * Check if we are running on newer (post Oct 2014) firmware that
+ * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to
+ * patch the HMI interrupt and we catch it directly in Linux.
*
- * For newer firmware (in development/yet to be released) we will
- * start catching/handling HMI directly in Linux.
+ * For older firmware (i.e. < FW810.20), we fall back to the old behavior and
+ * let OPAL patch the HMI vector and handle it inside OPAL firmware.
+ *
+ * For newer firmware we catch/handle the HMI directly in Linux.
*/
if (!opal_check_token(OPAL_HANDLE_HMI)) {
pr_info("Old firmware detected, OPAL handles HMIs.\n");
@@ -221,6 +225,11 @@ static int __init opal_register_exception_handlers(void)
glue += 128;
}
+ /*
+ * This is only applicable to ancient firmware; all modern
+ * (post March 2015/skiboot 5.0) firmware will just return
+ * OPAL_UNSUPPORTED.
+ */
opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
#endif
@@ -264,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg)
static void opal_handle_message(void)
{
s64 ret;
- /*
- * TODO: pre-allocate a message buffer depending on opal-msg-size
- * value in /proc/device-tree.
- */
- static struct opal_msg msg;
u32 type;
- ret = opal_get_msg(__pa(&msg), sizeof(msg));
+ ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
/* No opal message pending. */
if (ret == OPAL_RESOURCE)
return;
@@ -283,14 +287,14 @@ static void opal_handle_message(void)
return;
}
- type = be32_to_cpu(msg.msg_type);
+ type = be32_to_cpu(opal_msg->msg_type);
/* Sanity check */
if (type >= OPAL_MSG_TYPE_MAX) {
pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
return;
}
- opal_message_do_notify(type, (void *)&msg);
+ opal_message_do_notify(type, (void *)opal_msg);
}
static irqreturn_t opal_message_notify(int irq, void *data)
@@ -299,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data)
return IRQ_HANDLED;
}
-static int __init opal_message_init(void)
+static int __init opal_message_init(struct device_node *opal_node)
{
int ret, i, irq;
+ ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
+ if (ret) {
+ pr_notice("Failed to read opal-msg-size property\n");
+ opal_msg_size = sizeof(struct opal_msg);
+ }
+
+ opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+ if (!opal_msg) {
+ opal_msg_size = sizeof(struct opal_msg);
+ /* Try to allocate fixed message size */
+ opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+ BUG_ON(opal_msg == NULL);
+ }
+
for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
@@ -698,7 +716,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
bin_attr->size);
}
-static BIN_ATTR_RO(symbol_map, 0);
+static struct bin_attribute symbol_map_attr = {
+ .attr = {.name = "symbol_map", .mode = 0400},
+ .read = symbol_map_read
+};
static void opal_export_symmap(void)
{
@@ -715,10 +736,10 @@ static void opal_export_symmap(void)
return;
/* Setup attributes */
- bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0]));
- bin_attr_symbol_map.size = be64_to_cpu(syms[1]);
+ symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
+ symbol_map_attr.size = be64_to_cpu(syms[1]);
- rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map);
+ rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
if (rc)
pr_warn("Error %d creating OPAL symbols file\n", rc);
}
@@ -903,7 +924,7 @@ static int __init opal_init(void)
}
/* Initialise OPAL messaging system */
- opal_message_init();
+ opal_message_init(opal_node);
/* Initialise OPAL asynchronous completion interface */
opal_async_comp_init();
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..a0b9c0c23ed2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
struct page *tce_mem = NULL;
__be64 *addr;
- tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+ tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+ shift - PAGE_SHIFT);
if (!tce_mem) {
pr_err("Failed to allocate a TCE memory, level shift=%d\n",
shift);
@@ -48,6 +49,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
return addr;
}
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+ unsigned long size, unsigned int levels);
+
static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
{
__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +61,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
while (level) {
int n = (idx & mask) >> (level * shift);
- unsigned long tce;
+ unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
- if (tmp[n] == 0) {
+ if (!tce) {
__be64 *tmp2;
if (!alloc)
@@ -70,10 +74,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
if (!tmp2)
return NULL;
- tmp[n] = cpu_to_be64(__pa(tmp2) |
- TCE_PCI_READ | TCE_PCI_WRITE);
+ tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+ oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+ cpu_to_be64(tce)));
+ if (oldtce) {
+ pnv_pci_ioda2_table_do_free_pages(tmp2,
+ ilog2(tbl->it_level_size) + 3, 1);
+ tce = oldtce;
+ }
}
- tce = be64_to_cpu(tmp[n]);
tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
idx &= ~mask;
@@ -161,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
if (ptce)
*ptce = cpu_to_be64(0);
+ else
+ /* Skip the rest of the level */
+ i |= tbl->it_level_size - 1;
}
}
@@ -260,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
PAGE_SHIFT);
const unsigned long tce_table_size = 1UL << table_shift;
- unsigned int tmplevels = levels;
if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
return -EINVAL;
@@ -268,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
if (!is_power_of_2(window_size))
return -EINVAL;
- if (alloc_userspace_copy && (window_size > (1ULL << 32)))
- tmplevels = 1;
-
/* Adjust direct table size from window_size and levels */
entries_shift = (entries_shift + levels - 1) / levels;
level_shift = entries_shift + 3;
@@ -281,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
/* Allocate TCE table */
addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
- tmplevels, tce_table_size, &offset, &total_allocated);
+ 1, tce_table_size, &offset, &total_allocated);
/* addr==NULL means that the first level allocation failed */
if (!addr)
@@ -292,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
* we did not allocate as much as we wanted,
* release partially allocated table.
*/
- if (tmplevels == levels && offset < tce_table_size)
+ if (levels == 1 && offset < tce_table_size)
goto free_tces_exit;
/* Allocate userspace view of the TCE table */
if (alloc_userspace_copy) {
offset = 0;
uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
- tmplevels, tce_table_size, &offset,
+ 1, tce_table_size, &offset,
&total_allocated_uas);
if (!uas)
goto free_tces_exit;
- if (tmplevels == levels && (offset < tce_table_size ||
+ if (levels == 1 && (offset < tce_table_size ||
total_allocated_uas != total_allocated))
goto free_uas_exit;
}
@@ -318,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
window_size, tce_table_size, bus_offset, tbl->it_base,
- tbl->it_userspace, tmplevels, levels);
+ tbl->it_userspace, 1, levels);
return 0;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 10cc42b9e541..c28d0d9b7ee0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -50,6 +50,8 @@
static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
"NPU_OCAPI" };
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+
void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
{
@@ -1937,26 +1939,12 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
}
#ifdef CONFIG_IOMMU_API
-static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
+/* Common for IODA1 and IODA2 */
+static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction,
+ bool realmode)
{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
-
- return ret;
-}
-
-static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
-
- return ret;
+ return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
}
#endif
@@ -1971,8 +1959,8 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
static struct iommu_table_ops pnv_ioda1_iommu_ops = {
.set = pnv_ioda1_tce_build,
#ifdef CONFIG_IOMMU_API
- .exchange = pnv_ioda1_tce_xchg,
- .exchange_rm = pnv_ioda1_tce_xchg_rm,
+ .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+ .tce_kill = pnv_pci_p7ioc_tce_invalidate,
.useraddrptr = pnv_tce_useraddrptr,
#endif
.clear = pnv_ioda1_tce_free,
@@ -2101,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
return ret;
}
-#ifdef CONFIG_IOMMU_API
-static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
- if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
-
- return ret;
-}
-
-static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
- if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
-
- return ret;
-}
-#endif
-
static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
long npages)
{
@@ -2136,8 +2100,8 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build,
#ifdef CONFIG_IOMMU_API
- .exchange = pnv_ioda2_tce_xchg,
- .exchange_rm = pnv_ioda2_tce_xchg_rm,
+ .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+ .tce_kill = pnv_pci_ioda2_tce_invalidate,
.useraddrptr = pnv_tce_useraddrptr,
#endif
.clear = pnv_ioda2_tce_free,
@@ -2301,7 +2265,7 @@ found:
tbl->it_ops = &pnv_ioda1_iommu_ops;
pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
- iommu_init_table(tbl, phb->hose->node);
+ iommu_init_table(tbl, phb->hose->node, 0, 0);
if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2356,7 +2320,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
return 0;
}
-void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
{
uint16_t window_id = (pe->pe_number << 1 ) + 1;
int64_t rc;
@@ -2418,6 +2382,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
{
struct iommu_table *tbl = NULL;
long rc;
+ unsigned long res_start, res_end;
/*
* crashkernel= specifies the kdump kernel's maximum memory at
@@ -2431,19 +2396,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
* DMA window can be larger than available memory, which will
* cause errors later.
*/
- const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+ const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
- rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
- IOMMU_PAGE_SHIFT_4K,
- window_size,
- POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+ /*
+ * We create the default window as big as we can. The constraint is
+ * the max order of allocation possible. The TCE table is likely to
+ * end up being multilevel and with on-demand allocation in place,
+ * the initial use is not going to be huge as the default window aims
+ * to support crippled devices (i.e. not fully 64bit DMAble) only.
+ */
+ /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+ const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+ /* Each TCE level cannot exceed maxblock so go multilevel if needed */
+ unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+ unsigned long tcelevel_order = ilog2(maxblock >> 3);
+ unsigned int levels = tces_order / tcelevel_order;
+
+ if (tces_order % tcelevel_order)
+ levels += 1;
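+ /*
+ * For instance (illustrative numbers only, assuming 64K pages and a
+ * 16MB maxblock): one level holds maxblock / 8 = 2M TCEs, so a 2GB
+ * window (32K TCEs) computes to a single level here and is then
+ * bumped up to POWERNV_IOMMU_DEFAULT_LEVELS below.
+ */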
+ /*
+ * We try to stick to default levels (which is >1 at the moment) in
+ * order to save memory by relying on on-demand TCE level allocation.
+ */
+ levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
+
+ rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+ window_size, levels, false, &tbl);
if (rc) {
pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
rc);
return rc;
}
- iommu_init_table(tbl, pe->phb->hose->node);
+ /* We use top part of 32bit space for MMIO so exclude it from DMA */
+ res_start = 0;
+ res_end = 0;
+ if (window_size > pe->phb->ioda.m32_pci_base) {
+ res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+ res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+ }
+ iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
if (rc) {
@@ -2456,6 +2448,14 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
if (!pnv_iommu_bypass_disabled)
pnv_pci_ioda2_set_bypass(pe, true);
+ /*
+ * Set table base for the case of IOMMU DMA use. Usually this is done
+ * from dma_dev_setup() which is not called when a device is returned
+ * from VFIO so do it here.
+ */
+ if (pe->pdev)
+ set_iommu_table_base(&pe->pdev->dev, tbl);
+
return 0;
}
@@ -2543,6 +2543,8 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (pe->pbus)
pnv_ioda_setup_bus_dma(pe, pe->pbus);
+ else if (pe->pdev)
+ set_iommu_table_base(&pe->pdev->dev, NULL);
iommu_tce_table_put(tbl);
}
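Editor's note: the window-size and level arithmetic in the pnv_pci_ioda2_setup_default_config() hunk above is easier to follow with concrete numbers. The standalone sketch below is not part of the patch; it simply reproduces the calculation with assumed values (PAGE_SHIFT = 16, MAX_ORDER = 9, 1TB of installed memory) — the real constants depend on the kernel configuration.

/*
 * Illustration only: the default-window sizing from
 * pnv_pci_ioda2_setup_default_config(), worked through with assumed values.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long page_shift = 16;		/* assumed: 64K pages */
	const unsigned long max_order = 9;		/* assumed: MAX_ORDER */
	const unsigned long default_levels = 2;		/* POWERNV_IOMMU_DEFAULT_LEVELS */
	const unsigned long long max_memory = 1ULL << 40;	/* assumed: 1TB RAM */

	/* maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1) = 16MB here */
	unsigned long long maxblock = 1ULL << (page_shift + max_order - 1);

	/* it_map uses 1 bit per IOMMU page, hence the factor of 8 */
	unsigned long long window_size = maxblock * 8 << page_shift;
	if (window_size > max_memory)
		window_size = max_memory;

	/* ilog2() equivalents */
	unsigned int tces_order = 63 - __builtin_clzll(window_size >> page_shift);
	unsigned int tcelevel_order = 63 - __builtin_clzll(maxblock >> 3);
	unsigned int levels = tces_order / tcelevel_order;

	if (tces_order % tcelevel_order)
		levels += 1;
	if (levels < default_levels)
		levels = default_levels;

	/* prints: window 1024GB, 2 TCE levels */
	printf("window %lluGB, %u TCE levels\n", window_size >> 30, levels);
	return 0;
}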
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index ff1a33fee8e6..2825d004dece 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -34,7 +34,6 @@
#include "powernv.h"
#include "pci.h"
-static DEFINE_MUTEX(p2p_mutex);
static DEFINE_MUTEX(tunnel_mutex);
int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
@@ -55,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
break;
}
- if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) {
+ if (!of_device_is_compatible(parent, "ibm,ioda2-phb") &&
+ !of_device_is_compatible(parent, "ibm,ioda3-phb")) {
of_node_put(parent);
continue;
}
@@ -857,79 +857,6 @@ void pnv_pci_dma_bus_setup(struct pci_bus *bus)
}
}
-int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, u64 desc)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb_init, *phb_target;
- struct pnv_ioda_pe *pe_init;
- int rc;
-
- if (!opal_check_token(OPAL_PCI_SET_P2P))
- return -ENXIO;
-
- hose = pci_bus_to_host(initiator->bus);
- phb_init = hose->private_data;
-
- hose = pci_bus_to_host(target->bus);
- phb_target = hose->private_data;
-
- pe_init = pnv_ioda_get_pe(initiator);
- if (!pe_init)
- return -ENODEV;
-
- /*
- * Configuring the initiator's PHB requires to adjust its
- * TVE#1 setting. Since the same device can be an initiator
- * several times for different target devices, we need to keep
- * a reference count to know when we can restore the default
- * bypass setting on its TVE#1 when disabling. Opal is not
- * tracking PE states, so we add a reference count on the PE
- * in linux.
- *
- * For the target, the configuration is per PHB, so we keep a
- * target reference count on the PHB.
- */
- mutex_lock(&p2p_mutex);
-
- if (desc & OPAL_PCI_P2P_ENABLE) {
- /* always go to opal to validate the configuration */
- rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id,
- desc, pe_init->pe_number);
-
- if (rc != OPAL_SUCCESS) {
- rc = -EIO;
- goto out;
- }
-
- pe_init->p2p_initiator_count++;
- phb_target->p2p_target_count++;
- } else {
- if (!pe_init->p2p_initiator_count ||
- !phb_target->p2p_target_count) {
- rc = -EINVAL;
- goto out;
- }
-
- if (--pe_init->p2p_initiator_count == 0)
- pnv_pci_ioda2_set_bypass(pe_init, true);
-
- if (--phb_target->p2p_target_count == 0) {
- rc = opal_pci_set_p2p(phb_init->opal_id,
- phb_target->opal_id, desc,
- pe_init->pe_number);
- if (rc != OPAL_SUCCESS) {
- rc = -EIO;
- goto out;
- }
- }
- }
- rc = 0;
-out:
- mutex_unlock(&p2p_mutex);
- return rc;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_set_p2p);
-
struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
{
struct pci_controller *hose = pci_bus_to_host(dev->bus);
@@ -938,54 +865,6 @@ struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
}
EXPORT_SYMBOL(pnv_pci_get_phb_node);
-int pnv_pci_enable_tunnel(struct pci_dev *dev, u64 *asnind)
-{
- struct device_node *np;
- const __be32 *prop;
- struct pnv_ioda_pe *pe;
- uint16_t window_id;
- int rc;
-
- if (!radix_enabled())
- return -ENXIO;
-
- if (!(np = pnv_pci_get_phb_node(dev)))
- return -ENXIO;
-
- prop = of_get_property(np, "ibm,phb-indications", NULL);
- of_node_put(np);
-
- if (!prop || !prop[1])
- return -ENXIO;
-
- *asnind = (u64)be32_to_cpu(prop[1]);
- pe = pnv_ioda_get_pe(dev);
- if (!pe)
- return -ENODEV;
-
- /* Increase real window size to accept as_notify messages. */
- window_id = (pe->pe_number << 1 ) + 1;
- rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, pe->pe_number,
- window_id, pe->tce_bypass_base,
- (uint64_t)1 << 48);
- return opal_error_code(rc);
-}
-EXPORT_SYMBOL_GPL(pnv_pci_enable_tunnel);
-
-int pnv_pci_disable_tunnel(struct pci_dev *dev)
-{
- struct pnv_ioda_pe *pe;
-
- pe = pnv_ioda_get_pe(dev);
- if (!pe)
- return -ENODEV;
-
- /* Restore default real window size. */
- pnv_pci_ioda2_set_bypass(pe, true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_disable_tunnel);
-
int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
{
__be64 val;
@@ -1040,29 +919,6 @@ out:
}
EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar);
-#ifdef CONFIG_PPC64 /* for thread.tidr */
-int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid,
- u32 *tid)
-{
- struct mm_struct *mm = NULL;
-
- if (task == NULL)
- return -EINVAL;
-
- mm = get_task_mm(task);
- if (mm == NULL)
- return -EINVAL;
-
- *pid = mm->context.id;
- mmput(mm);
-
- *tid = task->thread.tidr;
- *lpid = mfspr(SPRN_LPID);
- return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_get_as_notify_info);
-#endif
-
void pnv_pci_shutdown(void)
{
struct pci_controller *hose;
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index be26ab3d99e0..f914f0b14e4e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -79,9 +79,6 @@ struct pnv_ioda_pe {
struct pnv_ioda_pe *master;
struct list_head slaves;
- /* PCI peer-to-peer*/
- int p2p_initiator_count;
-
/* Link in list of PE#s */
struct list_head list;
};
@@ -172,8 +169,6 @@ struct pnv_phb {
/* PHB and hub diagnostics */
unsigned int diag_data_size;
u8 *diag_data;
-
- int p2p_target_count;
};
extern struct pci_ops pnv_pci_ops;
@@ -200,7 +195,6 @@ extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
-extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
__u64 window_size, __u32 levels);
extern int pnv_eeh_post_init(void);
@@ -225,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
struct pnv_ioda_pe *pe);
/* pci-ioda-tce.c */
-#define POWERNV_IOMMU_DEFAULT_LEVELS 1
+#define POWERNV_IOMMU_DEFAULT_LEVELS 2
#define POWERNV_IOMMU_MAX_LEVELS 5
extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index fd4a1c5a6369..1aa51c4fa904 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -30,4 +30,9 @@ extern void opal_event_shutdown(void);
bool cpu_core_split_required(void);
+struct memcons;
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
+u32 memcons_get_size(struct memcons *mc);
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name);
+
#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a5e52f9eed3c..83498604d322 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -24,6 +24,7 @@
#include <linux/bug.h>
#include <linux/pci.h>
#include <linux/cpufreq.h>
+#include <linux/memblock.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
@@ -166,6 +167,14 @@ static void __init pnv_init(void)
else
#endif
add_preferred_console("hvc", 0, NULL);
+
+ if (!radix_enabled()) {
+ int i;
+
+ /* Allocate per cpu area to save old slb contents during MCE */
+ for_each_possible_cpu(i)
+ paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i));
+ }
}
static void __init pnv_init_IRQ(void)
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
new file mode 100644
index 000000000000..e4a00ad06f9d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ultravisor high level interfaces
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/of_fdt.h>
+#include <linux/of.h>
+
+#include <asm/ultravisor.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static struct kobject *ultravisor_kobj;
+
+int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+ int depth, void *data)
+{
+ if (!of_flat_dt_is_compatible(node, "ibm,ultravisor"))
+ return 0;
+
+ powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR;
+ pr_debug("Ultravisor detected!\n");
+ return 1;
+}
+
+static struct memcons *uv_memcons;
+
+static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ return memcons_copy(uv_memcons, to, pos, count);
+}
+
+static struct bin_attribute uv_msglog_attr = {
+ .attr = {.name = "msglog", .mode = 0400},
+ .read = uv_msglog_read
+};
+
+static int __init uv_init(void)
+{
+ struct device_node *node;
+
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ return 0;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+ if (!node)
+ return -ENODEV;
+
+ uv_memcons = memcons_init(node, "memcons");
+ if (!uv_memcons)
+ return -ENOENT;
+
+ uv_msglog_attr.size = memcons_get_size(uv_memcons);
+
+ ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj);
+ if (!ultravisor_kobj)
+ return -ENOMEM;
+
+ return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr);
+}
+machine_subsys_initcall(powernv, uv_init);
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index ea5ca0201da8..0c0d27d17976 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -40,16 +40,6 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len
pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
}
-u64 vas_win_paste_addr(struct vas_window *win)
-{
- u64 addr;
-
- compute_paste_address(win, &addr, NULL);
-
- return addr;
-}
-EXPORT_SYMBOL(vas_win_paste_addr);
-
static inline void get_hvwc_mmio_bar(struct vas_window *window,
u64 *start, int *len)
{
@@ -1264,12 +1254,3 @@ int vas_win_close(struct vas_window *window)
return 0;
}
EXPORT_SYMBOL_GPL(vas_win_close);
-
-/*
- * Return a system-wide unique window id for the window @win.
- */
-u32 vas_win_id(struct vas_window *win)
-{
- return encode_pswid(win->vinst->vas_id, win->winid);
-}
-EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 9cc5251816db..5574aec9ee88 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -444,26 +444,6 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
}
-/*
- * Encode/decode the Partition Send Window ID (PSWID) for a window in
- * a way that we can uniquely identify any window in the system. i.e.
- * we should be able to locate the 'struct vas_window' given the PSWID.
- *
- * Bits Usage
- * 0:7 VAS id (8 bits)
- * 8:15 Unused, 0 (3 bits)
- * 16:31 Window id (16 bits)
- */
-static inline u32 encode_pswid(int vasid, int winid)
-{
- u32 pswid = 0;
-
- pswid |= vasid << (31 - 7);
- pswid |= winid;
-
- return pswid;
-}
-
static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
{
if (vasid)
diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c
index bdaeaecdc06b..1193c294b8d0 100644
--- a/arch/powerpc/platforms/ps3/spu.c
+++ b/arch/powerpc/platforms/ps3/spu.c
@@ -184,10 +184,7 @@ static void spu_unmap(struct spu *spu)
* setup_areas - Map the spu regions into the address space.
*
* The current HV requires the spu shadow regs to be mapped with the
- * PTE page protection bits set as read-only (PP=3). This implementation
- * uses the low level __ioremap() to bypass the page protection settings
- * inforced by ioremap_prot() to get the needed PTE bits set for the
- * shadow regs.
+ * PTE page protection bits set as read-only.
*/
static int __init setup_areas(struct spu *spu)
@@ -195,9 +192,8 @@ static int __init setup_areas(struct spu *spu)
struct table {char* name; unsigned long addr; unsigned long size;};
unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
- spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr,
- sizeof(struct spe_shadow),
- shadow_flags);
+ spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr,
+ sizeof(struct spe_shadow), shadow_flags);
if (!spu_pdata(spu)->shadow) {
pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__);
goto fail_ioremap;
diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c
index 98410119c47b..3542b7bd6a46 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -686,20 +686,16 @@ static int ps3_dma_supported(struct device *_dev, u64 mask)
return mask >= DMA_BIT_MASK(32);
}
-static u64 ps3_dma_get_required_mask(struct device *_dev)
-{
- return DMA_BIT_MASK(32);
-}
-
static const struct dma_map_ops ps3_sb_dma_ops = {
.alloc = ps3_alloc_coherent,
.free = ps3_free_coherent,
.map_sg = ps3_sb_map_sg,
.unmap_sg = ps3_sb_unmap_sg,
.dma_supported = ps3_dma_supported,
- .get_required_mask = ps3_dma_get_required_mask,
.map_page = ps3_sb_map_page,
.unmap_page = ps3_unmap_page,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
};
static const struct dma_map_ops ps3_ioc0_dma_ops = {
@@ -708,9 +704,10 @@ static const struct dma_map_ops ps3_ioc0_dma_ops = {
.map_sg = ps3_ioc0_map_sg,
.unmap_sg = ps3_ioc0_unmap_sg,
.dma_supported = ps3_dma_supported,
- .get_required_mask = ps3_dma_get_required_mask,
.map_page = ps3_ioc0_map_page,
.unmap_page = ps3_unmap_page,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
};
/**
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 9c6b3d860518..9e35cddddf73 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -23,6 +23,7 @@ config PPC_PSERIES
select ARCH_RANDOM
select PPC_DOORBELL
select FORCE_SMP
+ select SWIOTLB
default y
config PPC_SPLPAR
@@ -80,19 +81,19 @@ config LPARCFG
bool "LPAR Configuration Data"
depends on PPC_PSERIES
help
- Provide system capacity information via human readable
- <key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
+ Provide system capacity information via human readable
+ <key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
config PPC_PSERIES_DEBUG
depends on PPC_PSERIES && PPC_EARLY_DEBUG
bool "Enable extra debug logging in platforms/pseries"
- help
+ default y
+ help
Say Y here if you want the pseries core to produce a bunch of
debug messages to the system log. Select this if you are having a
problem with the pseries core and want to see more of what is
going on. This does not enable debugging in lpar.c, which must
be manually done due to its verbosity.
- default y
config PPC_SMLPAR
bool "Support for shared-memory logical partitions"
@@ -117,16 +118,16 @@ config CMM
balance memory across many LPARs.
config HV_PERF_CTRS
- bool "Hypervisor supplied PMU events (24x7 & GPCI)"
- default y
- depends on PERF_EVENTS && PPC_PSERIES
- help
+ bool "Hypervisor supplied PMU events (24x7 & GPCI)"
+ default y
+ depends on PERF_EVENTS && PPC_PSERIES
+ help
Enable access to hypervisor supplied counters in perf. Currently,
this enables code that uses the hcall GetPerfCounterInfo and 24x7
interfaces to retrieve counters. GPCI exists on Power 6 and later
systems. 24x7 is available on Power 8 and later systems.
- If unsure, select Y.
+ If unsure, select Y.
config IBMVIO
depends on PPC_PSERIES
@@ -144,3 +145,17 @@ config PAPR_SCM
tristate "Support for the PAPR Storage Class Memory interface"
help
Enable access to hypervisor provided storage class memory.
+
+config PPC_SVM
+ bool "Secure virtual machine (SVM) support for POWER"
+ depends on PPC_PSERIES
+ select SWIOTLB
+ select ARCH_HAS_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ help
+ There are certain POWER platforms which support secure guests using
+ the Protected Execution Facility, with the help of an Ultravisor
+ executing below the hypervisor layer. This enables support for
+ those guests.
+
+ If unsure, say "N".
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index a43ec843c8e2..a3c74a5cf20d 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -25,6 +25,9 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o
obj-$(CONFIG_IBMVIO) += vio.o
obj-$(CONFIG_IBMEBUS) += ibmebus.o
obj-$(CONFIG_PAPR_SCM) += papr_scm.o
+obj-$(CONFIG_PPC_SPLPAR) += vphn.o
+obj-$(CONFIG_PPC_SVM) += svm.o
+obj-$(CONFIG_FA_DUMP) += rtas-fadump.o
ifdef CONFIG_PPC_PSERIES
obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 437a74173db2..16e86ba8aa20 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -58,6 +58,10 @@ static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
name = (char *)ccwa + be32_to_cpu(ccwa->name_offset);
prop->name = kstrdup(name, GFP_KERNEL);
+ if (!prop->name) {
+ dlpar_free_cc_property(prop);
+ return NULL;
+ }
prop->length = be32_to_cpu(ccwa->prop_length);
value = (char *)ccwa + be32_to_cpu(ccwa->prop_offset);
@@ -383,11 +387,11 @@ void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog)
struct pseries_hp_work *work;
struct pseries_hp_errorlog *hp_errlog_copy;
- hp_errlog_copy = kmalloc(sizeof(struct pseries_hp_errorlog),
- GFP_KERNEL);
- memcpy(hp_errlog_copy, hp_errlog, sizeof(struct pseries_hp_errorlog));
+ hp_errlog_copy = kmemdup(hp_errlog, sizeof(*hp_errlog), GFP_ATOMIC);
+ if (!hp_errlog_copy)
+ return;
- work = kmalloc(sizeof(struct pseries_hp_work), GFP_KERNEL);
+ work = kmalloc(sizeof(struct pseries_hp_work), GFP_ATOMIC);
if (work) {
INIT_WORK((struct work_struct *)work, pseries_hp_work_fn);
work->errlog = hp_errlog_copy;
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index ab5de985a787..2b87480f2837 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -27,13 +27,7 @@ struct dtl {
};
static DEFINE_PER_CPU(struct dtl, cpu_dtl);
-/*
- * Dispatch trace log event mask:
- * 0x7: 0x1: voluntary virtual processor waits
- * 0x2: time-slice preempts
- * 0x4: virtual partition memory page faults
- */
-static u8 dtl_event_mask = 0x7;
+static u8 dtl_event_mask = DTL_LOG_ALL;
/*
@@ -48,7 +42,6 @@ struct dtl_ring {
struct dtl_entry *write_ptr;
struct dtl_entry *buf;
struct dtl_entry *buf_end;
- u8 saved_dtl_mask;
};
static DEFINE_PER_CPU(struct dtl_ring, dtl_rings);
@@ -98,7 +91,6 @@ static int dtl_start(struct dtl *dtl)
dtlr->write_ptr = dtl->buf;
/* enable event logging */
- dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask;
lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask;
dtl_consumer = consume_dtle;
@@ -116,7 +108,7 @@ static void dtl_stop(struct dtl *dtl)
dtlr->buf = NULL;
/* restore dtl_enable_mask */
- lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask;
+ lppaca_of(dtl->cpu).dtl_enable_mask = DTL_LOG_PREEMPT;
if (atomic_dec_and_test(&dtl_count))
dtl_consumer = NULL;
@@ -188,11 +180,16 @@ static int dtl_enable(struct dtl *dtl)
if (dtl->buf)
return -EBUSY;
+ /* ensure there are no other conflicting dtl users */
+ if (!read_trylock(&dtl_access_lock))
+ return -EBUSY;
+
n_entries = dtl_buf_entries;
buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu));
if (!buf) {
printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
__func__, dtl->cpu);
+ read_unlock(&dtl_access_lock);
return -ENOMEM;
}
@@ -209,8 +206,11 @@ static int dtl_enable(struct dtl *dtl)
}
spin_unlock(&dtl->lock);
- if (rc)
+ if (rc) {
+ read_unlock(&dtl_access_lock);
kmem_cache_free(dtl_cache, buf);
+ }
+
return rc;
}
@@ -222,6 +222,7 @@ static void dtl_disable(struct dtl *dtl)
dtl->buf = NULL;
dtl->buf_entries = 0;
spin_unlock(&dtl->lock);
+ read_unlock(&dtl_access_lock);
}
/* file interface */
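Editor's note: dtl.c now takes dtl_access_lock for read, while the vcpudispatch_stats code added to lpar.c later in this patch takes it for write, so the two dispatch-trace-log consumers exclude each other without blocking. The userspace analogue below is not part of the patch and substitutes a pthreads rwlock for the kernel rwlock purely to sketch that try-lock scheme.

/*
 * Illustration only: per-cpu debugfs users take the lock for read, the
 * global vcpudispatch_stats collector takes it for write, and both only
 * try-lock so they back off with -EBUSY instead of sleeping.
 */
#include <stdio.h>
#include <errno.h>
#include <pthread.h>

static pthread_rwlock_t dtl_access_lock = PTHREAD_RWLOCK_INITIALIZER;

static int dtl_enable_one_cpu(void)
{
	if (pthread_rwlock_tryrdlock(&dtl_access_lock))
		return -EBUSY;		/* stats collector already owns it */
	/* ... allocate and register this cpu's buffer ... */
	return 0;
}

static int stats_enable_all_cpus(void)
{
	if (pthread_rwlock_trywrlock(&dtl_access_lock))
		return -EBUSY;		/* some debugfs user holds a buffer */
	/* ... allocate, register and start the per-cpu workers ... */
	return 0;
}

int main(void)
{
	printf("debugfs user: %d\n", dtl_enable_one_cpu());	/* 0 */
	printf("stats on:     %d\n", stats_enable_all_cpus());	/* -EBUSY */
	return 0;
}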
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 9edae1863e2f..893ba3f562c4 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -42,42 +42,44 @@ static int ibm_get_config_addr_info;
static int ibm_get_config_addr_info2;
static int ibm_configure_pe;
-#ifdef CONFIG_PCI_IOV
void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
{
struct pci_dn *pdn = pci_get_pdn(pdev);
- struct pci_dn *physfn_pdn;
- struct eeh_dev *edev;
- if (!pdev->is_virtfn)
+ if (eeh_has_flag(EEH_FORCE_DISABLED))
return;
- pdn->device_id = pdev->device;
- pdn->vendor_id = pdev->vendor;
- pdn->class_code = pdev->class;
- /*
- * Last allow unfreeze return code used for retrieval
- * by user space in eeh-sysfs to show the last command
- * completion from platform.
- */
- pdn->last_allow_rc = 0;
- physfn_pdn = pci_get_pdn(pdev->physfn);
- pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index];
- edev = pdn_to_eeh_dev(pdn);
+ dev_dbg(&pdev->dev, "EEH: Setting up device\n");
+#ifdef CONFIG_PCI_IOV
+ if (pdev->is_virtfn) {
+ struct pci_dn *physfn_pdn;
- /*
- * The following operations will fail if VF's sysfs files
- * aren't created or its resources aren't finalized.
- */
+ pdn->device_id = pdev->device;
+ pdn->vendor_id = pdev->vendor;
+ pdn->class_code = pdev->class;
+ /*
+ * Last allow unfreeze return code used for retrieval
+ * by user space in eeh-sysfs to show the last command
+ * completion from platform.
+ */
+ pdn->last_allow_rc = 0;
+ physfn_pdn = pci_get_pdn(pdev->physfn);
+ pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index];
+ }
+#endif
eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
- edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
- eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
- eeh_add_to_parent_pe(edev); /* Add as VF PE type */
- eeh_sysfs_add_device(pdev);
+#ifdef CONFIG_PCI_IOV
+ if (pdev->is_virtfn) {
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-}
+ edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
+ eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
+ eeh_add_to_parent_pe(edev); /* Add as VF PE type */
+ }
#endif
+ eeh_sysfs_add_device(pdev);
+}
/*
* Buffer for reporting slot-error-detail rtas calls. Its here
@@ -144,10 +146,8 @@ static int pseries_eeh_init(void)
/* Set EEH probe mode */
eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
-#ifdef CONFIG_PCI_IOV
/* Set EEH machine dependent code */
ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
-#endif
return 0;
}
@@ -251,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
+ eeh_edev_dbg(edev, "Probing device\n");
+
/*
* Update class code and mode of eeh device. We need
* correctly reflects that current device is root port
@@ -280,8 +282,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
/* Enable EEH on the device */
+ eeh_edev_dbg(edev, "Enabling EEH on device\n");
ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
- if (!ret) {
+ if (ret) {
+ eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
+ } else {
/* Retrieve PE address */
edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
pe.addr = edev->pe_config_addr;
@@ -297,11 +302,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
if (enable) {
eeh_add_flag(EEH_ENABLED);
eeh_add_to_parent_pe(edev);
-
- pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
- __func__, pdn->busno, PCI_SLOT(pdn->devfn),
- PCI_FUNC(pdn->devfn), pe.phb->global_number,
- pe.addr);
} else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
(pdn_to_eeh_dev(pdn->parent))->pe) {
/* This device doesn't support EEH, but it may have an
@@ -310,6 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
eeh_add_to_parent_pe(edev);
}
+ eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
+ (enable ? "enabled" : "unsupported"), ret);
}
/* Save memory bars */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2ec43b4639a0..8e700390f3d6 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -880,34 +880,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
switch (hp_elog->action) {
case PSERIES_HP_ELOG_ACTION_ADD:
- if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+ switch (hp_elog->id_type) {
+ case PSERIES_HP_ELOG_ID_DRC_COUNT:
count = hp_elog->_drc_u.drc_count;
rc = dlpar_memory_add_by_count(count);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_INDEX:
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_add_by_index(drc_index);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_IC:
count = hp_elog->_drc_u.ic.count;
drc_index = hp_elog->_drc_u.ic.index;
rc = dlpar_memory_add_by_ic(count, drc_index);
- } else {
+ break;
+ default:
rc = -EINVAL;
+ break;
}
break;
case PSERIES_HP_ELOG_ACTION_REMOVE:
- if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+ switch (hp_elog->id_type) {
+ case PSERIES_HP_ELOG_ID_DRC_COUNT:
count = hp_elog->_drc_u.drc_count;
rc = dlpar_memory_remove_by_count(count);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_INDEX:
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_remove_by_index(drc_index);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_IC:
count = hp_elog->_drc_u.ic.count;
drc_index = hp_elog->_drc_u.ic.index;
rc = dlpar_memory_remove_by_ic(count, drc_index);
- } else {
+ break;
+ default:
rc = -EINVAL;
+ break;
}
break;
@@ -976,6 +986,9 @@ static int pseries_update_drconf_memory(struct of_reconfig_data *pr)
if (!memblock_size)
return -EINVAL;
+ if (!pr->old_prop)
+ return 0;
+
p = (__be32 *) pr->old_prop->value;
if (!p)
return -EINVAL;
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
index 1498c6b989e6..1ac52963e08b 100644
--- a/arch/powerpc/platforms/pseries/hvconsole.c
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -49,7 +49,7 @@ EXPORT_SYMBOL(hvc_get_chars);
* @vtermno: The vtermno or unit_address of the adapter from which the data
* originated.
* @buf: The character buffer that contains the character data to send to
- * firmware.
+ * firmware. Must be at least 16 bytes, even if count is less than 16.
* @count: Send this number of characters.
*/
int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
index 84e8ec4011ba..b91eb0929ed1 100644
--- a/arch/powerpc/platforms/pseries/ibmebus.c
+++ b/arch/powerpc/platforms/pseries/ibmebus.c
@@ -147,13 +147,13 @@ static const struct dma_map_ops ibmebus_dma_ops = {
.unmap_page = ibmebus_unmap_page,
};
-static int ibmebus_match_path(struct device *dev, void *data)
+static int ibmebus_match_path(struct device *dev, const void *data)
{
struct device_node *dn = to_platform_device(dev)->dev.of_node;
return (of_find_node_by_path(data) == dn);
}
-static int ibmebus_match_node(struct device *dev, void *data)
+static int ibmebus_match_node(struct device *dev, const void *data)
{
return to_platform_device(dev)->dev.of_node == data;
}
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 889dc2e44b89..6ba081dd61c9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
#include <asm/udbg.h>
#include <asm/mmzone.h>
#include <asm/plpar_wrappers.h>
+#include <asm/svm.h>
#include "pseries.h"
@@ -609,7 +610,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
iommu_table_setparms(pci->phb, dn, tbl);
tbl->it_ops = &iommu_table_pseries_ops;
- iommu_init_table(tbl, pci->phb->node);
+ iommu_init_table(tbl, pci->phb->node, 0, 0);
/* Divide the rest (1.75GB) among the children */
pci->phb->dma_window_size = 0x80000000ul;
@@ -621,7 +622,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
#ifdef CONFIG_IOMMU_API
static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
- long *tce, enum dma_data_direction *direction)
+ long *tce, enum dma_data_direction *direction,
+ bool realmode)
{
long rc;
unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
@@ -649,7 +651,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
struct iommu_table_ops iommu_table_lpar_multi_ops = {
.set = tce_buildmulti_pSeriesLP,
#ifdef CONFIG_IOMMU_API
- .exchange = tce_exchange_pseries,
+ .xchg_no_kill = tce_exchange_pseries,
#endif
.clear = tce_freemulti_pSeriesLP,
.get = tce_get_pSeriesLP
@@ -690,7 +692,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
ppci->table_group, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
- iommu_init_table(tbl, ppci->phb->node);
+ iommu_init_table(tbl, ppci->phb->node, 0, 0);
iommu_register_group(ppci->table_group,
pci_domain_nr(bus), 0);
pr_debug(" created table: %p\n", ppci->table_group);
@@ -719,7 +721,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
tbl = PCI_DN(dn)->table_group->tables[0];
iommu_table_setparms(phb, dn, tbl);
tbl->it_ops = &iommu_table_pseries_ops;
- iommu_init_table(tbl, phb->node);
+ iommu_init_table(tbl, phb->node, 0, 0);
set_iommu_table_base(&dev->dev, tbl);
return;
}
@@ -1169,7 +1171,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
iommu_table_setparms_lpar(pci->phb, pdn, tbl,
pci->table_group, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
- iommu_init_table(tbl, pci->phb->node);
+ iommu_init_table(tbl, pci->phb->node, 0, 0);
iommu_register_group(pci->table_group,
pci_domain_nr(pci->phb->bus), 0);
pr_debug(" created table: %p\n", pci->table_group);
@@ -1318,7 +1320,15 @@ void iommu_init_early_pSeries(void)
of_reconfig_notifier_register(&iommu_reconfig_nb);
register_memory_notifier(&iommu_mem_nb);
- set_pci_dma_ops(&dma_iommu_ops);
+ /*
+ * Secure guest memory is inaccessible to devices so regular DMA isn't
+ * possible.
+ *
+ * In that case keep devices' dma_map_ops as NULL so that the generic
+ * DMA code path will use SWIOTLB to bounce buffers for DMA.
+ */
+ if (!is_secure_guest())
+ set_pci_dma_ops(&dma_iommu_ops);
}
static int __init disable_multitce(char *str)
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 73620dfb63a1..36b846f6e74e 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -17,6 +17,10 @@
#include <linux/jump_label.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
+#include <linux/spinlock.h>
+#include <linux/cpuhotplug.h>
+#include <linux/workqueue.h>
+#include <linux/proc_fs.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/page.h>
@@ -52,13 +56,591 @@ EXPORT_SYMBOL(plpar_hcall);
EXPORT_SYMBOL(plpar_hcall9);
EXPORT_SYMBOL(plpar_hcall_norets);
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+static u8 dtl_mask = DTL_LOG_PREEMPT;
+#else
+static u8 dtl_mask;
+#endif
+
+void alloc_dtl_buffers(unsigned long *time_limit)
+{
+ int cpu;
+ struct paca_struct *pp;
+ struct dtl_entry *dtl;
+
+ for_each_possible_cpu(cpu) {
+ pp = paca_ptrs[cpu];
+ if (pp->dispatch_log)
+ continue;
+ dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
+ if (!dtl) {
+ pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
+ cpu);
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ pr_warn("Stolen time statistics will be unreliable\n");
+#endif
+ break;
+ }
+
+ pp->dtl_ridx = 0;
+ pp->dispatch_log = dtl;
+ pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
+ pp->dtl_curr = dtl;
+
+ if (time_limit && time_after(jiffies, *time_limit)) {
+ cond_resched();
+ *time_limit = jiffies + HZ;
+ }
+ }
+}
+
+void register_dtl_buffer(int cpu)
+{
+ long ret;
+ struct paca_struct *pp;
+ struct dtl_entry *dtl;
+ int hwcpu = get_hard_smp_processor_id(cpu);
+
+ pp = paca_ptrs[cpu];
+ dtl = pp->dispatch_log;
+ if (dtl && dtl_mask) {
+ pp->dtl_ridx = 0;
+ pp->dtl_curr = dtl;
+ lppaca_of(cpu).dtl_idx = 0;
+
+ /* hypervisor reads buffer length from this field */
+ dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
+ ret = register_dtl(hwcpu, __pa(dtl));
+ if (ret)
+ pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
+ cpu, hwcpu, ret);
+
+ lppaca_of(cpu).dtl_enable_mask = dtl_mask;
+ }
+}
+
+#ifdef CONFIG_PPC_SPLPAR
+struct dtl_worker {
+ struct delayed_work work;
+ int cpu;
+};
+
+struct vcpu_dispatch_data {
+ int last_disp_cpu;
+
+ int total_disp;
+
+ int same_cpu_disp;
+ int same_chip_disp;
+ int diff_chip_disp;
+ int far_chip_disp;
+
+ int numa_home_disp;
+ int numa_remote_disp;
+ int numa_far_disp;
+};
+
+/*
+ * This represents the number of cpus in the hypervisor. Since there is no
+ * architected way to discover the number of processors in the host, we
+ * provision for dealing with NR_CPUS. This is currently 2048 by default, and
+ * is sufficient for our purposes. This will need to be tweaked if
+ * CONFIG_NR_CPUS is changed.
+ */
+#define NR_CPUS_H NR_CPUS
+
+DEFINE_RWLOCK(dtl_access_lock);
+static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);
+static DEFINE_PER_CPU(u64, dtl_entry_ridx);
+static DEFINE_PER_CPU(struct dtl_worker, dtl_workers);
+static enum cpuhp_state dtl_worker_state;
+static DEFINE_MUTEX(dtl_enable_mutex);
+static int vcpudispatch_stats_on __read_mostly;
+static int vcpudispatch_stats_freq = 50;
+static __be32 *vcpu_associativity, *pcpu_associativity;
+
+
+static void free_dtl_buffers(unsigned long *time_limit)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ int cpu;
+ struct paca_struct *pp;
+
+ for_each_possible_cpu(cpu) {
+ pp = paca_ptrs[cpu];
+ if (!pp->dispatch_log)
+ continue;
+ kmem_cache_free(dtl_cache, pp->dispatch_log);
+ pp->dtl_ridx = 0;
+ pp->dispatch_log = 0;
+ pp->dispatch_log_end = 0;
+ pp->dtl_curr = 0;
+
+ if (time_limit && time_after(jiffies, *time_limit)) {
+ cond_resched();
+ *time_limit = jiffies + HZ;
+ }
+ }
+#endif
+}
+
+static int init_cpu_associativity(void)
+{
+ vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
+ VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
+ pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
+ VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
+
+ if (!vcpu_associativity || !pcpu_associativity) {
+ pr_err("error allocating memory for associativity information\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void destroy_cpu_associativity(void)
+{
+ kfree(vcpu_associativity);
+ kfree(pcpu_associativity);
+ vcpu_associativity = pcpu_associativity = 0;
+}
+
+static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
+{
+ __be32 *assoc;
+ int rc = 0;
+
+ assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE];
+ if (!assoc[0]) {
+ rc = hcall_vphn(cpu, flag, &assoc[0]);
+ if (rc)
+ return NULL;
+ }
+
+ return assoc;
+}
+
+static __be32 *get_pcpu_associativity(int cpu)
+{
+ return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU);
+}
+
+static __be32 *get_vcpu_associativity(int cpu)
+{
+ return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU);
+}
+
+static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
+{
+ __be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc;
+
+ if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H)
+ return -EINVAL;
+
+ last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu);
+ cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu);
+
+ if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
+ return -EIO;
+
+ return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
+}
+
+static int cpu_home_node_dispatch_distance(int disp_cpu)
+{
+ __be32 *disp_cpu_assoc, *vcpu_assoc;
+ int vcpu_id = smp_processor_id();
+
+ if (disp_cpu >= NR_CPUS_H) {
+ pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
+ disp_cpu, NR_CPUS_H);
+ return -EINVAL;
+ }
+
+ disp_cpu_assoc = get_pcpu_associativity(disp_cpu);
+ vcpu_assoc = get_vcpu_associativity(vcpu_id);
+
+ if (!disp_cpu_assoc || !vcpu_assoc)
+ return -EIO;
+
+ return cpu_distance(disp_cpu_assoc, vcpu_assoc);
+}
+
+static void update_vcpu_disp_stat(int disp_cpu)
+{
+ struct vcpu_dispatch_data *disp;
+ int distance;
+
+ disp = this_cpu_ptr(&vcpu_disp_data);
+ if (disp->last_disp_cpu == -1) {
+ disp->last_disp_cpu = disp_cpu;
+ return;
+ }
+
+ disp->total_disp++;
+
+ if (disp->last_disp_cpu == disp_cpu ||
+ (cpu_first_thread_sibling(disp->last_disp_cpu) ==
+ cpu_first_thread_sibling(disp_cpu)))
+ disp->same_cpu_disp++;
+ else {
+ distance = cpu_relative_dispatch_distance(disp->last_disp_cpu,
+ disp_cpu);
+ if (distance < 0)
+ pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
+ smp_processor_id());
+ else {
+ switch (distance) {
+ case 0:
+ disp->same_chip_disp++;
+ break;
+ case 1:
+ disp->diff_chip_disp++;
+ break;
+ case 2:
+ disp->far_chip_disp++;
+ break;
+ default:
+ pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
+ smp_processor_id(),
+ disp->last_disp_cpu,
+ disp_cpu,
+ distance);
+ }
+ }
+ }
+
+ distance = cpu_home_node_dispatch_distance(disp_cpu);
+ if (distance < 0)
+ pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
+ smp_processor_id());
+ else {
+ switch (distance) {
+ case 0:
+ disp->numa_home_disp++;
+ break;
+ case 1:
+ disp->numa_remote_disp++;
+ break;
+ case 2:
+ disp->numa_far_disp++;
+ break;
+ default:
+ pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
+ smp_processor_id(),
+ disp_cpu,
+ distance);
+ }
+ }
+
+ disp->last_disp_cpu = disp_cpu;
+}
+
+static void process_dtl_buffer(struct work_struct *work)
+{
+ struct dtl_entry dtle;
+ u64 i = __this_cpu_read(dtl_entry_ridx);
+ struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
+ struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
+ struct lppaca *vpa = local_paca->lppaca_ptr;
+ struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);
+
+ if (!local_paca->dispatch_log)
+ return;
+
+ /* if we have been migrated away, we cancel ourselves */
+ if (d->cpu != smp_processor_id()) {
+ pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
+ smp_processor_id());
+ return;
+ }
+
+ if (i == be64_to_cpu(vpa->dtl_idx))
+ goto out;
+
+ while (i < be64_to_cpu(vpa->dtl_idx)) {
+ dtle = *dtl;
+ barrier();
+ if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
+ /* buffer has overflowed */
+ pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
+ d->cpu,
+ be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
+ i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
+ dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
+ continue;
+ }
+ update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
+ ++i;
+ ++dtl;
+ if (dtl == dtl_end)
+ dtl = local_paca->dispatch_log;
+ }
+
+ __this_cpu_write(dtl_entry_ridx, i);
+
+out:
+ schedule_delayed_work_on(d->cpu, to_delayed_work(work),
+ HZ / vcpudispatch_stats_freq);
+}
+
+static int dtl_worker_online(unsigned int cpu)
+{
+ struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
+
+ memset(d, 0, sizeof(*d));
+ INIT_DELAYED_WORK(&d->work, process_dtl_buffer);
+ d->cpu = cpu;
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ per_cpu(dtl_entry_ridx, cpu) = 0;
+ register_dtl_buffer(cpu);
+#else
+ per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
+#endif
+
+ schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq);
+ return 0;
+}
+
+static int dtl_worker_offline(unsigned int cpu)
+{
+ struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
+
+ cancel_delayed_work_sync(&d->work);
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ unregister_dtl(get_hard_smp_processor_id(cpu));
+#endif
+
+ return 0;
+}
+
+static void set_global_dtl_mask(u8 mask)
+{
+ int cpu;
+
+ dtl_mask = mask;
+ for_each_present_cpu(cpu)
+ lppaca_of(cpu).dtl_enable_mask = dtl_mask;
+}
+
+static void reset_global_dtl_mask(void)
+{
+ int cpu;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ dtl_mask = DTL_LOG_PREEMPT;
+#else
+ dtl_mask = 0;
+#endif
+ for_each_present_cpu(cpu)
+ lppaca_of(cpu).dtl_enable_mask = dtl_mask;
+}
+
+static int dtl_worker_enable(unsigned long *time_limit)
+{
+ int rc = 0, state;
+
+ if (!write_trylock(&dtl_access_lock)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ set_global_dtl_mask(DTL_LOG_ALL);
+
+ /* Setup dtl buffers and register those */
+ alloc_dtl_buffers(time_limit);
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
+ dtl_worker_online, dtl_worker_offline);
+ if (state < 0) {
+ pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
+ free_dtl_buffers(time_limit);
+ reset_global_dtl_mask();
+ write_unlock(&dtl_access_lock);
+ rc = -EINVAL;
+ goto out;
+ }
+ dtl_worker_state = state;
+
+out:
+ return rc;
+}
+
+static void dtl_worker_disable(unsigned long *time_limit)
+{
+ cpuhp_remove_state(dtl_worker_state);
+ free_dtl_buffers(time_limit);
+ reset_global_dtl_mask();
+ write_unlock(&dtl_access_lock);
+}
+
+static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
+ size_t count, loff_t *ppos)
+{
+ unsigned long time_limit = jiffies + HZ;
+ struct vcpu_dispatch_data *disp;
+ int rc, cmd, cpu;
+ char buf[16];
+
+ if (count > 15)
+ return -EINVAL;
+
+ if (copy_from_user(buf, p, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+ rc = kstrtoint(buf, 0, &cmd);
+ if (rc || cmd < 0 || cmd > 1) {
+ pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
+ return rc ? rc : -EINVAL;
+ }
+
+ mutex_lock(&dtl_enable_mutex);
+
+ if ((cmd == 0 && !vcpudispatch_stats_on) ||
+ (cmd == 1 && vcpudispatch_stats_on))
+ goto out;
+
+ if (cmd) {
+ rc = init_cpu_associativity();
+ if (rc)
+ goto out;
+
+ for_each_possible_cpu(cpu) {
+ disp = per_cpu_ptr(&vcpu_disp_data, cpu);
+ memset(disp, 0, sizeof(*disp));
+ disp->last_disp_cpu = -1;
+ }
+
+ rc = dtl_worker_enable(&time_limit);
+ if (rc) {
+ destroy_cpu_associativity();
+ goto out;
+ }
+ } else {
+ dtl_worker_disable(&time_limit);
+ destroy_cpu_associativity();
+ }
+
+ vcpudispatch_stats_on = cmd;
+
+out:
+ mutex_unlock(&dtl_enable_mutex);
+ if (rc)
+ return rc;
+ return count;
+}
+
+static int vcpudispatch_stats_display(struct seq_file *p, void *v)
+{
+ int cpu;
+ struct vcpu_dispatch_data *disp;
+
+ if (!vcpudispatch_stats_on) {
+ seq_puts(p, "off\n");
+ return 0;
+ }
+
+ for_each_online_cpu(cpu) {
+ disp = per_cpu_ptr(&vcpu_disp_data, cpu);
+ seq_printf(p, "cpu%d", cpu);
+ seq_put_decimal_ull(p, " ", disp->total_disp);
+ seq_put_decimal_ull(p, " ", disp->same_cpu_disp);
+ seq_put_decimal_ull(p, " ", disp->same_chip_disp);
+ seq_put_decimal_ull(p, " ", disp->diff_chip_disp);
+ seq_put_decimal_ull(p, " ", disp->far_chip_disp);
+ seq_put_decimal_ull(p, " ", disp->numa_home_disp);
+ seq_put_decimal_ull(p, " ", disp->numa_remote_disp);
+ seq_put_decimal_ull(p, " ", disp->numa_far_disp);
+ seq_puts(p, "\n");
+ }
+
+ return 0;
+}
+
+static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, vcpudispatch_stats_display, NULL);
+}
+
+static const struct file_operations vcpudispatch_stats_proc_ops = {
+ .open = vcpudispatch_stats_open,
+ .read = seq_read,
+ .write = vcpudispatch_stats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t vcpudispatch_stats_freq_write(struct file *file,
+ const char __user *p, size_t count, loff_t *ppos)
+{
+ int rc, freq;
+ char buf[16];
+
+ if (count > 15)
+ return -EINVAL;
+
+ if (copy_from_user(buf, p, count))
+ return -EFAULT;
+
+ buf[count] = 0;
+ rc = kstrtoint(buf, 0, &freq);
+ if (rc || freq < 1 || freq > HZ) {
+ pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
+ HZ);
+ return rc ? rc : -EINVAL;
+ }
+
+ vcpudispatch_stats_freq = freq;
+
+ return count;
+}
+
+static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
+{
+ seq_printf(p, "%d\n", vcpudispatch_stats_freq);
+ return 0;
+}
+
+static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, vcpudispatch_stats_freq_display, NULL);
+}
+
+static const struct file_operations vcpudispatch_stats_freq_proc_ops = {
+ .open = vcpudispatch_stats_freq_open,
+ .read = seq_read,
+ .write = vcpudispatch_stats_freq_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init vcpudispatch_stats_procfs_init(void)
+{
+ if (!lppaca_shared_proc(get_lppaca()))
+ return 0;
+
+ if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
+ &vcpudispatch_stats_proc_ops))
+ pr_err("vcpudispatch_stats: error creating procfs file\n");
+ else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
+ &vcpudispatch_stats_freq_proc_ops))
+ pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
+
+ return 0;
+}
+
+machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
+#endif /* CONFIG_PPC_SPLPAR */
+
void vpa_init(int cpu)
{
int hwcpu = get_hard_smp_processor_id(cpu);
unsigned long addr;
long ret;
- struct paca_struct *pp;
- struct dtl_entry *dtl;
/*
* The spec says it "may be problematic" if CPU x registers the VPA of
@@ -99,22 +681,7 @@ void vpa_init(int cpu)
/*
* Register dispatch trace log, if one has been allocated.
*/
- pp = paca_ptrs[cpu];
- dtl = pp->dispatch_log;
- if (dtl) {
- pp->dtl_ridx = 0;
- pp->dtl_curr = dtl;
- lppaca_of(cpu).dtl_idx = 0;
-
- /* hypervisor reads buffer length from this field */
- dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
- ret = register_dtl(hwcpu, __pa(dtl));
- if (ret)
- pr_err("WARNING: DTL registration of cpu %d (hw %d) "
- "failed with %ld\n", smp_processor_id(),
- hwcpu, ret);
- lppaca_of(cpu).dtl_enable_mask = 2;
- }
+ register_dtl_buffer(cpu);
}
#ifdef CONFIG_PPC_BOOK3S_64
@@ -846,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
return 0;
}
-/* Must be called in user context */
+/*
+ * Must be called in process context. The caller must hold the
+ * cpus_lock.
+ */
static int pseries_lpar_resize_hpt(unsigned long shift)
{
struct hpt_resize_state state = {
@@ -900,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
t1 = ktime_get();
- rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
+ rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
+ &state, NULL);
t2 = ktime_get();
@@ -960,16 +1531,24 @@ void __init hpte_init_pseries(void)
mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range;
mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all;
mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
- register_process_table = pseries_lpar_register_process_table;
if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
+
+ /*
+ * On POWER9, we need to do an H_REGISTER_PROC_TBL hcall
+ * to inform the hypervisor that we wish to use the HPT.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ pseries_lpar_register_process_table(0, 0, 0);
}
void radix_init_pseries(void)
{
pr_info("Using radix MMU under hypervisor\n");
- register_process_table = pseries_lpar_register_process_table;
+
+ pseries_lpar_register_process_table(__pa(process_tb),
+ 0, PRTB_SIZE_SHIFT - 12);
}
#ifdef CONFIG_PPC_SMLPAR
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index 0c48c8964783..b571285f6c14 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -6,8 +6,10 @@
* Copyright (C) 2010 IBM Corporation
*/
+#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
+#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/completion.h>
@@ -19,6 +21,7 @@
#include <asm/machdep.h>
#include <asm/rtas.h>
#include "pseries.h"
+#include "../../kernel/cacheinfo.h"
static struct kobject *mobility_kobj;
@@ -205,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope)
prop_data += vd;
}
+
+ cond_resched();
}
+
+ cond_resched();
} while (rtas_rc == 1);
of_node_put(dn);
@@ -308,8 +315,12 @@ int pseries_devicetree_update(s32 scope)
add_dt_node(phandle, drc_index);
break;
}
+
+ cond_resched();
}
}
+
+ cond_resched();
} while (rc == 1);
kfree(rtas_buf);
@@ -335,11 +346,28 @@ void post_mobility_fixup(void)
if (rc)
printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);
+ /*
+ * We don't want CPUs to go online/offline while the device
+ * tree is being updated.
+ */
+ cpus_read_lock();
+
+ /*
+ * It's common for the destination firmware to replace cache
+ * nodes. Release all of the cacheinfo hierarchy's references
+ * before updating the device tree.
+ */
+ cacheinfo_teardown();
+
rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc)
printk(KERN_ERR "Post-mobility device tree update "
"failed: %d\n", rc);
+ cacheinfo_rebuild();
+
+ cpus_read_unlock();
+
/* Possibly switch to a new RFI flush type */
pseries_setup_rfi_flush();
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 96c53b23e58f..a5ac371a3f06 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/libnvdimm.h>
#include <linux/platform_device.h>
+#include <linux/delay.h>
#include <asm/plpar_wrappers.h>
@@ -28,6 +29,7 @@ struct papr_scm_priv {
uint64_t blocks;
uint64_t block_size;
int metadata_size;
+ bool is_volatile;
uint64_t bound_addr;
@@ -42,8 +44,9 @@ struct papr_scm_priv {
static int drc_pmem_bind(struct papr_scm_priv *p)
{
unsigned long ret[PLPAR_HCALL_BUFSIZE];
- uint64_t rc, token;
uint64_t saved = 0;
+ uint64_t token;
+ int64_t rc;
/*
* When the hypervisor cannot map all the requested memory in a single
@@ -63,6 +66,10 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
} while (rc == H_BUSY);
if (rc) {
+ /* H_OVERLAP needs a separate error path */
+ if (rc == H_OVERLAP)
+ return -EBUSY;
+
dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
return -ENXIO;
}
@@ -77,61 +84,135 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
static int drc_pmem_unbind(struct papr_scm_priv *p)
{
unsigned long ret[PLPAR_HCALL_BUFSIZE];
- uint64_t rc, token;
+ uint64_t token = 0;
+ int64_t rc;
- token = 0;
+ dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index);
- /* NB: unbind has the same retry requirements mentioned above */
+ /* NB: unbind has the same retry requirements as drc_pmem_bind() */
do {
- rc = plpar_hcall(H_SCM_UNBIND_MEM, ret, p->drc_index,
- p->bound_addr, p->blocks, token);
+
+ /* Unbind of all SCM resources associated with drcIndex */
+ rc = plpar_hcall(H_SCM_UNBIND_ALL, ret, H_UNBIND_SCOPE_DRC,
+ p->drc_index, token);
token = ret[0];
- cond_resched();
+
+ /* Check if we are stalled for some time */
+ if (H_IS_LONG_BUSY(rc)) {
+ msleep(get_longbusy_msecs(rc));
+ rc = H_BUSY;
+ } else if (rc == H_BUSY) {
+ cond_resched();
+ }
+
} while (rc == H_BUSY);
if (rc)
dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);
+ else
+ dev_dbg(&p->pdev->dev, "unbind drc %x complete\n",
+ p->drc_index);
- return !!rc;
+ return rc == H_SUCCESS ? 0 : -ENXIO;
}
static int papr_scm_meta_get(struct papr_scm_priv *p,
- struct nd_cmd_get_config_data_hdr *hdr)
+ struct nd_cmd_get_config_data_hdr *hdr)
{
unsigned long data[PLPAR_HCALL_BUFSIZE];
+ unsigned long offset, data_offset;
+ int len, read;
int64_t ret;
- if (hdr->in_offset >= p->metadata_size || hdr->in_length != 1)
+ if ((hdr->in_offset + hdr->in_length) >= p->metadata_size)
return -EINVAL;
- ret = plpar_hcall(H_SCM_READ_METADATA, data, p->drc_index,
- hdr->in_offset, 1);
-
- if (ret == H_PARAMETER) /* bad DRC index */
- return -ENODEV;
- if (ret)
- return -EINVAL; /* other invalid parameter */
-
- hdr->out_buf[0] = data[0] & 0xff;
-
+ for (len = hdr->in_length; len; len -= read) {
+
+ data_offset = hdr->in_length - len;
+ offset = hdr->in_offset + data_offset;
+
+ if (len >= 8)
+ read = 8;
+ else if (len >= 4)
+ read = 4;
+ else if (len >= 2)
+ read = 2;
+ else
+ read = 1;
+
+ ret = plpar_hcall(H_SCM_READ_METADATA, data, p->drc_index,
+ offset, read);
+
+ if (ret == H_PARAMETER) /* bad DRC index */
+ return -ENODEV;
+ if (ret)
+ return -EINVAL; /* other invalid parameter */
+
+ switch (read) {
+ case 8:
+ *(uint64_t *)(hdr->out_buf + data_offset) = be64_to_cpu(data[0]);
+ break;
+ case 4:
+ *(uint32_t *)(hdr->out_buf + data_offset) = be32_to_cpu(data[0] & 0xffffffff);
+ break;
+
+ case 2:
+ *(uint16_t *)(hdr->out_buf + data_offset) = be16_to_cpu(data[0] & 0xffff);
+ break;
+
+ case 1:
+ *(uint8_t *)(hdr->out_buf + data_offset) = (data[0] & 0xff);
+ break;
+ }
+ }
return 0;
}
static int papr_scm_meta_set(struct papr_scm_priv *p,
- struct nd_cmd_set_config_hdr *hdr)
+ struct nd_cmd_set_config_hdr *hdr)
{
+ unsigned long offset, data_offset;
+ int len, wrote;
+ unsigned long data;
+ __be64 data_be;
int64_t ret;
- if (hdr->in_offset >= p->metadata_size || hdr->in_length != 1)
+ if ((hdr->in_offset + hdr->in_length) >= p->metadata_size)
return -EINVAL;
- ret = plpar_hcall_norets(H_SCM_WRITE_METADATA,
- p->drc_index, hdr->in_offset, hdr->in_buf[0], 1);
-
- if (ret == H_PARAMETER) /* bad DRC index */
- return -ENODEV;
- if (ret)
- return -EINVAL; /* other invalid parameter */
+ for (len = hdr->in_length; len; len -= wrote) {
+
+ data_offset = hdr->in_length - len;
+ offset = hdr->in_offset + data_offset;
+
+ if (len >= 8) {
+ data = *(uint64_t *)(hdr->in_buf + data_offset);
+ data_be = cpu_to_be64(data);
+ wrote = 8;
+ } else if (len >= 4) {
+ data = *(uint32_t *)(hdr->in_buf + data_offset);
+ data &= 0xffffffff;
+ data_be = cpu_to_be32(data);
+ wrote = 4;
+ } else if (len >= 2) {
+ data = *(uint16_t *)(hdr->in_buf + data_offset);
+ data &= 0xffff;
+ data_be = cpu_to_be16(data);
+ wrote = 2;
+ } else {
+ data_be = *(uint8_t *)(hdr->in_buf + data_offset);
+ data_be &= 0xff;
+ wrote = 1;
+ }
+
+ ret = plpar_hcall_norets(H_SCM_WRITE_METADATA, p->drc_index,
+ offset, data_be, wrote);
+ if (ret == H_PARAMETER) /* bad DRC index */
+ return -ENODEV;
+ if (ret)
+ return -EINVAL; /* other invalid parameter */
+ }
return 0;
}
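Editor's note: the reworked papr_scm_meta_get()/papr_scm_meta_set() above split each request into the largest power-of-two transfers (8, 4, 2, 1 bytes) that H_SCM_READ_METADATA/H_SCM_WRITE_METADATA accept. The standalone sketch below is not part of the patch; it just traces that decomposition for a hypothetical 15-byte read at offset 3.

/*
 * Illustration only: how the read loop splits a hypothetical request
 * into individual hcall-sized transfers.
 */
#include <stdio.h>

int main(void)
{
	unsigned long in_offset = 3, in_length = 15;	/* assumed request */
	unsigned long offset, data_offset;
	int len, read;

	for (len = in_length; len; len -= read) {
		data_offset = in_length - len;
		offset = in_offset + data_offset;

		if (len >= 8)
			read = 8;
		else if (len >= 4)
			read = 4;
		else if (len >= 2)
			read = 2;
		else
			read = 1;

		/* prints: 8 bytes @3, 4 bytes @11, 2 bytes @15, 1 byte @17 */
		printf("%d byte(s) at offset %lu\n", read, offset);
	}
	return 0;
}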
@@ -153,7 +234,7 @@ int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
get_size_hdr = buf;
get_size_hdr->status = 0;
- get_size_hdr->max_xfer = 1;
+ get_size_hdr->max_xfer = 8;
get_size_hdr->config_size = p->metadata_size;
*cmd_rc = 0;
break;
@@ -194,12 +275,32 @@ static const struct attribute_group *papr_scm_dimm_groups[] = {
NULL,
};
+static inline int papr_scm_node(int node)
+{
+ int min_dist = INT_MAX, dist;
+ int nid, min_node;
+
+ if ((node == NUMA_NO_NODE) || node_online(node))
+ return node;
+
+ min_node = first_online_node;
+ for_each_online_node(nid) {
+ dist = node_distance(node, nid);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = nid;
+ }
+ }
+ return min_node;
+}
+
static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
{
struct device *dev = &p->pdev->dev;
struct nd_mapping_desc mapping;
struct nd_region_desc ndr_desc;
unsigned long dimm_flags;
+ int target_nid, online_nid;
p->bus_desc.ndctl = papr_scm_ndctl;
p->bus_desc.module = THIS_MODULE;
@@ -238,8 +339,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.attr_groups = region_attr_groups;
- ndr_desc.numa_node = dev_to_node(&p->pdev->dev);
- ndr_desc.target_node = ndr_desc.numa_node;
+ target_nid = dev_to_node(&p->pdev->dev);
+ online_nid = papr_scm_node(target_nid);
+ ndr_desc.numa_node = online_nid;
+ ndr_desc.target_node = target_nid;
ndr_desc.res = &p->res;
ndr_desc.of_node = p->dn;
ndr_desc.provider_data = p;
@@ -248,12 +351,18 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
ndr_desc.nd_set = &p->nd_set;
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
- p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc);
+ if (p->is_volatile)
+ p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc);
+ else
+ p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc);
if (!p->region) {
dev_err(dev, "Error registering region %pR from %pOF\n",
ndr_desc.res, p->dn);
goto err;
}
+ if (target_nid != online_nid)
+		dev_info(dev, "Region registered with target node %d and online node %d\n",
+ target_nid, online_nid);
return 0;
@@ -293,6 +402,7 @@ static int papr_scm_probe(struct platform_device *pdev)
return -ENODEV;
}
+
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
@@ -304,11 +414,19 @@ static int papr_scm_probe(struct platform_device *pdev)
p->drc_index = drc_index;
p->block_size = block_size;
p->blocks = blocks;
+ p->is_volatile = !of_property_read_bool(dn, "ibm,cache-flush-required");
/* We just need to ensure that set cookies are unique across */
uuid_parse(uuid_str, (uuid_t *) uuid);
- p->nd_set.cookie1 = uuid[0];
- p->nd_set.cookie2 = uuid[1];
+ /*
+	 * cookie1 and cookie2 are not really little endian;
+	 * we store a little endian representation of the
+	 * uuid str so that we can compare this with the label
+	 * area cookie irrespective of the endianness with which
+	 * the kernel is built.
+ */
+ p->nd_set.cookie1 = cpu_to_le64(uuid[0]);
+ p->nd_set.cookie2 = cpu_to_le64(uuid[1]);
/* might be zero */
p->metadata_size = metadata_size;
@@ -316,6 +434,14 @@ static int papr_scm_probe(struct platform_device *pdev)
/* request the hypervisor to bind this region to somewhere in memory */
rc = drc_pmem_bind(p);
+
+	/* If phyp says the drc memory is still bound then force an unbind and retry */
+ if (rc == -EBUSY) {
+ dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
+ drc_pmem_unbind(p);
+ rc = drc_pmem_bind(p);
+ }
+
if (rc)
goto err;
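
The papr_scm metadata read/write rework above transfers data in the largest chunk the H_SCM_READ_METADATA/H_SCM_WRITE_METADATA hcalls accept (8, 4, 2 or 1 bytes) and advertises max_xfer = 8 accordingly. A minimal user-space sketch of the same chunking walk, for illustration only (no hcall is made; write_chunk() is a made-up stand-in):

/* Hedged sketch of the 8/4/2/1-byte chunking used by papr_scm_meta_set()
 * above. write_chunk() stands in for the real hcall. */
#include <stdio.h>
#include <stddef.h>

static void write_chunk(size_t offset, size_t nbytes)
{
	printf("write %zu bytes at offset %zu\n", nbytes, offset);
}

static void write_all(size_t start, size_t len)
{
	size_t done = 0;

	while (done < len) {
		size_t left = len - done;
		size_t chunk = left >= 8 ? 8 : left >= 4 ? 4 : left >= 2 ? 2 : 1;

		write_chunk(start + done, chunk);
		done += chunk;
	}
}

int main(void)
{
	write_all(0, 11);	/* expect chunks of 8, 2 and 1 bytes */
	return 0;
}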
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 1eae1d09980c..722830978639 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void)
pSeries_request_regions();
- eeh_probe_devices();
- eeh_addr_cache_build();
+ eeh_show_enabled();
#ifdef CONFIG_PCI_IOV
ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f16fdd0f71f7..3acdcc3bb908 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -76,6 +76,7 @@ struct pseries_mc_errorlog {
#define MC_ERROR_TYPE_UE 0x00
#define MC_ERROR_TYPE_SLB 0x01
#define MC_ERROR_TYPE_ERAT 0x02
+#define MC_ERROR_TYPE_UNKNOWN 0x03
#define MC_ERROR_TYPE_TLB 0x04
#define MC_ERROR_TYPE_D_CACHE 0x05
#define MC_ERROR_TYPE_I_CACHE 0x07
@@ -87,6 +88,9 @@ struct pseries_mc_errorlog {
#define MC_ERROR_UE_LOAD_STORE 3
#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4
+#define UE_EFFECTIVE_ADDR_PROVIDED 0x40
+#define UE_LOGICAL_ADDR_PROVIDED 0x20
+
#define MC_ERROR_SLB_PARITY 0
#define MC_ERROR_SLB_MULTIHIT 1
#define MC_ERROR_SLB_INDETERMINATE 2
@@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
}
}
-static
-inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog)
-{
- __be64 addr = 0;
-
- switch (mlog->error_type) {
- case MC_ERROR_TYPE_UE:
- if (mlog->sub_err_type & 0x40)
- addr = mlog->effective_address;
- break;
- case MC_ERROR_TYPE_SLB:
- case MC_ERROR_TYPE_ERAT:
- case MC_ERROR_TYPE_TLB:
- if (mlog->sub_err_type & 0x80)
- addr = mlog->effective_address;
- default:
- break;
- }
- return be64_to_cpu(addr);
-}
-
/*
* Enable the hotplug interrupt late because processing them may touch other
* devices or systems (e.g. hugepages) that have not been initialized at the
@@ -511,160 +494,165 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
}
-#define VAL_TO_STRING(ar, val) \
- (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown")
-static void pseries_print_mce_info(struct pt_regs *regs,
- struct rtas_error_log *errp)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
{
- const char *level, *sevstr;
+ struct mce_error_info mce_err = { 0 };
+ unsigned long eaddr = 0, paddr = 0;
struct pseries_errorlog *pseries_log;
struct pseries_mc_errorlog *mce_log;
- u8 error_type, err_sub_type;
- u64 addr;
- u8 initiator = rtas_error_initiator(errp);
int disposition = rtas_error_disposition(errp);
+ int initiator = rtas_error_initiator(errp);
+ int severity = rtas_error_severity(errp);
+ u8 error_type, err_sub_type;
- static const char * const initiators[] = {
- [0] = "Unknown",
- [1] = "CPU",
- [2] = "PCI",
- [3] = "ISA",
- [4] = "Memory",
- [5] = "Power Mgmt",
- };
- static const char * const mc_err_types[] = {
- [0] = "UE",
- [1] = "SLB",
- [2] = "ERAT",
- [3] = "Unknown",
- [4] = "TLB",
- [5] = "D-Cache",
- [6] = "Unknown",
- [7] = "I-Cache",
- };
- static const char * const mc_ue_types[] = {
- [0] = "Indeterminate",
- [1] = "Instruction fetch",
- [2] = "Page table walk ifetch",
- [3] = "Load/Store",
- [4] = "Page table walk Load/Store",
- };
-
- /* SLB sub errors valid values are 0x0, 0x1, 0x2 */
- static const char * const mc_slb_types[] = {
- [0] = "Parity",
- [1] = "Multihit",
- [2] = "Indeterminate",
- };
-
- /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
- static const char * const mc_soft_types[] = {
- [0] = "Unknown",
- [1] = "Parity",
- [2] = "Multihit",
- [3] = "Indeterminate",
- };
-
- if (!rtas_error_extended(errp)) {
- pr_err("Machine check interrupt: Missing extended error log\n");
- return;
- }
+ if (initiator == RTAS_INITIATOR_UNKNOWN)
+ mce_err.initiator = MCE_INITIATOR_UNKNOWN;
+ else if (initiator == RTAS_INITIATOR_CPU)
+ mce_err.initiator = MCE_INITIATOR_CPU;
+ else if (initiator == RTAS_INITIATOR_PCI)
+ mce_err.initiator = MCE_INITIATOR_PCI;
+ else if (initiator == RTAS_INITIATOR_ISA)
+ mce_err.initiator = MCE_INITIATOR_ISA;
+ else if (initiator == RTAS_INITIATOR_MEMORY)
+ mce_err.initiator = MCE_INITIATOR_MEMORY;
+ else if (initiator == RTAS_INITIATOR_POWERMGM)
+ mce_err.initiator = MCE_INITIATOR_POWERMGM;
+ else
+ mce_err.initiator = MCE_INITIATOR_UNKNOWN;
+
+ if (severity == RTAS_SEVERITY_NO_ERROR)
+ mce_err.severity = MCE_SEV_NO_ERROR;
+ else if (severity == RTAS_SEVERITY_EVENT)
+ mce_err.severity = MCE_SEV_WARNING;
+ else if (severity == RTAS_SEVERITY_WARNING)
+ mce_err.severity = MCE_SEV_WARNING;
+ else if (severity == RTAS_SEVERITY_ERROR_SYNC)
+ mce_err.severity = MCE_SEV_SEVERE;
+ else if (severity == RTAS_SEVERITY_ERROR)
+ mce_err.severity = MCE_SEV_SEVERE;
+ else if (severity == RTAS_SEVERITY_FATAL)
+ mce_err.severity = MCE_SEV_FATAL;
+ else
+ mce_err.severity = MCE_SEV_FATAL;
+
+ if (severity <= RTAS_SEVERITY_ERROR_SYNC)
+ mce_err.sync_error = true;
+ else
+ mce_err.sync_error = false;
+
+ mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
+ mce_err.error_class = MCE_ECLASS_UNKNOWN;
+
+ if (!rtas_error_extended(errp))
+ goto out;
pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
if (pseries_log == NULL)
- return;
+ goto out;
mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-
error_type = mce_log->error_type;
err_sub_type = rtas_mc_error_sub_type(mce_log);
- switch (rtas_error_severity(errp)) {
- case RTAS_SEVERITY_NO_ERROR:
- level = KERN_INFO;
- sevstr = "Harmless";
- break;
- case RTAS_SEVERITY_WARNING:
- level = KERN_WARNING;
- sevstr = "";
- break;
- case RTAS_SEVERITY_ERROR:
- case RTAS_SEVERITY_ERROR_SYNC:
- level = KERN_ERR;
- sevstr = "Severe";
- break;
- case RTAS_SEVERITY_FATAL:
- default:
- level = KERN_ERR;
- sevstr = "Fatal";
- break;
- }
+ switch (mce_log->error_type) {
+ case MC_ERROR_TYPE_UE:
+ mce_err.error_type = MCE_ERROR_TYPE_UE;
+ switch (err_sub_type) {
+ case MC_ERROR_UE_IFETCH:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
+ break;
+ case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+ break;
+ case MC_ERROR_UE_LOAD_STORE:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+ break;
+ case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+ break;
+ case MC_ERROR_UE_INDETERMINATE:
+ default:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
+ eaddr = be64_to_cpu(mce_log->effective_address);
-#ifdef CONFIG_PPC_BOOK3S_64
- /* Display faulty slb contents for SLB errors. */
- if (error_type == MC_ERROR_TYPE_SLB)
- slb_dump_contents(local_paca->mce_faulty_slbs);
-#endif
+ if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
+ paddr = be64_to_cpu(mce_log->logical_address);
+ } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
+ unsigned long pfn;
- printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
- disposition == RTAS_DISP_FULLY_RECOVERED ?
- "Recovered" : "Not recovered");
- if (user_mode(regs)) {
- printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level,
- regs->nip, current->pid, current->comm);
- } else {
- printk("%s NIP [%016lx]: %pS\n", level, regs->nip,
- (void *)regs->nip);
- }
- printk("%s Initiator: %s\n", level,
- VAL_TO_STRING(initiators, initiator));
+ pfn = addr_to_pfn(regs, eaddr);
+ if (pfn != ULONG_MAX)
+ paddr = pfn << PAGE_SHIFT;
+ }
- switch (error_type) {
- case MC_ERROR_TYPE_UE:
- printk("%s Error type: %s [%s]\n", level,
- VAL_TO_STRING(mc_err_types, error_type),
- VAL_TO_STRING(mc_ue_types, err_sub_type));
break;
case MC_ERROR_TYPE_SLB:
- printk("%s Error type: %s [%s]\n", level,
- VAL_TO_STRING(mc_err_types, error_type),
- VAL_TO_STRING(mc_slb_types, err_sub_type));
+ mce_err.error_type = MCE_ERROR_TYPE_SLB;
+ switch (err_sub_type) {
+ case MC_ERROR_SLB_PARITY:
+ mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY;
+ break;
+ case MC_ERROR_SLB_MULTIHIT:
+ mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+ break;
+ case MC_ERROR_SLB_INDETERMINATE:
+ default:
+ mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & 0x80)
+ eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_ERAT:
+ mce_err.error_type = MCE_ERROR_TYPE_ERAT;
+ switch (err_sub_type) {
+ case MC_ERROR_ERAT_PARITY:
+ mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY;
+ break;
+ case MC_ERROR_ERAT_MULTIHIT:
+ mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+ break;
+ case MC_ERROR_ERAT_INDETERMINATE:
+ default:
+ mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & 0x80)
+ eaddr = be64_to_cpu(mce_log->effective_address);
+ break;
case MC_ERROR_TYPE_TLB:
- printk("%s Error type: %s [%s]\n", level,
- VAL_TO_STRING(mc_err_types, error_type),
- VAL_TO_STRING(mc_soft_types, err_sub_type));
+ mce_err.error_type = MCE_ERROR_TYPE_TLB;
+ switch (err_sub_type) {
+ case MC_ERROR_TLB_PARITY:
+ mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY;
+ break;
+ case MC_ERROR_TLB_MULTIHIT:
+ mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+ break;
+ case MC_ERROR_TLB_INDETERMINATE:
+ default:
+ mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & 0x80)
+ eaddr = be64_to_cpu(mce_log->effective_address);
+ break;
+ case MC_ERROR_TYPE_D_CACHE:
+ mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
+ case MC_ERROR_TYPE_I_CACHE:
+		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
+ break;
+ case MC_ERROR_TYPE_UNKNOWN:
default:
- printk("%s Error type: %s\n", level,
- VAL_TO_STRING(mc_err_types, error_type));
+ mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
break;
}
- addr = rtas_mc_get_effective_addr(mce_log);
- if (addr)
- printk("%s Effective address: %016llx\n", level, addr);
-}
-
-static int mce_handle_error(struct rtas_error_log *errp)
-{
- struct pseries_errorlog *pseries_log;
- struct pseries_mc_errorlog *mce_log;
- int disposition = rtas_error_disposition(errp);
- u8 error_type;
-
- if (!rtas_error_extended(errp))
- goto out;
-
- pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
- if (pseries_log == NULL)
- goto out;
-
- mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
- error_type = mce_log->error_type;
-
#ifdef CONFIG_PPC_BOOK3S_64
if (disposition == RTAS_DISP_NOT_RECOVERED) {
switch (error_type) {
@@ -682,98 +670,24 @@ static int mce_handle_error(struct rtas_error_log *errp)
slb_save_contents(local_paca->mce_faulty_slbs);
flush_and_reload_slb();
disposition = RTAS_DISP_FULLY_RECOVERED;
- rtas_set_disposition_recovered(errp);
break;
default:
break;
}
+ } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+ /* Platform corrected itself but could be degraded */
+ printk(KERN_ERR "MCE: limited recovery, system may "
+ "be degraded\n");
+ disposition = RTAS_DISP_FULLY_RECOVERED;
}
#endif
out:
- return disposition;
-}
-
-#ifdef CONFIG_MEMORY_FAILURE
-
-static DEFINE_PER_CPU(int, rtas_ue_count);
-static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]);
+ save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
+ &mce_err, regs->nip, eaddr, paddr);
-#define UE_EFFECTIVE_ADDR_PROVIDED 0x40
-#define UE_LOGICAL_ADDR_PROVIDED 0x20
-
-
-static void pseries_hwpoison_work_fn(struct work_struct *work)
-{
- unsigned long paddr;
- int index;
-
- while (__this_cpu_read(rtas_ue_count) > 0) {
- index = __this_cpu_read(rtas_ue_count) - 1;
- paddr = __this_cpu_read(rtas_ue_paddr[index]);
- memory_failure(paddr >> PAGE_SHIFT, 0);
- __this_cpu_dec(rtas_ue_count);
- }
-}
-
-static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn);
-
-static void queue_ue_paddr(unsigned long paddr)
-{
- int index;
-
- index = __this_cpu_inc_return(rtas_ue_count) - 1;
- if (index >= MAX_MC_EVT) {
- __this_cpu_dec(rtas_ue_count);
- return;
- }
- this_cpu_write(rtas_ue_paddr[index], paddr);
- schedule_work(&hwpoison_work);
-}
-
-static void pseries_do_memory_failure(struct pt_regs *regs,
- struct pseries_mc_errorlog *mce_log)
-{
- unsigned long paddr;
-
- if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
- paddr = be64_to_cpu(mce_log->logical_address);
- } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
- unsigned long pfn;
-
- pfn = addr_to_pfn(regs,
- be64_to_cpu(mce_log->effective_address));
- if (pfn == ULONG_MAX)
- return;
- paddr = pfn << PAGE_SHIFT;
- } else {
- return;
- }
- queue_ue_paddr(paddr);
-}
-
-static void pseries_process_ue(struct pt_regs *regs,
- struct rtas_error_log *errp)
-{
- struct pseries_errorlog *pseries_log;
- struct pseries_mc_errorlog *mce_log;
-
- if (!rtas_error_extended(errp))
- return;
-
- pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
- if (!pseries_log)
- return;
-
- mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-
- if (mce_log->error_type == MC_ERROR_TYPE_UE)
- pseries_do_memory_failure(regs, mce_log);
+ return disposition;
}
-#else
-static inline void pseries_process_ue(struct pt_regs *regs,
- struct rtas_error_log *errp) { }
-#endif /*CONFIG_MEMORY_FAILURE */
/*
* Process MCE rtas errlog event.
@@ -795,49 +709,51 @@ static void mce_process_errlog_event(struct irq_work *work)
* Return 1 if corrected (or delivered a signal).
* Return 0 if there is nothing we can do.
*/
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
+static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
{
int recovered = 0;
- int disposition = rtas_error_disposition(err);
-
- pseries_print_mce_info(regs, err);
if (!(regs->msr & MSR_RI)) {
/* If MSR_RI isn't set, we cannot recover */
pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
recovered = 0;
-
- } else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
+ } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
/* Platform corrected itself */
recovered = 1;
+ } else if (evt->severity == MCE_SEV_FATAL) {
+ /* Fatal machine check */
+ pr_err("Machine check interrupt is fatal\n");
+ recovered = 0;
+ }
- } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
- /* Platform corrected itself but could be degraded */
- printk(KERN_ERR "MCE: limited recovery, system may "
- "be degraded\n");
- recovered = 1;
-
- } else if (user_mode(regs) && !is_global_init(current) &&
- rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
-
+ if (!recovered && evt->sync_error) {
/*
- * If we received a synchronous error when in userspace
- * kill the task. Firmware may report details of the fail
- * asynchronously, so we can't rely on the target and type
- * fields being valid here.
+ * Try to kill processes if we get a synchronous machine check
+ * (e.g., one caused by execution of this instruction). This
+ * will devolve into a panic if we try to kill init or are in
+ * an interrupt etc.
+ *
+		 * TODO: Queue up this address for hwpoisoning later.
+		 * TODO: This is not quite right for d-side machine
+		 * checks, ->nip is not necessarily the important
+		 * address.
*/
- printk(KERN_ERR "MCE: uncorrectable error, killing task "
- "%s:%d\n", current->comm, current->pid);
-
- _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
- recovered = 1;
+ if ((user_mode(regs))) {
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ } else if (die_will_crash()) {
+ /*
+ * die() would kill the kernel, so better to go via
+ * the platform reboot code that will log the
+ * machine check.
+ */
+ recovered = 0;
+ } else {
+ die("Machine check", regs, SIGBUS);
+ recovered = 1;
+ }
}
- pseries_process_ue(regs, err);
-
- /* Queue irq work to log this rtas event later. */
- irq_work_queue(&mce_errlog_process_work);
-
return recovered;
}
@@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
*/
int pSeries_machine_check_exception(struct pt_regs *regs)
{
- struct rtas_error_log *errp;
+ struct machine_check_event evt;
- if (fwnmi_active) {
- fwnmi_release_errinfo();
- errp = fwnmi_get_errlog();
- if (errp && recover_mce(regs, errp))
- return 1;
+ if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+ return 0;
+
+ /* Print things out */
+ if (evt.version != MCE_V1) {
+ pr_err("Machine Check Exception, Unknown event version %d !\n",
+ evt.version);
+ return 0;
}
+ machine_check_print_event_info(&evt, user_mode(regs), false);
+
+ if (recover_mce(regs, &evt))
+ return 1;
return 0;
}
@@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs)
* to panic. Hence we will call it as soon as we go into
* virtual mode.
*/
- disposition = mce_handle_error(errp);
+ disposition = mce_handle_error(regs, errp);
+ fwnmi_release_errinfo();
+
+ /* Queue irq work to log this rtas event later. */
+ irq_work_queue(&mce_errlog_process_work);
+
if (disposition == RTAS_DISP_FULLY_RECOVERED)
return 1;
}
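
In the reworked mce_handle_error() above, the UE sub-error byte carries flag bits indicating whether the effective address (0x40) and the logical address (0x20) fields of the error log are valid. A small stand-alone sketch of that flag decode, with illustrative values rather than the kernel structures:

#include <stdio.h>
#include <stdint.h>

#define UE_EFFECTIVE_ADDR_PROVIDED	0x40
#define UE_LOGICAL_ADDR_PROVIDED	0x20

/* Hedged sketch: report which address fields a UE error log claims to
 * provide, mirroring the checks in mce_handle_error() above. */
static void decode_ue_flags(uint8_t sub_err_type)
{
	printf("effective address: %s, logical address: %s\n",
	       (sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) ? "valid" : "absent",
	       (sub_err_type & UE_LOGICAL_ADDR_PROVIDED) ? "valid" : "absent");
}

int main(void)
{
	decode_ue_flags(0x60);	/* both addresses provided */
	decode_ue_flags(0x40);	/* only the effective address */
	return 0;
}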
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
new file mode 100644
index 000000000000..70c3013fdd07
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWERVM platform.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "rtas fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+#include <asm/fadump-internal.h>
+
+#include "rtas-fadump.h"
+
+static struct rtas_fadump_mem_struct fdm;
+static const struct rtas_fadump_mem_struct *fdm_active;
+
+static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
+ const struct rtas_fadump_mem_struct *fdm)
+{
+ fadump_conf->boot_mem_dest_addr =
+ be64_to_cpu(fdm->rmr_region.destination_address);
+
+ fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr +
+ fadump_conf->boot_memory_size);
+}
+
+/*
+ * This function is called in the capture kernel to get configuration details
+ * setup in the first kernel and passed to the f/w.
+ */
+static void rtas_fadump_get_config(struct fw_dump *fadump_conf,
+ const struct rtas_fadump_mem_struct *fdm)
+{
+ fadump_conf->boot_mem_addr[0] =
+ be64_to_cpu(fdm->rmr_region.source_address);
+ fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len);
+ fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0];
+
+ fadump_conf->boot_mem_top = fadump_conf->boot_memory_size;
+ fadump_conf->boot_mem_regs_cnt = 1;
+
+ /*
+ * Start address of reserve dump area (permanent reservation) for
+ * re-registering FADump after dump capture.
+ */
+ fadump_conf->reserve_dump_area_start =
+ be64_to_cpu(fdm->cpu_state_data.destination_address);
+
+ rtas_fadump_update_config(fadump_conf, fdm);
+}
+
+static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+ u64 addr = fadump_conf->reserve_dump_area_start;
+
+ memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct));
+ addr = addr & PAGE_MASK;
+
+ fdm.header.dump_format_version = cpu_to_be32(0x00000001);
+ fdm.header.dump_num_sections = cpu_to_be16(3);
+ fdm.header.dump_status_flag = 0;
+ fdm.header.offset_first_dump_section =
+ cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct,
+ cpu_state_data));
+
+ /*
+ * Fields for disk dump option.
+ * We are not using disk dump option, hence set these fields to 0.
+ */
+ fdm.header.dd_block_size = 0;
+ fdm.header.dd_block_offset = 0;
+ fdm.header.dd_num_blocks = 0;
+ fdm.header.dd_offset_disk_path = 0;
+
+ /* set 0 to disable an automatic dump-reboot. */
+ fdm.header.max_time_auto = 0;
+
+ /* Kernel dump sections */
+ /* cpu state data section. */
+ fdm.cpu_state_data.request_flag =
+ cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+ fdm.cpu_state_data.source_data_type =
+ cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA);
+ fdm.cpu_state_data.source_address = 0;
+ fdm.cpu_state_data.source_len =
+ cpu_to_be64(fadump_conf->cpu_state_data_size);
+ fdm.cpu_state_data.destination_address = cpu_to_be64(addr);
+ addr += fadump_conf->cpu_state_data_size;
+
+ /* hpte region section */
+ fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+ fdm.hpte_region.source_data_type =
+ cpu_to_be16(RTAS_FADUMP_HPTE_REGION);
+ fdm.hpte_region.source_address = 0;
+ fdm.hpte_region.source_len =
+ cpu_to_be64(fadump_conf->hpte_region_size);
+ fdm.hpte_region.destination_address = cpu_to_be64(addr);
+ addr += fadump_conf->hpte_region_size;
+
+ /* RMA region section */
+ fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+ fdm.rmr_region.source_data_type =
+ cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
+ fdm.rmr_region.source_address = cpu_to_be64(0);
+ fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size);
+ fdm.rmr_region.destination_address = cpu_to_be64(addr);
+ addr += fadump_conf->boot_memory_size;
+
+ rtas_fadump_update_config(fadump_conf, &fdm);
+
+ return addr;
+}
+
+static u64 rtas_fadump_get_bootmem_min(void)
+{
+ return RTAS_FADUMP_MIN_BOOT_MEM;
+}
+
+static int rtas_fadump_register(struct fw_dump *fadump_conf)
+{
+ unsigned int wait_time;
+ int rc, err = -EIO;
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+ NULL, FADUMP_REGISTER, &fdm,
+ sizeof(struct rtas_fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+
+ } while (wait_time);
+
+ switch (rc) {
+ case 0:
+ pr_info("Registration is successful!\n");
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case -1:
+ pr_err("Failed to register. Hardware Error(%d).\n", rc);
+ break;
+ case -3:
+ if (!is_fadump_boot_mem_contiguous())
+ pr_err("Can't have holes in boot memory area.\n");
+ else if (!is_fadump_reserved_mem_contiguous())
+ pr_err("Can't have holes in reserved memory area.\n");
+
+ pr_err("Failed to register. Parameter Error(%d).\n", rc);
+ err = -EINVAL;
+ break;
+ case -9:
+ pr_err("Already registered!\n");
+ fadump_conf->dump_registered = 1;
+ err = -EEXIST;
+ break;
+ default:
+ pr_err("Failed to register. Unknown Error(%d).\n", rc);
+ break;
+ }
+
+ return err;
+}
+
+static int rtas_fadump_unregister(struct fw_dump *fadump_conf)
+{
+ unsigned int wait_time;
+ int rc;
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+ NULL, FADUMP_UNREGISTER, &fdm,
+ sizeof(struct rtas_fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+ } while (wait_time);
+
+ if (rc) {
+ pr_err("Failed to un-register - unexpected error(%d).\n", rc);
+ return -EIO;
+ }
+
+ fadump_conf->dump_registered = 0;
+ return 0;
+}
+
+static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+ unsigned int wait_time;
+ int rc;
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+ NULL, FADUMP_INVALIDATE, fdm_active,
+ sizeof(struct rtas_fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+ } while (wait_time);
+
+ if (rc) {
+ pr_err("Failed to invalidate - unexpected error (%d).\n", rc);
+ return -EIO;
+ }
+
+ fadump_conf->dump_active = 0;
+ fdm_active = NULL;
+ return 0;
+}
+
+#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000
+static inline int rtas_fadump_gpr_index(u64 id)
+{
+ char str[3];
+ int i = -1;
+
+ if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) {
+ /* get the digits at the end */
+ id &= ~RTAS_FADUMP_GPR_MASK;
+ id >>= 24;
+ str[2] = '\0';
+ str[1] = id & 0xff;
+ str[0] = (id >> 8) & 0xff;
+ if (kstrtoint(str, 10, &i))
+ i = -EINVAL;
+ if (i > 31)
+ i = -1;
+ }
+ return i;
+}
+
+void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val)
+{
+ int i;
+
+ i = rtas_fadump_gpr_index(reg_id);
+ if (i >= 0)
+ regs->gpr[i] = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("NIA"))
+ regs->nip = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("MSR"))
+ regs->msr = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("CTR"))
+ regs->ctr = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("LR"))
+ regs->link = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("XER"))
+ regs->xer = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("CR"))
+ regs->ccr = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("DAR"))
+ regs->dar = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("DSISR"))
+ regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct rtas_fadump_reg_entry*
+rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry,
+ struct pt_regs *regs)
+{
+ memset(regs, 0, sizeof(struct pt_regs));
+
+ while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) {
+ rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
+ be64_to_cpu(reg_entry->reg_value));
+ reg_entry++;
+ }
+ reg_entry++;
+ return reg_entry;
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is 16 bytes: an
+ * 8-byte ASCII identifier followed by an 8-byte register value. The entries
+ * with identifiers "CPUSTRT" and "CPUEND" carry a 4-byte cpu id as part of
+ * the register value. For more details, refer to the PAPR document.
+ *
+ * For the crashing cpu only, the CPU dump data is ignored and the exact
+ * state is taken from the fadump crash info structure populated by the
+ * first kernel at the time of crash.
+ */
+static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
+{
+ struct rtas_fadump_reg_save_area_header *reg_header;
+ struct fadump_crash_info_header *fdh = NULL;
+ struct rtas_fadump_reg_entry *reg_entry;
+ u32 num_cpus, *note_buf;
+ int i, rc = 0, cpu = 0;
+ struct pt_regs regs;
+ unsigned long addr;
+ void *vaddr;
+
+ addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
+ vaddr = __va(addr);
+
+ reg_header = vaddr;
+ if (be64_to_cpu(reg_header->magic_number) !=
+ fadump_str_to_u64("REGSAVE")) {
+ pr_err("Unable to read register save area.\n");
+ return -ENOENT;
+ }
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
+ pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
+
+ vaddr += be32_to_cpu(reg_header->num_cpu_offset);
+ num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
+ pr_debug("NumCpus : %u\n", num_cpus);
+ vaddr += sizeof(u32);
+ reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
+
+ rc = fadump_setup_cpu_notes_buf(num_cpus);
+ if (rc != 0)
+ return rc;
+
+ note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+
+ if (fadump_conf->fadumphdr_addr)
+ fdh = __va(fadump_conf->fadumphdr_addr);
+
+ for (i = 0; i < num_cpus; i++) {
+ if (be64_to_cpu(reg_entry->reg_id) !=
+ fadump_str_to_u64("CPUSTRT")) {
+ pr_err("Unable to read CPU state data\n");
+ rc = -ENOENT;
+ goto error_out;
+ }
+ /* Lower 4 bytes of reg_value contains logical cpu id */
+ cpu = (be64_to_cpu(reg_entry->reg_value) &
+ RTAS_FADUMP_CPU_ID_MASK);
+ if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
+ RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+ continue;
+ }
+ pr_debug("Reading register data for cpu %d...\n", cpu);
+ if (fdh && fdh->crashing_cpu == cpu) {
+ regs = fdh->regs;
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+ } else {
+ reg_entry++;
+ reg_entry = rtas_fadump_read_regs(reg_entry, &regs);
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ }
+ }
+ final_note(note_buf);
+
+ if (fdh) {
+ pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+ fdh->elfcorehdr_addr);
+ fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+ }
+ return 0;
+
+error_out:
+ fadump_free_cpu_notes_buf();
+ return rc;
+
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
+{
+ struct fadump_crash_info_header *fdh;
+ int rc = 0;
+
+ if (!fdm_active || !fadump_conf->fadumphdr_addr)
+ return -EINVAL;
+
+ /* Check if the dump data is valid. */
+ if ((be16_to_cpu(fdm_active->header.dump_status_flag) ==
+ RTAS_FADUMP_ERROR_FLAG) ||
+ (fdm_active->cpu_state_data.error_flags != 0) ||
+ (fdm_active->rmr_region.error_flags != 0)) {
+ pr_err("Dump taken by platform is not valid\n");
+ return -EINVAL;
+ }
+ if ((fdm_active->rmr_region.bytes_dumped !=
+ fdm_active->rmr_region.source_len) ||
+ !fdm_active->cpu_state_data.bytes_dumped) {
+ pr_err("Dump taken by platform is incomplete\n");
+ return -EINVAL;
+ }
+
+ /* Validate the fadump crash info header */
+ fdh = __va(fadump_conf->fadumphdr_addr);
+ if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+ pr_err("Crash info header is not valid.\n");
+ return -EINVAL;
+ }
+
+ rc = rtas_fadump_build_cpu_notes(fadump_conf);
+ if (rc)
+ return rc;
+
+ /*
+ * We are done validating dump info and elfcore header is now ready
+ * to be exported. set elfcorehdr_addr so that vmcore module will
+ * export the elfcore header through '/proc/vmcore'.
+ */
+ elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+ return 0;
+}
+
+static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
+ struct seq_file *m)
+{
+ const struct rtas_fadump_section *cpu_data_section;
+ const struct rtas_fadump_mem_struct *fdm_ptr;
+
+ if (fdm_active)
+ fdm_ptr = fdm_active;
+ else
+ fdm_ptr = &fdm;
+
+ cpu_data_section = &(fdm_ptr->cpu_state_data);
+ seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
+ be64_to_cpu(cpu_data_section->destination_address),
+ be64_to_cpu(cpu_data_section->destination_address) +
+ be64_to_cpu(cpu_data_section->source_len) - 1,
+ be64_to_cpu(cpu_data_section->source_len),
+ be64_to_cpu(cpu_data_section->bytes_dumped));
+
+ seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
+ be64_to_cpu(fdm_ptr->hpte_region.destination_address),
+ be64_to_cpu(fdm_ptr->hpte_region.destination_address) +
+ be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1,
+ be64_to_cpu(fdm_ptr->hpte_region.source_len),
+ be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
+
+ seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+ be64_to_cpu(fdm_ptr->rmr_region.source_address),
+ be64_to_cpu(fdm_ptr->rmr_region.destination_address));
+ seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
+ be64_to_cpu(fdm_ptr->rmr_region.source_len),
+ be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
+
+ /* Dump is active. Show reserved area start address. */
+ if (fdm_active) {
+ seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+ fadump_conf->reserve_dump_area_start);
+ }
+}
+
+static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,
+ const char *msg)
+{
+ /* Call ibm,os-term rtas call to trigger firmware assisted dump */
+ rtas_os_term((char *)msg);
+}
+
+static struct fadump_ops rtas_fadump_ops = {
+ .fadump_init_mem_struct = rtas_fadump_init_mem_struct,
+ .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min,
+ .fadump_register = rtas_fadump_register,
+ .fadump_unregister = rtas_fadump_unregister,
+ .fadump_invalidate = rtas_fadump_invalidate,
+ .fadump_process = rtas_fadump_process,
+ .fadump_region_show = rtas_fadump_region_show,
+ .fadump_trigger = rtas_fadump_trigger,
+};
+
+void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ int i, size, num_sections;
+ const __be32 *sections;
+ const __be32 *token;
+
+ /*
+ * Check if Firmware Assisted dump is supported. if yes, check
+ * if dump has been initiated on last reboot.
+ */
+ token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+ if (!token)
+ return;
+
+ fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token);
+ fadump_conf->ops = &rtas_fadump_ops;
+ fadump_conf->fadump_supported = 1;
+
+ /* Firmware supports 64-bit value for size, align it to pagesize. */
+ fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE);
+
+ /*
+ * The 'ibm,kernel-dump' rtas node is present only if there is
+ * dump data waiting for us.
+ */
+ fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+ if (fdm_active) {
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+ rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active));
+ }
+
+ /* Get the sizes required to store dump data for the firmware provided
+ * dump sections.
+	 * For each dump section type supported, there is a 32-bit cell that
+	 * defines the ID of the section, followed by two 32-bit cells that
+	 * give the size of the section in bytes.
+ */
+ sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+ &size);
+
+ if (!sections)
+ return;
+
+ num_sections = size / (3 * sizeof(u32));
+
+ for (i = 0; i < num_sections; i++, sections += 3) {
+ u32 type = (u32)of_read_number(sections, 1);
+
+ switch (type) {
+ case RTAS_FADUMP_CPU_STATE_DATA:
+ fadump_conf->cpu_state_data_size =
+ of_read_ulong(&sections[1], 2);
+ break;
+ case RTAS_FADUMP_HPTE_REGION:
+ fadump_conf->hpte_region_size =
+ of_read_ulong(&sections[1], 2);
+ break;
+ }
+ }
+}
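
rtas_fadump_gpr_index() above recovers the register number from an 8-byte ASCII identifier such as "GPR05" so the value can be stored in the right pt_regs slot. A hedged user-space sketch of the same idea (simplified parsing; not the kernel helper):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Hedged sketch: pull the trailing digits out of an ASCII register id packed
 * into the high-order bytes of a u64, e.g. "GPR05" -> 5. Returns -1 if the
 * id is not a GPR. */
static int gpr_index(uint64_t id)
{
	char buf[9];
	int i;

	for (i = 0; i < 8; i++)
		buf[i] = (id >> (56 - 8 * i)) & 0xff;
	buf[8] = '\0';

	if (strncmp(buf, "GPR", 3) != 0)
		return -1;
	return atoi(buf + 3);
}

int main(void)
{
	uint64_t id = 0x4750523035000000ULL;	/* "GPR05" padded with NULs */

	printf("index = %d\n", gpr_index(id));	/* prints 5 */
	return 0;
}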
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h
new file mode 100644
index 000000000000..fd59bd7ca9c3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWERVM platform.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _PSERIES_RTAS_FADUMP_H
+#define _PSERIES_RTAS_FADUMP_H
+
+/*
+ * On some Power systems where the RMO is 128MB, the kernel still requires a
+ * minimum of 256MB to boot successfully. When the kdump infrastructure is
+ * configured to save the vmcore over the network, we run into OOM issues
+ * while loading modules related to network setup. Hence we need an
+ * additional 64MB of memory to avoid OOM issues.
+ */
+#define RTAS_FADUMP_MIN_BOOT_MEM ((0x1UL << 28) + (0x1UL << 26))
+
+/* Firmware provided dump sections */
+#define RTAS_FADUMP_CPU_STATE_DATA 0x0001
+#define RTAS_FADUMP_HPTE_REGION 0x0002
+#define RTAS_FADUMP_REAL_MODE_REGION 0x0011
+
+/* Dump request flag */
+#define RTAS_FADUMP_REQUEST_FLAG 0x00000001
+
+/* Dump status flag */
+#define RTAS_FADUMP_ERROR_FLAG 0x2000
+
+/* Kernel Dump section info */
+struct rtas_fadump_section {
+ __be32 request_flag;
+ __be16 source_data_type;
+ __be16 error_flags;
+ __be64 source_address;
+ __be64 source_len;
+ __be64 bytes_dumped;
+ __be64 destination_address;
+};
+
+/* ibm,configure-kernel-dump header. */
+struct rtas_fadump_section_header {
+ __be32 dump_format_version;
+ __be16 dump_num_sections;
+ __be16 dump_status_flag;
+ __be32 offset_first_dump_section;
+
+ /* Fields for disk dump option. */
+ __be32 dd_block_size;
+ __be64 dd_block_offset;
+ __be64 dd_num_blocks;
+ __be32 dd_offset_disk_path;
+
+ /* Maximum time allowed to prevent an automatic dump-reboot. */
+ __be32 max_time_auto;
+};
+
+/*
+ * Firmware Assisted dump memory structure. This structure is required for
+ * registering future kernel dump with power firmware through rtas call.
+ *
+ * No disk dump option. Hence disk dump path string section is not included.
+ */
+struct rtas_fadump_mem_struct {
+ struct rtas_fadump_section_header header;
+
+ /* Kernel dump sections */
+ struct rtas_fadump_section cpu_state_data;
+ struct rtas_fadump_section hpte_region;
+
+ /*
+ * TODO: Extend multiple boot memory regions support in the kernel
+ * for this platform.
+ */
+ struct rtas_fadump_section rmr_region;
+};
+
+/*
+ * The firmware-assisted dump format.
+ *
+ * The register save area is an area in the partition's memory used to preserve
+ * the register contents (CPU state data) for the active CPUs during a firmware
+ * assisted dump. The dump format contains register save area header followed
+ * by register entries. Each list of registers for a CPU starts with "CPUSTRT"
+ * and ends with "CPUEND".
+ */
+
+/* Register save area header. */
+struct rtas_fadump_reg_save_area_header {
+ __be64 magic_number;
+ __be32 version;
+ __be32 num_cpu_offset;
+};
+
+/* Register entry. */
+struct rtas_fadump_reg_entry {
+ __be64 reg_id;
+ __be64 reg_value;
+};
+
+/* Utility macros */
+#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry) \
+({ \
+ while (be64_to_cpu(reg_entry->reg_id) != \
+ fadump_str_to_u64("CPUEND")) \
+ reg_entry++; \
+ reg_entry++; \
+})
+
+#define RTAS_FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
+
+#endif /* _PSERIES_RTAS_FADUMP_H */
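
The CPUSTRT/CPUEND/REGSAVE markers compared throughout this code are 8-byte ASCII strings packed into a u64 (via fadump_str_to_u64() in the fadump core). A hedged sketch of that packing, assuming left-justified characters with zero padding in the low bytes:

#include <stdio.h>
#include <stdint.h>

/* Hedged sketch: pack up to eight ASCII characters into the high-order bytes
 * of a u64, which is how markers such as "REGSAVE" and "CPUEND" can be
 * compared against the big-endian register ids in the dump. Illustrative
 * only, not the kernel's implementation. */
static uint64_t str_to_u64(const char *str)
{
	uint64_t val = 0;
	int i;

	for (i = 0; i < 8 && str[i]; i++)
		val |= (uint64_t)(unsigned char)str[i] << (56 - 8 * i);
	return val;
}

int main(void)
{
	printf("CPUEND  = 0x%016llx\n", (unsigned long long)str_to_u64("CPUEND"));
	printf("REGSAVE = 0x%016llx\n", (unsigned long long)str_to_u64("REGSAVE"));
	return 0;
}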
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 8fa012a65a71..f8adcd0e4589 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -38,6 +38,7 @@
#include <linux/of.h>
#include <linux/of_pci.h>
#include <linux/memblock.h>
+#include <linux/swiotlb.h>
#include <asm/mmu.h>
#include <asm/processor.h>
@@ -67,6 +68,8 @@
#include <asm/isa-bridge.h>
#include <asm/security_features.h>
#include <asm/asm-const.h>
+#include <asm/swiotlb.h>
+#include <asm/svm.h>
#include "pseries.h"
#include "../../../../drivers/pci/pci.h"
@@ -139,17 +142,19 @@ static void __init fwnmi_init(void)
}
#ifdef CONFIG_PPC_BOOK3S_64
- /* Allocate per cpu slb area to save old slb contents during MCE */
- size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
- slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry),
- MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
- NUMA_NO_NODE);
- if (!slb_ptr)
- panic("Failed to allocate %zu bytes below %pa for slb area\n",
- size, &ppc64_rma_size);
-
- for_each_possible_cpu(i)
- paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
+ if (!radix_enabled()) {
+ /* Allocate per cpu area to save old slb contents during MCE */
+ size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
+ slb_ptr = memblock_alloc_try_nid_raw(size,
+ sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!slb_ptr)
+ panic("Failed to allocate %zu bytes below %pa for slb area\n",
+ size, &ppc64_rma_size);
+
+ for_each_possible_cpu(i)
+ paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
+ }
#endif
}
@@ -273,46 +278,16 @@ struct kmem_cache *dtl_cache;
*/
static int alloc_dispatch_logs(void)
{
- int cpu, ret;
- struct paca_struct *pp;
- struct dtl_entry *dtl;
-
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return 0;
if (!dtl_cache)
return 0;
- for_each_possible_cpu(cpu) {
- pp = paca_ptrs[cpu];
- dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
- if (!dtl) {
- pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
- cpu);
- pr_warn("Stolen time statistics will be unreliable\n");
- break;
- }
-
- pp->dtl_ridx = 0;
- pp->dispatch_log = dtl;
- pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
- pp->dtl_curr = dtl;
- }
+ alloc_dtl_buffers(0);
/* Register the DTL for the current (boot) cpu */
- dtl = get_paca()->dispatch_log;
- get_paca()->dtl_ridx = 0;
- get_paca()->dtl_curr = dtl;
- get_paca()->lppaca_ptr->dtl_idx = 0;
-
- /* hypervisor reads buffer length from this field */
- dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
- ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
- if (ret)
- pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
- "with %d\n", smp_processor_id(),
- hard_smp_processor_id(), ret);
- get_paca()->lppaca_ptr->dtl_enable_mask = 2;
+ register_dtl_buffer(smp_processor_id());
return 0;
}
@@ -325,8 +300,10 @@ static inline int alloc_dispatch_logs(void)
static int alloc_dispatch_log_kmem_cache(void)
{
+ void (*ctor)(void *) = get_dtl_cache_ctor();
+
dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
- DISPATCH_LOG_BYTES, 0, NULL);
+ DISPATCH_LOG_BYTES, 0, ctor);
if (!dtl_cache) {
pr_warn("Failed to create dispatch trace log buffer cache\n");
pr_warn("Stolen time statistics will be unreliable\n");
@@ -344,6 +321,9 @@ static void pseries_lpar_idle(void)
* low power mode by ceding processor to hypervisor
*/
+ if (!prep_irq_for_idle())
+ return;
+
/* Indicate to hypervisor that we are idle. */
get_lppaca()->idle = 1;
@@ -793,6 +773,9 @@ static void __init pSeries_setup_arch(void)
}
ppc_md.pcibios_root_bridge_prepare = pseries_root_bridge_prepare;
+
+ if (swiotlb_force == SWIOTLB_FORCE)
+ ppc_swiotlb_enable = 1;
}
static void pseries_panic(char *str)
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 4b3ef8d9c63f..ad61e90032da 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -41,6 +41,7 @@
#include <asm/dbell.h>
#include <asm/plpar_wrappers.h>
#include <asm/code-patching.h>
+#include <asm/svm.h>
#include "pseries.h"
#include "offline_states.h"
@@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void)
{
xics_smp_probe();
- if (cpu_has_feature(CPU_FTR_DBELL))
+ if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest())
smp_ops->cause_ipi = smp_pseries_cause_ipi;
else
smp_ops->cause_ipi = icp_ops->cause_ipi;
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
new file mode 100644
index 000000000000..40c0637203d5
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Secure VM platform
+ *
+ * Copyright 2018 IBM Corporation
+ * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/svm.h>
+#include <asm/swiotlb.h>
+#include <asm/ultravisor.h>
+
+static int __init init_svm(void)
+{
+ if (!is_secure_guest())
+ return 0;
+
+ /* Don't release the SWIOTLB buffer. */
+ ppc_swiotlb_enable = 1;
+
+ /*
+ * Since the guest memory is inaccessible to the host, devices always
+ * need to use the SWIOTLB buffer for DMA even if dma_capable() says
+ * otherwise.
+ */
+ swiotlb_force = SWIOTLB_FORCE;
+
+ /* Share the SWIOTLB buffer with the host. */
+ swiotlb_update_mem_attributes();
+
+ return 0;
+}
+machine_early_initcall(pseries, init_svm);
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+ if (!PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ uv_unshare_page(PHYS_PFN(__pa(addr)), numpages);
+
+ return 0;
+}
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+ if (!PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ uv_share_page(PHYS_PFN(__pa(addr)), numpages);
+
+ return 0;
+}
+
+/* There's one dispatch log per CPU. */
+#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
+
+static struct page *dtl_page_store[NR_DTL_PAGE];
+static long dtl_nr_pages;
+
+static bool is_dtl_page_shared(struct page *page)
+{
+ long i;
+
+ for (i = 0; i < dtl_nr_pages; i++)
+ if (dtl_page_store[i] == page)
+ return true;
+
+ return false;
+}
+
+void dtl_cache_ctor(void *addr)
+{
+ unsigned long pfn = PHYS_PFN(__pa(addr));
+ struct page *page = pfn_to_page(pfn);
+
+ if (!is_dtl_page_shared(page)) {
+ dtl_page_store[dtl_nr_pages] = page;
+ dtl_nr_pages++;
+ WARN_ON(dtl_nr_pages >= NR_DTL_PAGE);
+ uv_share_page(pfn, 1);
+ }
+}
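
dtl_cache_ctor() above shares each dispatch-trace-log page with the ultravisor at most once by remembering which pages have already been handled. A small user-space sketch of that "share once" bookkeeping (share_page() is a made-up stand-in, not uv_share_page()):

#include <stdio.h>
#include <stddef.h>

#define MAX_TRACKED 16

/* Hedged sketch of the dtl_page_store[] bookkeeping above: remember which
 * pages were already shared so a page backing several objects is only
 * shared once. */
static const void *shared[MAX_TRACKED];
static size_t nr_shared;

static void share_page(const void *page)
{
	printf("sharing page %p\n", (void *)page);
}

static void share_once(const void *page)
{
	size_t i;

	for (i = 0; i < nr_shared; i++)
		if (shared[i] == page)
			return;		/* already shared, nothing to do */

	if (nr_shared < MAX_TRACKED)
		shared[nr_shared++] = page;
	share_page(page);
}

int main(void)
{
	int object;

	share_once(&object);
	share_once(&object);	/* no output: second call is a no-op */
	return 0;
}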
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index ba758f4be328..79e2287991db 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -520,7 +520,7 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))))
goto out_fail;
- ret = iommu_map_page(dev, tbl, page, offset, size, device_to_mask(dev),
+ ret = iommu_map_page(dev, tbl, page, offset, size, dma_get_mask(dev),
direction, attrs);
if (unlikely(ret == DMA_MAPPING_ERROR))
goto out_deallocate;
@@ -560,7 +560,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
if (vio_cmo_alloc(viodev, alloc_size))
goto out_fail;
- ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, device_to_mask(dev),
+ ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, dma_get_mask(dev),
direction, attrs);
if (unlikely(!ret))
goto out_deallocate;
@@ -605,6 +605,8 @@ static const struct dma_map_ops vio_dma_mapping_ops = {
.unmap_page = vio_dma_iommu_unmap_page,
.dma_supported = dma_iommu_dma_supported,
.get_required_mask = dma_iommu_get_required_mask,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
};
/**
@@ -1191,7 +1193,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
else
tbl->it_ops = &iommu_table_pseries_ops;
- return iommu_init_table(tbl, -1);
+ return iommu_init_table(tbl, -1, 0, 0);
}
/**
diff --git a/arch/powerpc/platforms/pseries/vphn.c b/arch/powerpc/platforms/pseries/vphn.c
new file mode 100644
index 000000000000..3f07bf6c670e
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/vphn.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/byteorder.h>
+#include <asm/lppaca.h>
+
+/*
+ * The associativity domain numbers are returned from the hypervisor as a
+ * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the
+ * special value of "all ones" (aka. 0xffff) and its size may not exceed 48
+ * bytes.
+ *
+ * --- 16-bit fields -->
+ * _________________________
+ * | 0 | 1 | 2 | 3 | be_packed[0]
+ * ------+-----+-----+------
+ * _________________________
+ * | 4 | 5 | 6 | 7 | be_packed[1]
+ * -------------------------
+ * ...
+ * _________________________
+ * | 20 | 21 | 22 | 23 | be_packed[5]
+ * -------------------------
+ *
+ * Convert to the sequence they would appear in the ibm,associativity property.
+ */
+static int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
+{
+ __be64 be_packed[VPHN_REGISTER_COUNT];
+ int i, nr_assoc_doms = 0;
+ const __be16 *field = (const __be16 *) be_packed;
+ u16 last = 0;
+ bool is_32bit = false;
+
+#define VPHN_FIELD_UNUSED (0xffff)
+#define VPHN_FIELD_MSB (0x8000)
+#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
+
+ /* Let's fix the values returned by plpar_hcall9() */
+ for (i = 0; i < VPHN_REGISTER_COUNT; i++)
+ be_packed[i] = cpu_to_be64(packed[i]);
+
+ for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
+ u16 new = be16_to_cpup(field++);
+
+ if (is_32bit) {
+ /*
+ * Let's concatenate the 16 bits of this field to the
+ * 15 lower bits of the previous field
+ */
+ unpacked[++nr_assoc_doms] =
+ cpu_to_be32(last << 16 | new);
+ is_32bit = false;
+ } else if (new == VPHN_FIELD_UNUSED)
+ /* This is the list terminator */
+ break;
+ else if (new & VPHN_FIELD_MSB) {
+ /* Data is in the lower 15 bits of this field */
+ unpacked[++nr_assoc_doms] =
+ cpu_to_be32(new & VPHN_FIELD_MASK);
+ } else {
+ /*
+ * Data is in the lower 15 bits of this field
+ * concatenated with the next 16 bit field
+ */
+ last = new;
+ is_32bit = true;
+ }
+ }
+
+ /* The first cell contains the length of the property */
+ unpacked[0] = cpu_to_be32(nr_assoc_doms);
+
+ return nr_assoc_doms;
+}
+
+/* NOTE: This file is included by a selftest and built in userspace. */
+#ifdef __KERNEL__
+#include <asm/hvcall.h>
+
+long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity)
+{
+ long rc;
+ long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
+
+ rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, cpu);
+ vphn_unpack_associativity(retbuf, associativity);
+
+ return rc;
+}
+#endif
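
As a worked example of the packing described in vphn_unpack_associativity() above: a 32-bit domain number is split across two consecutive 16-bit fields, the first with its MSB clear, and the loop concatenates them. Illustrative values only:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hedged sketch: reassemble one 32-bit associativity domain from two
	 * 16-bit VPHN fields, as the unpacking loop above does. */
	uint16_t hi = 0x0001;	/* MSB clear: starts a 32-bit value */
	uint16_t lo = 0x2345;	/* the following 16-bit field */
	uint32_t domain = ((uint32_t)hi << 16) | lo;

	printf("domain = 0x%08x\n", domain);	/* 0x00012345 */
	return 0;
}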