aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/platforms
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms')
-rw-r--r--arch/powerpc/platforms/44x/Kconfig8
-rw-r--r--arch/powerpc/platforms/4xx/Makefile1
-rw-r--r--arch/powerpc/platforms/4xx/ocm.c390
-rw-r--r--arch/powerpc/platforms/Kconfig3
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype16
-rw-r--r--arch/powerpc/platforms/cell/iommu.c2
-rw-r--r--arch/powerpc/platforms/cell/spufs/inode.c208
-rw-r--r--arch/powerpc/platforms/pasemi/iommu.c2
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig5
-rw-r--r--arch/powerpc/platforms/powernv/Makefile6
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c97
-rw-r--r--arch/powerpc/platforms/powernv/idle.c6
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c101
-rw-r--r--arch/powerpc/platforms/powernv/opal-call.c5
-rw-r--r--arch/powerpc/platforms/powernv/opal-core.c636
-rw-r--r--arch/powerpc/platforms/powernv/opal-fadump.c716
-rw-r--r--arch/powerpc/platforms/powernv/opal-fadump.h146
-rw-r--r--arch/powerpc/platforms/powernv/opal-imc.c12
-rw-r--r--arch/powerpc/platforms/powernv/opal-msglog.c57
-rw-r--r--arch/powerpc/platforms/powernv/opal-prd.c8
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c213
-rw-r--r--arch/powerpc/platforms/powernv/opal.c42
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda-tce.c38
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c98
-rw-r--r--arch/powerpc/platforms/powernv/pci.c3
-rw-r--r--arch/powerpc/platforms/powernv/pci.h2
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h5
-rw-r--r--arch/powerpc/platforms/powernv/setup.c9
-rw-r--r--arch/powerpc/platforms/powernv/smp.c2
-rw-r--r--arch/powerpc/platforms/powernv/ultravisor.c69
-rw-r--r--arch/powerpc/platforms/ps3/spu.c10
-rw-r--r--arch/powerpc/platforms/ps3/system-bus.c11
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig14
-rw-r--r--arch/powerpc/platforms/pseries/Makefile2
-rw-r--r--arch/powerpc/platforms/pseries/eeh_pseries.c68
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c26
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c24
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c186
-rw-r--r--arch/powerpc/platforms/pseries/mobility.c9
-rw-r--r--arch/powerpc/platforms/pseries/papr_scm.c72
-rw-r--r--arch/powerpc/platforms/pseries/pci.c3
-rw-r--r--arch/powerpc/platforms/pseries/pseries.h1
-rw-r--r--arch/powerpc/platforms/pseries/ras.c460
-rw-r--r--arch/powerpc/platforms/pseries/rtas-fadump.c550
-rw-r--r--arch/powerpc/platforms/pseries/rtas-fadump.h114
-rw-r--r--arch/powerpc/platforms/pseries/setup.c33
-rw-r--r--arch/powerpc/platforms/pseries/smp.c3
-rw-r--r--arch/powerpc/platforms/pseries/svm.c85
-rw-r--r--arch/powerpc/platforms/pseries/vio.c4
49 files changed, 3396 insertions, 1185 deletions
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index b369ed4e3675..25ebe634a661 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -272,14 +272,6 @@ config PPC4xx_GPIO
help
Enable gpiolib support for ppc440 based boards
-config PPC4xx_OCM
- bool "PPC4xx On Chip Memory (OCM) support"
- depends on 4xx
- select PPC_LIB_RHEAP
- help
- Enable OCM support for PowerPC 4xx platforms with on chip memory,
- OCM provides the fast place for memory access to improve performance.
-
# 44x specific CPU modules, selected based on the board above.
config 440EP
bool
diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile
index f5ae27ca131b..d009d2e0b9e8 100644
--- a/arch/powerpc/platforms/4xx/Makefile
+++ b/arch/powerpc/platforms/4xx/Makefile
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y += uic.o machine_check.o
-obj-$(CONFIG_PPC4xx_OCM) += ocm.o
obj-$(CONFIG_4xx_SOC) += soc.o
obj-$(CONFIG_PCI) += pci.o
obj-$(CONFIG_PPC4xx_HSTA_MSI) += hsta_msi.o
diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c
deleted file mode 100644
index ba3257406ced..000000000000
--- a/arch/powerpc/platforms/4xx/ocm.c
+++ /dev/null
@@ -1,390 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * PowerPC 4xx OCM memory allocation support
- *
- * (C) Copyright 2009, Applied Micro Circuits Corporation
- * Victor Gallardo (vgallardo@amcc.com)
- *
- * See file CREDITS for list of people who contributed to this
- * project.
- */
-
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <asm/rheap.h>
-#include <asm/ppc4xx_ocm.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-
-#define OCM_DISABLED 0
-#define OCM_ENABLED 1
-
-struct ocm_block {
- struct list_head list;
- void __iomem *addr;
- int size;
- const char *owner;
-};
-
-/* non-cached or cached region */
-struct ocm_region {
- phys_addr_t phys;
- void __iomem *virt;
-
- int memtotal;
- int memfree;
-
- rh_info_t *rh;
- struct list_head list;
-};
-
-struct ocm_info {
- int index;
- int status;
- int ready;
-
- phys_addr_t phys;
-
- int alignment;
- int memtotal;
- int cache_size;
-
- struct ocm_region nc; /* non-cached region */
- struct ocm_region c; /* cached region */
-};
-
-static struct ocm_info *ocm_nodes;
-static int ocm_count;
-
-static struct ocm_info *ocm_get_node(unsigned int index)
-{
- if (index >= ocm_count) {
- printk(KERN_ERR "PPC4XX OCM: invalid index");
- return NULL;
- }
-
- return &ocm_nodes[index];
-}
-
-static int ocm_free_region(struct ocm_region *ocm_reg, const void *addr)
-{
- struct ocm_block *blk, *tmp;
- unsigned long offset;
-
- if (!ocm_reg->virt)
- return 0;
-
- list_for_each_entry_safe(blk, tmp, &ocm_reg->list, list) {
- if (blk->addr == addr) {
- offset = addr - ocm_reg->virt;
- ocm_reg->memfree += blk->size;
- rh_free(ocm_reg->rh, offset);
- list_del(&blk->list);
- kfree(blk);
- return 1;
- }
- }
-
- return 0;
-}
-
-static void __init ocm_init_node(int count, struct device_node *node)
-{
- struct ocm_info *ocm;
-
- const unsigned int *cell_index;
- const unsigned int *cache_size;
- int len;
-
- struct resource rsrc;
-
- ocm = ocm_get_node(count);
-
- cell_index = of_get_property(node, "cell-index", &len);
- if (!cell_index) {
- printk(KERN_ERR "PPC4XX OCM: missing cell-index property");
- return;
- }
- ocm->index = *cell_index;
-
- if (of_device_is_available(node))
- ocm->status = OCM_ENABLED;
-
- cache_size = of_get_property(node, "cached-region-size", &len);
- if (cache_size)
- ocm->cache_size = *cache_size;
-
- if (of_address_to_resource(node, 0, &rsrc)) {
- printk(KERN_ERR "PPC4XX OCM%d: could not get resource address\n",
- ocm->index);
- return;
- }
-
- ocm->phys = rsrc.start;
- ocm->memtotal = (rsrc.end - rsrc.start + 1);
-
- printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (%s)\n",
- ocm->index, ocm->memtotal,
- (ocm->status == OCM_DISABLED) ? "disabled" : "enabled");
-
- if (ocm->status == OCM_DISABLED)
- return;
-
- /* request region */
-
- if (!request_mem_region(ocm->phys, ocm->memtotal, "ppc4xx_ocm")) {
- printk(KERN_ERR "PPC4XX OCM%d: could not request region\n",
- ocm->index);
- return;
- }
-
- /* Configure non-cached and cached regions */
-
- ocm->nc.phys = ocm->phys;
- ocm->nc.memtotal = ocm->memtotal - ocm->cache_size;
- ocm->nc.memfree = ocm->nc.memtotal;
-
- ocm->c.phys = ocm->phys + ocm->nc.memtotal;
- ocm->c.memtotal = ocm->cache_size;
- ocm->c.memfree = ocm->c.memtotal;
-
- if (ocm->nc.memtotal == 0)
- ocm->nc.phys = 0;
-
- if (ocm->c.memtotal == 0)
- ocm->c.phys = 0;
-
- printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (non-cached)\n",
- ocm->index, ocm->nc.memtotal);
-
- printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (cached)\n",
- ocm->index, ocm->c.memtotal);
-
- /* ioremap the non-cached region */
- if (ocm->nc.memtotal) {
- ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal,
- _PAGE_EXEC | pgprot_val(PAGE_KERNEL_NCG));
-
- if (!ocm->nc.virt) {
- printk(KERN_ERR
- "PPC4XX OCM%d: failed to ioremap non-cached memory\n",
- ocm->index);
- ocm->nc.memfree = 0;
- return;
- }
- }
-
- /* ioremap the cached region */
-
- if (ocm->c.memtotal) {
- ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal,
- _PAGE_EXEC | pgprot_val(PAGE_KERNEL));
-
- if (!ocm->c.virt) {
- printk(KERN_ERR
- "PPC4XX OCM%d: failed to ioremap cached memory\n",
- ocm->index);
- ocm->c.memfree = 0;
- return;
- }
- }
-
- /* Create Remote Heaps */
-
- ocm->alignment = 4; /* default 4 byte alignment */
-
- if (ocm->nc.virt) {
- ocm->nc.rh = rh_create(ocm->alignment);
- rh_attach_region(ocm->nc.rh, 0, ocm->nc.memtotal);
- }
-
- if (ocm->c.virt) {
- ocm->c.rh = rh_create(ocm->alignment);
- rh_attach_region(ocm->c.rh, 0, ocm->c.memtotal);
- }
-
- INIT_LIST_HEAD(&ocm->nc.list);
- INIT_LIST_HEAD(&ocm->c.list);
-
- ocm->ready = 1;
-}
-
-static int ocm_debugfs_show(struct seq_file *m, void *v)
-{
- struct ocm_block *blk, *tmp;
- unsigned int i;
-
- for (i = 0; i < ocm_count; i++) {
- struct ocm_info *ocm = ocm_get_node(i);
-
- if (!ocm || !ocm->ready)
- continue;
-
- seq_printf(m, "PPC4XX OCM : %d\n", ocm->index);
- seq_printf(m, "PhysAddr : %pa\n", &(ocm->phys));
- seq_printf(m, "MemTotal : %d Bytes\n", ocm->memtotal);
- seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal);
- seq_printf(m, "MemTotal(C) : %d Bytes\n\n", ocm->c.memtotal);
-
- seq_printf(m, "NC.PhysAddr : %pa\n", &(ocm->nc.phys));
- seq_printf(m, "NC.VirtAddr : 0x%p\n", ocm->nc.virt);
- seq_printf(m, "NC.MemTotal : %d Bytes\n", ocm->nc.memtotal);
- seq_printf(m, "NC.MemFree : %d Bytes\n", ocm->nc.memfree);
-
- list_for_each_entry_safe(blk, tmp, &ocm->nc.list, list) {
- seq_printf(m, "NC.MemUsed : %d Bytes (%s)\n",
- blk->size, blk->owner);
- }
-
- seq_printf(m, "\nC.PhysAddr : %pa\n", &(ocm->c.phys));
- seq_printf(m, "C.VirtAddr : 0x%p\n", ocm->c.virt);
- seq_printf(m, "C.MemTotal : %d Bytes\n", ocm->c.memtotal);
- seq_printf(m, "C.MemFree : %d Bytes\n", ocm->c.memfree);
-
- list_for_each_entry_safe(blk, tmp, &ocm->c.list, list) {
- seq_printf(m, "C.MemUsed : %d Bytes (%s)\n",
- blk->size, blk->owner);
- }
-
- seq_putc(m, '\n');
- }
-
- return 0;
-}
-
-static int ocm_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ocm_debugfs_show, NULL);
-}
-
-static const struct file_operations ocm_debugfs_fops = {
- .open = ocm_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int ocm_debugfs_init(void)
-{
- struct dentry *junk;
-
- junk = debugfs_create_dir("ppc4xx_ocm", 0);
- if (!junk) {
- printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create dir\n");
- return -1;
- }
-
- if (debugfs_create_file("info", 0644, junk, NULL, &ocm_debugfs_fops)) {
- printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create file\n");
- return -1;
- }
-
- return 0;
-}
-
-void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
- int flags, const char *owner)
-{
- void __iomem *addr = NULL;
- unsigned long offset;
- struct ocm_info *ocm;
- struct ocm_region *ocm_reg;
- struct ocm_block *ocm_blk;
- int i;
-
- for (i = 0; i < ocm_count; i++) {
- ocm = ocm_get_node(i);
-
- if (!ocm || !ocm->ready)
- continue;
-
- if (flags == PPC4XX_OCM_NON_CACHED)
- ocm_reg = &ocm->nc;
- else
- ocm_reg = &ocm->c;
-
- if (!ocm_reg->virt)
- continue;
-
- if (align < ocm->alignment)
- align = ocm->alignment;
-
- offset = rh_alloc_align(ocm_reg->rh, size, align, NULL);
-
- if (IS_ERR_VALUE(offset))
- continue;
-
- ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL);
- if (!ocm_blk) {
- rh_free(ocm_reg->rh, offset);
- break;
- }
-
- *phys = ocm_reg->phys + offset;
- addr = ocm_reg->virt + offset;
- size = ALIGN(size, align);
-
- ocm_blk->addr = addr;
- ocm_blk->size = size;
- ocm_blk->owner = owner;
- list_add_tail(&ocm_blk->list, &ocm_reg->list);
-
- ocm_reg->memfree -= size;
-
- break;
- }
-
- return addr;
-}
-
-void ppc4xx_ocm_free(const void *addr)
-{
- int i;
-
- if (!addr)
- return;
-
- for (i = 0; i < ocm_count; i++) {
- struct ocm_info *ocm = ocm_get_node(i);
-
- if (!ocm || !ocm->ready)
- continue;
-
- if (ocm_free_region(&ocm->nc, addr) ||
- ocm_free_region(&ocm->c, addr))
- return;
- }
-}
-
-static int __init ppc4xx_ocm_init(void)
-{
- struct device_node *np;
- int count;
-
- count = 0;
- for_each_compatible_node(np, NULL, "ibm,ocm")
- count++;
-
- if (!count)
- return 0;
-
- ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL);
- if (!ocm_nodes)
- return -ENOMEM;
-
- ocm_count = count;
- count = 0;
-
- for_each_compatible_node(np, NULL, "ibm,ocm") {
- ocm_init_node(count, np);
- count++;
- }
-
- ocm_debugfs_init();
-
- return 0;
-}
-
-arch_initcall(ppc4xx_ocm_init);
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index f3fb79fccc72..d82e3664ffdf 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -197,7 +197,8 @@ endmenu
config PPC601_SYNC_FIX
bool "Workarounds for PPC601 bugs"
- depends on PPC_BOOK3S_32 && PPC_PMAC
+ depends on PPC_BOOK3S_601 && PPC_PMAC
+ default y
help
Some versions of the PPC601 (the first PowerPC chip) have bugs which
mean that extra synchronization instructions are required near
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 56a7c814160d..12543e53fa96 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -6,6 +6,9 @@ config PPC64
This option selects whether a 32-bit or a 64-bit kernel
will be built.
+config PPC_BOOK3S_32
+ bool
+
menu "Processor support"
choice
prompt "Processor Type"
@@ -21,13 +24,20 @@ choice
If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx.
-config PPC_BOOK3S_32
- bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx"
+config PPC_BOOK3S_6xx
+ bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601"
+ select PPC_BOOK3S_32
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
+config PPC_BOOK3S_601
+ bool "PowerPC 601"
+ select PPC_BOOK3S_32
+ select PPC_FPU
+ select PPC_HAVE_KUAP
+
config PPC_85xx
bool "Freescale 85xx"
select E500
@@ -450,8 +460,10 @@ config NOT_COHERENT_CACHE
depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \
GAMECUBE_COMMON || AMIGAONE
select ARCH_HAS_DMA_COHERENT_TO_PFN
+ select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_HAS_SYNC_DMA_FOR_CPU
+ select DMA_DIRECT_REMAP
default n if PPC_47x
default y
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 16dfee29aa41..ca9ffc1c8685 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
window->table.it_size = size >> window->table.it_page_shift;
window->table.it_ops = &cell_iommu_ops;
- iommu_init_table(&window->table, iommu->nid);
+ iommu_init_table(&window->table, iommu->nid, 0, 0);
pr_debug("\tioid %d\n", window->ioid);
pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 065ff14b76e1..2dd452a047cd 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -10,6 +10,8 @@
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/fsnotify.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
@@ -20,7 +22,6 @@
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/slab.h>
-#include <linux/parser.h>
#include <asm/prom.h>
#include <asm/spu.h>
@@ -30,7 +31,7 @@
#include "spufs.h"
struct spufs_sb_info {
- int debug;
+ bool debug;
};
static struct kmem_cache *spufs_inode_cache;
@@ -574,16 +575,27 @@ long spufs_create(struct path *path, struct dentry *dentry,
}
/* File system initialization */
+struct spufs_fs_context {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+};
+
enum {
- Opt_uid, Opt_gid, Opt_mode, Opt_debug, Opt_err,
+ Opt_uid, Opt_gid, Opt_mode, Opt_debug,
+};
+
+static const struct fs_parameter_spec spufs_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("debug", Opt_debug),
+ {}
};
-static const match_table_t spufs_tokens = {
- { Opt_uid, "uid=%d" },
- { Opt_gid, "gid=%d" },
- { Opt_mode, "mode=%o" },
- { Opt_debug, "debug" },
- { Opt_err, NULL },
+static const struct fs_parameter_description spufs_fs_parameters = {
+ .name = "spufs",
+ .specs = spufs_param_specs,
};
static int spufs_show_options(struct seq_file *m, struct dentry *root)
@@ -604,47 +616,41 @@ static int spufs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static int
-spufs_parse_options(struct super_block *sb, char *options, struct inode *root)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
-
- if (!*p)
- continue;
-
- token = match_token(p, spufs_tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return 0;
- root->i_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(root->i_uid))
- return 0;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- root->i_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(root->i_gid))
- return 0;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return 0;
- root->i_mode = option | S_IFDIR;
- break;
- case Opt_debug:
- spufs_get_sb_info(sb)->debug = 1;
- break;
- default:
- return 0;
- }
+static int spufs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct spufs_fs_context *ctx = fc->fs_private;
+ struct spufs_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
+ int opt;
+
+ opt = fs_parse(fc, &spufs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return invalf(fc, "Unknown uid");
+ ctx->uid = uid;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return invalf(fc, "Unknown gid");
+ ctx->gid = gid;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_debug:
+ sbi->debug = true;
+ break;
}
- return 1;
+
+ return 0;
}
static void spufs_exit_isolated_loader(void)
@@ -678,79 +684,99 @@ spufs_init_isolated_loader(void)
printk(KERN_INFO "spufs: SPU isolation mode enabled\n");
}
-static int
-spufs_create_root(struct super_block *sb, void *data)
+static int spufs_create_root(struct super_block *sb, struct fs_context *fc)
{
+ struct spufs_fs_context *ctx = fc->fs_private;
struct inode *inode;
- int ret;
- ret = -ENODEV;
if (!spu_management_ops)
- goto out;
+ return -ENODEV;
- ret = -ENOMEM;
- inode = spufs_new_inode(sb, S_IFDIR | 0775);
+ inode = spufs_new_inode(sb, S_IFDIR | ctx->mode);
if (!inode)
- goto out;
+ return -ENOMEM;
+ inode->i_uid = ctx->uid;
+ inode->i_gid = ctx->gid;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
SPUFS_I(inode)->i_ctx = NULL;
inc_nlink(inode);
- ret = -EINVAL;
- if (!spufs_parse_options(sb, data, inode))
- goto out_iput;
-
- ret = -ENOMEM;
sb->s_root = d_make_root(inode);
if (!sb->s_root)
- goto out;
-
+ return -ENOMEM;
return 0;
-out_iput:
- iput(inode);
-out:
- return ret;
}
-static int
-spufs_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct spufs_sb_info *info;
- static const struct super_operations s_ops = {
- .alloc_inode = spufs_alloc_inode,
- .free_inode = spufs_free_inode,
- .statfs = simple_statfs,
- .evict_inode = spufs_evict_inode,
- .show_options = spufs_show_options,
- };
-
- info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
- return -ENOMEM;
+static const struct super_operations spufs_ops = {
+ .alloc_inode = spufs_alloc_inode,
+ .free_inode = spufs_free_inode,
+ .statfs = simple_statfs,
+ .evict_inode = spufs_evict_inode,
+ .show_options = spufs_show_options,
+};
+static int spufs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = SPUFS_MAGIC;
- sb->s_op = &s_ops;
- sb->s_fs_info = info;
+ sb->s_op = &spufs_ops;
- return spufs_create_root(sb, data);
+ return spufs_create_root(sb, fc);
+}
+
+static int spufs_get_tree(struct fs_context *fc)
+{
+ return get_tree_single(fc, spufs_fill_super);
}
-static struct dentry *
-spufs_mount(struct file_system_type *fstype, int flags,
- const char *name, void *data)
+static void spufs_free_fc(struct fs_context *fc)
{
- return mount_single(fstype, flags, data, spufs_fill_super);
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations spufs_context_ops = {
+ .free = spufs_free_fc,
+ .parse_param = spufs_parse_param,
+ .get_tree = spufs_get_tree,
+};
+
+static int spufs_init_fs_context(struct fs_context *fc)
+{
+ struct spufs_fs_context *ctx;
+ struct spufs_sb_info *sbi;
+
+ ctx = kzalloc(sizeof(struct spufs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ goto nomem;
+
+ sbi = kzalloc(sizeof(struct spufs_sb_info), GFP_KERNEL);
+ if (!sbi)
+ goto nomem_ctx;
+
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->mode = 0755;
+
+ fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
+ fc->ops = &spufs_context_ops;
+ return 0;
+
+nomem_ctx:
+ kfree(ctx);
+nomem:
+ return -ENOMEM;
}
static struct file_system_type spufs_type = {
.owner = THIS_MODULE,
.name = "spufs",
- .mount = spufs_mount,
+ .init_fs_context = spufs_init_fs_context,
+ .parameters = &spufs_fs_parameters,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("spufs");
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 77fee09104f8..b500a6e47e6b 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void)
*/
iommu_table_iobmap.it_blocksize = 4;
iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
- iommu_init_table(&iommu_table_iobmap, 0);
+ iommu_init_table(&iommu_table_iobmap, 0, 0, 0);
pr_debug(" <- %s\n", __func__);
}
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 850eee860cf2..938803eab0ad 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -12,7 +12,6 @@ config PPC_POWERNV
select EPAPR_BOOT
select PPC_INDIRECT_PIO
select PPC_UDBG_16550
- select PPC_SCOM
select ARCH_RANDOM
select CPU_FREQ
select PPC_DOORBELL
@@ -47,3 +46,7 @@ config PPC_VAS
VAS adapters are found in POWER9 based systems.
If unsure, say N.
+
+config SCOM_DEBUGFS
+ bool "Expose SCOM controllers via debugfs"
+ depends on DEBUG_FS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index da2e99efbd04..a3ac9646119d 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,15 +4,19 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+obj-y += ultravisor.o
obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_OPAL_CORE) += opal-core.o
obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
obj-$(CONFIG_CXL_BASE) += pci-cxl.o
obj-$(CONFIG_EEH) += eeh-powernv.o
-obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
obj-$(CONFIG_OPAL_PRD) += opal-prd.o
obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
obj-$(CONFIG_OCXL_BASE) += ocxl.o
+obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 620a986209f5..6bc24a47e9ef 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -34,6 +34,7 @@
#include "powernv.h"
#include "pci.h"
+#include "../../../../drivers/pci/pci.h"
static int eeh_event_irq = -EINVAL;
@@ -41,13 +42,10 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
{
struct pci_dn *pdn = pci_get_pdn(pdev);
- if (!pdev->is_virtfn)
+ if (eeh_has_flag(EEH_FORCE_DISABLED))
return;
- /*
- * The following operations will fail if VF's sysfs files
- * aren't created or its resources aren't finalized.
- */
+ dev_dbg(&pdev->dev, "EEH: Setting up device\n");
eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
eeh_sysfs_add_device(pdev);
@@ -199,6 +197,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
#endif /* CONFIG_DEBUG_FS */
+void pnv_eeh_enable_phbs(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+ /*
+ * If EEH is enabled, we're going to rely on that.
+ * Otherwise, we restore to conventional mechanism
+ * to clear frozen PE during PCI config access.
+ */
+ if (eeh_enabled())
+ phb->flags |= PNV_PHB_FLAG_EEH;
+ else
+ phb->flags &= ~PNV_PHB_FLAG_EEH;
+ }
+}
+
/**
* pnv_eeh_post_init - EEH platform dependent post initialization
*
@@ -213,9 +230,7 @@ int pnv_eeh_post_init(void)
struct pnv_phb *phb;
int ret = 0;
- /* Probe devices & build address cache */
- eeh_probe_devices();
- eeh_addr_cache_build();
+ eeh_show_enabled();
/* Register OPAL event notifier */
eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
@@ -237,19 +252,11 @@ int pnv_eeh_post_init(void)
if (!eeh_enabled())
disable_irq(eeh_event_irq);
+ pnv_eeh_enable_phbs();
+
list_for_each_entry(hose, &hose_list, list_node) {
phb = hose->private_data;
- /*
- * If EEH is enabled, we're going to rely on that.
- * Otherwise, we restore to conventional mechanism
- * to clear frozen PE during PCI config access.
- */
- if (eeh_enabled())
- phb->flags |= PNV_PHB_FLAG_EEH;
- else
- phb->flags &= ~PNV_PHB_FLAG_EEH;
-
/* Create debugfs entries */
#ifdef CONFIG_DEBUG_FS
if (phb->has_dbgfs || !phb->dbgfs)
@@ -377,6 +384,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
+ eeh_edev_dbg(edev, "Probing device\n");
+
/* Initialize eeh device */
edev->class_code = pdn->class_code;
edev->mode &= 0xFFFFFF00;
@@ -402,9 +411,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
/* Create PE */
ret = eeh_add_to_parent_pe(edev);
if (ret) {
- pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n",
- __func__, hose->global_number, pdn->busno,
- PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
+ eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
return NULL;
}
@@ -453,11 +460,17 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
* Enable EEH explicitly so that we will do EEH check
* while accessing I/O stuff
*/
- eeh_add_flag(EEH_ENABLED);
+ if (!eeh_has_flag(EEH_ENABLED)) {
+ enable_irq(eeh_event_irq);
+ pnv_eeh_enable_phbs();
+ eeh_add_flag(EEH_ENABLED);
+ }
/* Save memory bars */
eeh_save_bars(edev);
+ eeh_edev_dbg(edev, "EEH enabled on device\n");
+
return NULL;
}
@@ -837,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
int aer = edev ? edev->aer_cap : 0;
u32 ctrl;
- pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+ pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
__func__, pci_domain_nr(dev->bus),
dev->bus->number, option);
@@ -895,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
return __pnv_eeh_bridge_reset(pdev, option);
+ pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
+ __func__, pci_domain_nr(pdev->bus),
+ pdev->bus->number, option);
+
switch (option) {
case EEH_RESET_FUNDAMENTAL:
scope = OPAL_RESET_PCI_FUNDAMENTAL;
@@ -1113,17 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
+ if (pci_is_root_bus(bus))
+ return pnv_eeh_root_reset(hose, option);
+
/*
- * If dealing with the root bus (or the bus underneath the
- * root port), we reset the bus underneath the root port.
+ * For hot resets try use the generic PCI error recovery reset
+ * functions. These correctly handles the case where the secondary
+ * bus is behind a hotplug slot and it will use the slot provided
+ * reset methods to prevent spurious hotplug events during the reset.
*
- * The cxl driver depends on this behaviour for bi-modal card
- * switching.
+ * Fundemental resets need to be handled internally to EEH since the
+ * PCI core doesn't really have a concept of a fundemental reset,
+ * mainly because there's no standard way to generate one. Only a
+ * few devices require an FRESET so it should be fine.
*/
- if (pci_is_root_bus(bus) ||
- pci_is_root_bus(bus->parent))
- return pnv_eeh_root_reset(hose, option);
+ if (option != EEH_RESET_FUNDAMENTAL) {
+ /*
+ * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
+ * de-assert step. It's like the OPAL reset API was
+ * poorly designed or something...
+ */
+ if (option == EEH_RESET_DEACTIVATE)
+ return 0;
+ rc = pci_bus_error_reset(bus->self);
+ if (!rc)
+ return 0;
+ }
+
+ /* otherwise, use the generic bridge reset. this might call into FW */
+ if (pci_is_root_bus(bus->parent))
+ return pnv_eeh_root_reset(hose, option);
return pnv_eeh_bridge_reset(bus->self, option);
}
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 09f49eed7fb8..78599bca66c2 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
sprs.ptcr = mfspr(SPRN_PTCR);
sprs.rpr = mfspr(SPRN_RPR);
sprs.tscr = mfspr(SPRN_TSCR);
- sprs.ldbar = mfspr(SPRN_LDBAR);
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ sprs.ldbar = mfspr(SPRN_LDBAR);
sprs_saved = true;
@@ -789,7 +790,8 @@ core_woken:
mtspr(SPRN_MMCR0, sprs.mmcr0);
mtspr(SPRN_MMCR1, sprs.mmcr1);
mtspr(SPRN_MMCR2, sprs.mmcr2);
- mtspr(SPRN_LDBAR, sprs.ldbar);
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ mtspr(SPRN_LDBAR, sprs.ldbar);
mtspr(SPRN_SPRG3, local_paca->sprg_vdso);
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index c16249d251f1..b95b9e3c4c98 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -89,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
}
EXPORT_SYMBOL(pnv_pci_get_npu_dev);
+#ifdef CONFIG_IOMMU_API
/*
* Returns the PE assoicated with the PCI device of the given
* NPU. Returns the linked pci device if pci_dev != NULL.
@@ -192,106 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
return 0;
}
-/*
- * Enables 32 bit DMA on NPU.
- */
-static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
-{
- struct pci_dev *gpdev;
- struct pnv_ioda_pe *gpe;
- int64_t rc;
-
- /*
- * Find the assoicated PCI devices and get the dma window
- * information from there.
- */
- if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
- return;
-
- gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (!gpe)
- return;
-
- rc = pnv_npu_set_window(&npe->table_group, 0,
- gpe->table_group.tables[0]);
-
- /*
- * NVLink devices use the same TCE table configuration as
- * their parent device so drivers shouldn't be doing DMA
- * operations directly on these devices.
- */
- set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
-}
-
-/*
- * Enables bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicitly enabled or
- * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
- * active at the same time.
- */
-static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
-{
- struct pnv_phb *phb = npe->phb;
- int64_t rc = 0;
- phys_addr_t top = memblock_end_of_DRAM();
-
- if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
- return -EINVAL;
-
- rc = pnv_npu_unset_window(&npe->table_group, 0);
- if (rc != OPAL_SUCCESS)
- return rc;
-
- /* Enable the bypass window */
-
- top = roundup_pow_of_two(top);
- dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
- npe->pe_number);
- rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
- npe->pe_number, npe->pe_number,
- 0 /* bypass base */, top);
-
- if (rc == OPAL_SUCCESS)
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- return rc;
-}
-
-void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
-{
- int i;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- struct pnv_ioda_pe *npe;
- struct pci_dev *npdev;
-
- for (i = 0; ; ++i) {
- npdev = pnv_pci_get_npu_dev(gpdev, i);
-
- if (!npdev)
- break;
-
- pdn = pci_get_pdn(npdev);
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return;
-
- phb = pci_bus_to_host(npdev->bus)->private_data;
-
- /* We only do bypass if it's enabled on the linked device */
- npe = &phb->ioda.pe_array[pdn->pe_number];
-
- if (bypass) {
- dev_info(&npdev->dev,
- "Using 64-bit DMA iommu bypass\n");
- pnv_npu_dma_set_bypass(npe);
- } else {
- dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
- pnv_npu_dma_set_32(npe);
- }
- }
-}
-
-#ifdef CONFIG_IOMMU_API
/* Switch ownership from platform code to external user (e.g. VFIO) */
static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
{
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index 29ca523c1c79..a2aa5e433ac8 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
-OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ);
OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
@@ -287,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
+OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE);
+OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG);
+OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG);
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 000000000000..ed895d82c048
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,636 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interface for exporting the OPAL ELF core.
+ * Heavily inspired from fs/proc/vmcore.c
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal core: " fmt
+
+#include <linux/memblock.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/crash_core.h>
+#include <linux/of.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+#define MAX_PT_LOAD_CNT 8
+
+/* NT_AUXV note related info */
+#define AUXV_CNT 1
+#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
+
+struct opalcore_config {
+ u32 num_cpus;
+ /* PIR value of crashing CPU */
+ u32 crashing_cpu;
+
+ /* CPU state data info from F/W */
+ u64 cpu_state_destination_vaddr;
+ u64 cpu_state_data_size;
+ u64 cpu_state_entry_size;
+
+ /* OPAL memory to be exported as PT_LOAD segments */
+ u64 ptload_addr[MAX_PT_LOAD_CNT];
+ u64 ptload_size[MAX_PT_LOAD_CNT];
+ u64 ptload_cnt;
+
+ /* Pointer to the first PT_LOAD in the ELF core file */
+ Elf64_Phdr *ptload_phdr;
+
+ /* Total size of opalcore file. */
+ size_t opalcore_size;
+
+ /* Buffer for all the ELF core headers and the PT_NOTE */
+ size_t opalcorebuf_sz;
+ char *opalcorebuf;
+
+ /* NT_AUXV buffer */
+ char auxv_buf[AUXV_DESC_SZ];
+};
+
+struct opalcore {
+ struct list_head list;
+ u64 paddr;
+ size_t size;
+ loff_t offset;
+};
+
+static LIST_HEAD(opalcore_list);
+static struct opalcore_config *oc_conf;
+static const struct opal_mpipl_fadump *opalc_metadata;
+static const struct opal_mpipl_fadump *opalc_cpu_metadata;
+
+/*
+ * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered
+ * by kernel, SIGTERM otherwise.
+ */
+bool kernel_initiated;
+
+static struct opalcore * __init get_new_element(void)
+{
+ return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
+}
+
+static inline int is_opalcore_usable(void)
+{
+ return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
+}
+
+static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
+ u32 type, void *data,
+ size_t data_len)
+{
+ Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
+ Elf64_Word namesz = strlen(name) + 1;
+
+ note->n_namesz = cpu_to_be32(namesz);
+ note->n_descsz = cpu_to_be32(data_len);
+ note->n_type = cpu_to_be32(type);
+ buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
+ memcpy(buf, name, namesz);
+ buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
+ memcpy(buf, data, data_len);
+ buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
+
+ return buf;
+}
+
+static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
+ struct pt_regs *regs)
+{
+ memset(prstatus, 0, sizeof(struct elf_prstatus));
+ elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs);
+
+ /*
+ * Overload PID with PIR value.
+ * As a PIR value could also be '0', add an offset of '100'
+ * to every PIR to avoid misinterpretations in GDB.
+ */
+ prstatus->pr_pid = cpu_to_be32(100 + pir);
+ prstatus->pr_ppid = cpu_to_be32(1);
+
+ /*
+ * Indicate SIGUSR1 for crash initiated from kernel.
+ * SIGTERM otherwise.
+ */
+ if (pir == oc_conf->crashing_cpu) {
+ short sig;
+
+ sig = kernel_initiated ? SIGUSR1 : SIGTERM;
+ prstatus->pr_cursig = cpu_to_be16(sig);
+ }
+}
+
+static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf,
+ u64 opal_boot_entry)
+{
+ Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
+ int idx = 0;
+
+ memset(bufp, 0, AUXV_DESC_SZ);
+
+ /* Entry point of OPAL */
+ bufp[idx++] = cpu_to_be64(AT_ENTRY);
+ bufp[idx++] = cpu_to_be64(opal_boot_entry);
+
+ /* end of vector */
+ bufp[idx++] = cpu_to_be64(AT_NULL);
+
+ buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
+ oc_conf->auxv_buf, AUXV_DESC_SZ);
+ return buf;
+}
+
+/*
+ * Read from the ELF header and then the crash dump.
+ * Returns number of bytes read on success, -errno on failure.
+ */
+static ssize_t read_opalcore(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ struct opalcore *m;
+ ssize_t tsz, avail;
+ loff_t tpos = pos;
+
+ if (pos >= oc_conf->opalcore_size)
+ return 0;
+
+ /* Adjust count if it goes beyond opalcore size */
+ avail = oc_conf->opalcore_size - pos;
+ if (count > avail)
+ count = avail;
+
+ if (count == 0)
+ return 0;
+
+ /* Read ELF core header and/or PT_NOTE segment */
+ if (tpos < oc_conf->opalcorebuf_sz) {
+ tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count);
+ memcpy(to, oc_conf->opalcorebuf + tpos, tsz);
+ to += tsz;
+ tpos += tsz;
+ count -= tsz;
+ }
+
+ list_for_each_entry(m, &opalcore_list, list) {
+ /* nothing more to read here */
+ if (count == 0)
+ break;
+
+ if (tpos < m->offset + m->size) {
+ void *addr;
+
+ tsz = min_t(size_t, m->offset + m->size - tpos, count);
+ addr = (void *)(m->paddr + tpos - m->offset);
+ memcpy(to, __va(addr), tsz);
+ to += tsz;
+ tpos += tsz;
+ count -= tsz;
+ }
+ }
+
+ return (tpos - pos);
+}
+
+static struct bin_attribute opal_core_attr = {
+ .attr = {.name = "core", .mode = 0400},
+ .read = read_opalcore
+};
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ *
+ * Each register entry is of 16 bytes, A numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details refer to F/W documentation.
+ */
+static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
+{
+ u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+ struct hdat_fadump_thread_hdr *thdr;
+ struct elf_prstatus prstatus;
+ Elf64_Word *first_cpu_note;
+ struct pt_regs regs;
+ char *bufp;
+ int i;
+
+ size_per_thread = oc_conf->cpu_state_entry_size;
+ bufp = __va(oc_conf->cpu_state_destination_vaddr);
+
+ /*
+ * Offset for register entries, entry size and registers count is
+ * duplicated in every thread header in keeping with HDAT format.
+ * Use these values from the first thread header.
+ */
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+ be32_to_cpu(thdr->offset));
+ reg_esize = be32_to_cpu(thdr->esize);
+ regs_cnt = be32_to_cpu(thdr->ecnt);
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("NumCpus : %u\n", oc_conf->num_cpus);
+ pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+ regs_offset, reg_esize, regs_cnt);
+
+ /*
+ * Skip past the first CPU note. Fill this note with the
+ * crashing CPU's prstatus.
+ */
+ first_cpu_note = buf;
+ buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+
+ for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ thread_pir = be32_to_cpu(thdr->pir);
+
+ pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+ i, thread_pir, thdr->core_state);
+
+ /*
+ * Register state data of MAX cores is provided by firmware,
+ * but some of this cores may not be active. So, while
+ * processing register state data, check core state and
+ * skip threads that belong to inactive cores.
+ */
+ if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+ continue;
+
+ opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+ reg_esize, false, &regs);
+
+ pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir,
+ be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip));
+ fill_prstatus(&prstatus, thread_pir, &regs);
+
+ if (thread_pir != oc_conf->crashing_cpu) {
+ buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
+ NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ } else {
+ /*
+ * Add crashing CPU as the first NT_PRSTATUS note for
+ * GDB to process the core file appropriately.
+ */
+ append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
+ NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ }
+ }
+
+ return buf;
+}
+
+static int __init create_opalcore(void)
+{
+ u64 opal_boot_entry, opal_base_addr, paddr;
+ u32 hdr_size, cpu_notes_size, count;
+ struct device_node *dn;
+ struct opalcore *new;
+ loff_t opalcore_off;
+ struct page *page;
+ Elf64_Phdr *phdr;
+ Elf64_Ehdr *elf;
+ int i, ret;
+ char *bufp;
+
+ /* Get size of header & CPU notes for OPAL core */
+ hdr_size = (sizeof(Elf64_Ehdr) +
+ ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr)));
+ cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES +
+ CRASH_CORE_NOTE_NAME_BYTES +
+ CRASH_CORE_NOTE_DESC_BYTES)) +
+ (CRASH_CORE_NOTE_HEAD_BYTES +
+ CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ));
+
+ /* Allocate buffer to setup OPAL core */
+ oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size);
+ oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!oc_conf->opalcorebuf) {
+ pr_err("Not enough memory to setup OPAL core (size: %lu)\n",
+ oc_conf->opalcorebuf_sz);
+ oc_conf->opalcorebuf_sz = 0;
+ return -ENOMEM;
+ }
+ count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
+ page = virt_to_page(oc_conf->opalcorebuf);
+ for (i = 0; i < count; i++)
+ mark_page_reserved(page + i);
+
+ pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
+
+ /* Read OPAL related device-tree entries */
+ dn = of_find_node_by_name(NULL, "ibm,opal");
+ if (dn) {
+ ret = of_property_read_u64(dn, "opal-base-address",
+ &opal_base_addr);
+ pr_debug("opal-base-address: %llx\n", opal_base_addr);
+ ret |= of_property_read_u64(dn, "opal-boot-address",
+ &opal_boot_entry);
+ pr_debug("opal-boot-address: %llx\n", opal_boot_entry);
+ }
+ if (!dn || ret)
+ pr_warn("WARNING: Failed to read OPAL base & entry values\n");
+
+ /* Use count to keep track of the program headers */
+ count = 0;
+
+ bufp = oc_conf->opalcorebuf;
+ elf = (Elf64_Ehdr *)bufp;
+ bufp += sizeof(Elf64_Ehdr);
+ memcpy(elf->e_ident, ELFMAG, SELFMAG);
+ elf->e_ident[EI_CLASS] = ELF_CLASS;
+ elf->e_ident[EI_DATA] = ELFDATA2MSB;
+ elf->e_ident[EI_VERSION] = EV_CURRENT;
+ elf->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+ elf->e_type = cpu_to_be16(ET_CORE);
+ elf->e_machine = cpu_to_be16(ELF_ARCH);
+ elf->e_version = cpu_to_be32(EV_CURRENT);
+ elf->e_entry = 0;
+ elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr));
+ elf->e_shoff = 0;
+ elf->e_flags = 0;
+
+ elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr));
+ elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr));
+ elf->e_phnum = 0;
+ elf->e_shentsize = 0;
+ elf->e_shnum = 0;
+ elf->e_shstrndx = 0;
+
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = cpu_to_be32(PT_NOTE);
+ phdr->p_flags = 0;
+ phdr->p_align = 0;
+ phdr->p_paddr = phdr->p_vaddr = 0;
+ phdr->p_offset = cpu_to_be64(hdr_size);
+ phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size);
+ count++;
+
+ opalcore_off = oc_conf->opalcorebuf_sz;
+ oc_conf->ptload_phdr = (Elf64_Phdr *)bufp;
+ paddr = 0;
+ for (i = 0; i < oc_conf->ptload_cnt; i++) {
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = cpu_to_be32(PT_LOAD);
+ phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X);
+ phdr->p_align = 0;
+
+ new = get_new_element();
+ if (!new)
+ return -ENOMEM;
+ new->paddr = oc_conf->ptload_addr[i];
+ new->size = oc_conf->ptload_size[i];
+ new->offset = opalcore_off;
+ list_add_tail(&new->list, &opalcore_list);
+
+ phdr->p_paddr = cpu_to_be64(paddr);
+ phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr);
+ phdr->p_filesz = phdr->p_memsz =
+ cpu_to_be64(oc_conf->ptload_size[i]);
+ phdr->p_offset = cpu_to_be64(opalcore_off);
+
+ count++;
+ opalcore_off += oc_conf->ptload_size[i];
+ paddr += oc_conf->ptload_size[i];
+ }
+
+ elf->e_phnum = cpu_to_be16(count);
+
+ bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp);
+ bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry);
+
+ oc_conf->opalcore_size = opalcore_off;
+ return 0;
+}
+
+static void opalcore_cleanup(void)
+{
+ if (oc_conf == NULL)
+ return;
+
+ /* Remove OPAL core sysfs file */
+ sysfs_remove_bin_file(opal_kobj, &opal_core_attr);
+ oc_conf->ptload_phdr = NULL;
+ oc_conf->ptload_cnt = 0;
+
+ /* free the buffer used for setting up OPAL core */
+ if (oc_conf->opalcorebuf) {
+ void *end = (void *)((u64)oc_conf->opalcorebuf +
+ oc_conf->opalcorebuf_sz);
+
+ free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+ oc_conf->opalcorebuf = NULL;
+ oc_conf->opalcorebuf_sz = 0;
+ }
+
+ kfree(oc_conf);
+ oc_conf = NULL;
+}
+__exitcall(opalcore_cleanup);
+
+static void __init opalcore_config_init(void)
+{
+ u32 idx, cpu_data_version;
+ struct device_node *np;
+ const __be32 *prop;
+ u64 addr = 0;
+ int i, ret;
+
+ np = of_find_node_by_path("/ibm,opal/dump");
+ if (np == NULL)
+ return;
+
+ if (!of_device_is_compatible(np, "ibm,opal-dump")) {
+ pr_warn("Support missing for this f/w version!\n");
+ return;
+ }
+
+ /* Check if dump has been initiated on last reboot */
+ prop = of_get_property(np, "mpipl-boot", NULL);
+ if (!prop) {
+ of_node_put(np);
+ return;
+ }
+
+ /* Get OPAL metadata */
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get OPAL metadata (%d)\n", ret);
+ goto error_out;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("OPAL metadata addr: %llx\n", addr);
+ opalc_metadata = __va(addr);
+
+ /* Get OPAL CPU metadata */
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get OPAL CPU metadata (%d)\n", ret);
+ goto error_out;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("CPU metadata addr: %llx\n", addr);
+ opalc_cpu_metadata = __va(addr);
+
+ /* Allocate memory for config buffer */
+ oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL);
+ if (oc_conf == NULL)
+ goto error_out;
+
+ /* Parse OPAL metadata */
+ if (opalc_metadata->version != OPAL_MPIPL_VERSION) {
+ pr_warn("Supported OPAL metadata version: %u, found: %u!\n",
+ OPAL_MPIPL_VERSION, opalc_metadata->version);
+ pr_warn("WARNING: F/W using newer OPAL metadata format!!\n");
+ }
+
+ oc_conf->ptload_cnt = 0;
+ idx = be32_to_cpu(opalc_metadata->region_cnt);
+ if (idx > MAX_PT_LOAD_CNT) {
+ pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)",
+ MAX_PT_LOAD_CNT, idx);
+ idx = MAX_PT_LOAD_CNT;
+ }
+ for (i = 0; i < idx; i++) {
+ oc_conf->ptload_addr[oc_conf->ptload_cnt] =
+ be64_to_cpu(opalc_metadata->region[i].dest);
+ oc_conf->ptload_size[oc_conf->ptload_cnt++] =
+ be64_to_cpu(opalc_metadata->region[i].size);
+ }
+ oc_conf->ptload_cnt = i;
+ oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir);
+
+ if (!oc_conf->ptload_cnt) {
+ pr_err("OPAL memory regions not found\n");
+ goto error_out;
+ }
+
+ /* Parse OPAL CPU metadata */
+ cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version);
+ if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+ pr_warn("Supported CPU data version: %u, found: %u!\n",
+ HDAT_FADUMP_CPU_DATA_VER, cpu_data_version);
+ pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+ }
+
+ addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest);
+ if (!addr) {
+ pr_err("CPU state data not found!\n");
+ goto error_out;
+ }
+ oc_conf->cpu_state_destination_vaddr = (u64)__va(addr);
+
+ oc_conf->cpu_state_data_size =
+ be64_to_cpu(opalc_cpu_metadata->region[0].size);
+ oc_conf->cpu_state_entry_size =
+ be32_to_cpu(opalc_cpu_metadata->cpu_data_size);
+
+ if ((oc_conf->cpu_state_entry_size == 0) ||
+ (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) {
+ pr_err("CPU state data is invalid.\n");
+ goto error_out;
+ }
+ oc_conf->num_cpus = (oc_conf->cpu_state_data_size /
+ oc_conf->cpu_state_entry_size);
+
+ of_node_put(np);
+ return;
+
+error_out:
+ pr_err("Could not export /sys/firmware/opal/core\n");
+ opalcore_cleanup();
+ of_node_put(np);
+}
+
+static ssize_t fadump_release_opalcore_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int input = -1;
+
+ if (kstrtoint(buf, 0, &input))
+ return -EINVAL;
+
+ if (input == 1) {
+ if (oc_conf == NULL) {
+ pr_err("'/sys/firmware/opal/core' file not accessible!\n");
+ return -EPERM;
+ }
+
+ /*
+ * Take away '/sys/firmware/opal/core' and release all memory
+ * used for exporting this file.
+ */
+ opalcore_cleanup();
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore,
+ 0200, NULL,
+ fadump_release_opalcore_store);
+
+static int __init opalcore_init(void)
+{
+ int rc = -1;
+
+ opalcore_config_init();
+
+ if (oc_conf == NULL)
+ return rc;
+
+ create_opalcore();
+
+ /*
+ * If oc_conf->opalcorebuf= is set in the 2nd kernel,
+ * then capture the dump.
+ */
+ if (!(is_opalcore_usable())) {
+ pr_err("Failed to export /sys/firmware/opal/core\n");
+ opalcore_cleanup();
+ return rc;
+ }
+
+ /* Set OPAL core file size */
+ opal_core_attr.size = oc_conf->opalcore_size;
+
+ /* Export OPAL core sysfs file */
+ rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr);
+ if (rc != 0) {
+ pr_err("Failed to export /sys/firmware/opal/core\n");
+ opalcore_cleanup();
+ return rc;
+ }
+
+ rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr);
+ if (rc) {
+ pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n",
+ rc);
+ }
+
+ return 0;
+}
+fs_initcall(opalcore_init);
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
new file mode 100644
index 000000000000..d361d37d975f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+
+#ifdef CONFIG_PRESERVE_FA_DUMP
+/*
+ * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
+ * ensure crash data is preserved in hope that the subsequent memory
+ * preserving kernel boot is going to process this crash data.
+ */
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ const struct opal_fadump_mem_struct *opal_fdm_active;
+ const __be32 *prop;
+ unsigned long dn;
+ u64 addr = 0;
+ s64 ret;
+
+ dn = of_get_flat_dt_subnode_by_name(node, "dump");
+ if (dn == -FDT_ERR_NOTFOUND)
+ return;
+
+ /*
+ * Check if dump has been initiated on last reboot.
+ */
+ prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+ if (!prop)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_debug("Could not get Kernel metadata (%lld)\n", ret);
+ return;
+ }
+
+ /*
+ * Preserve memory only if kernel memory regions are registered
+ * with f/w for MPIPL.
+ */
+ addr = be64_to_cpu(addr);
+ pr_debug("Kernel metadata addr: %llx\n", addr);
+ opal_fdm_active = (void *)addr;
+ if (opal_fdm_active->registered_regions == 0)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get boot memory tag (%lld)\n", ret);
+ return;
+ }
+
+ /*
+ * Memory below this address can be used for booting a
+ * capture kernel or petitboot kernel. Preserve everything
+ * above this address for processing crashdump.
+ */
+ fadump_conf->boot_mem_top = be64_to_cpu(addr);
+ pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top);
+
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+}
+
+#else /* CONFIG_PRESERVE_FA_DUMP */
+static const struct opal_fadump_mem_struct *opal_fdm_active;
+static const struct opal_mpipl_fadump *opal_cpu_metadata;
+static struct opal_fadump_mem_struct *opal_fdm;
+
+#ifdef CONFIG_OPAL_CORE
+extern bool kernel_initiated;
+#endif
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf);
+
+static void opal_fadump_update_config(struct fw_dump *fadump_conf,
+ const struct opal_fadump_mem_struct *fdm)
+{
+ pr_debug("Boot memory regions count: %d\n", fdm->region_cnt);
+
+ /*
+ * The destination address of the first boot memory region is the
+ * destination address of boot memory regions.
+ */
+ fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest;
+ pr_debug("Destination address of boot memory regions: %#016llx\n",
+ fadump_conf->boot_mem_dest_addr);
+
+ fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
+}
+
+/*
+ * This function is called in the capture kernel to get configuration details
+ * from metadata setup by the first kernel.
+ */
+static void opal_fadump_get_config(struct fw_dump *fadump_conf,
+ const struct opal_fadump_mem_struct *fdm)
+{
+ unsigned long base, size, last_end, hole_size;
+ int i;
+
+ if (!fadump_conf->dump_active)
+ return;
+
+ last_end = 0;
+ hole_size = 0;
+ fadump_conf->boot_memory_size = 0;
+
+ pr_debug("Boot memory regions:\n");
+ for (i = 0; i < fdm->region_cnt; i++) {
+ base = fdm->rgn[i].src;
+ size = fdm->rgn[i].size;
+ pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
+
+ fadump_conf->boot_mem_addr[i] = base;
+ fadump_conf->boot_mem_sz[i] = size;
+ fadump_conf->boot_memory_size += size;
+ hole_size += (base - last_end);
+
+ last_end = base + size;
+ }
+
+ /*
+ * Start address of reserve dump area (permanent reservation) for
+ * re-registering FADump after dump capture.
+ */
+ fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest;
+
+ /*
+ * Rarely, but it can so happen that system crashes before all
+ * boot memory regions are registered for MPIPL. In such
+ * cases, warn that the vmcore may not be accurate and proceed
+ * anyway as that is the best bet considering free pages, cache
+ * pages, user pages, etc are usually filtered out.
+ *
+ * Hope the memory that could not be preserved only has pages
+ * that are usually filtered out while saving the vmcore.
+ */
+ if (fdm->region_cnt > fdm->registered_regions) {
+ pr_warn("Not all memory regions were saved!!!\n");
+ pr_warn(" Unsaved memory regions:\n");
+ i = fdm->registered_regions;
+ while (i < fdm->region_cnt) {
+ pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n",
+ i, fdm->rgn[i].src, fdm->rgn[i].size);
+ i++;
+ }
+
+ pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n");
+ pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n");
+ }
+
+ fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
+ fadump_conf->boot_mem_regs_cnt = fdm->region_cnt;
+ opal_fadump_update_config(fadump_conf, fdm);
+}
+
+/* Initialize kernel metadata */
+static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
+{
+ fdm->version = OPAL_FADUMP_VERSION;
+ fdm->region_cnt = 0;
+ fdm->registered_regions = 0;
+ fdm->fadumphdr_addr = 0;
+}
+
+static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+ u64 addr = fadump_conf->reserve_dump_area_start;
+ int i;
+
+ opal_fdm = __va(fadump_conf->kernel_metadata);
+ opal_fadump_init_metadata(opal_fdm);
+
+ /* Boot memory regions */
+ for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
+ opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i];
+ opal_fdm->rgn[i].dest = addr;
+ opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i];
+
+ opal_fdm->region_cnt++;
+ addr += fadump_conf->boot_mem_sz[i];
+ }
+
+ /*
+ * Kernel metadata is passed to f/w and retrieved in capture kerenl.
+ * So, use it to save fadump header address instead of calculating it.
+ */
+ opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest +
+ fadump_conf->boot_memory_size);
+
+ opal_fadump_update_config(fadump_conf, opal_fdm);
+
+ return addr;
+}
+
+static u64 opal_fadump_get_metadata_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct));
+}
+
+static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
+{
+ int err = 0;
+ s64 ret;
+
+ /*
+ * Use the last page(s) in FADump memory reservation for
+ * kernel metadata.
+ */
+ fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start +
+ fadump_conf->reserve_dump_area_size -
+ opal_fadump_get_metadata_size());
+ pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata);
+
+ /* Initialize kernel metadata before registering the address with f/w */
+ opal_fdm = __va(fadump_conf->kernel_metadata);
+ opal_fadump_init_metadata(opal_fdm);
+
+ /*
+ * Register metadata address with f/w. Can be retrieved in
+ * the capture kernel.
+ */
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL,
+ fadump_conf->kernel_metadata);
+ if (ret != OPAL_SUCCESS) {
+ pr_err("Failed to set kernel metadata tag!\n");
+ err = -EPERM;
+ }
+
+ /*
+ * Register boot memory top address with f/w. Should be retrieved
+ * by a kernel that intends to preserve crash'ed kernel's memory.
+ */
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
+ fadump_conf->boot_mem_top);
+ if (ret != OPAL_SUCCESS) {
+ pr_err("Failed to set boot memory tag!\n");
+ err = -EPERM;
+ }
+
+ return err;
+}
+
+static u64 opal_fadump_get_bootmem_min(void)
+{
+ return OPAL_FADUMP_MIN_BOOT_MEM;
+}
+
+static int opal_fadump_register(struct fw_dump *fadump_conf)
+{
+ s64 rc = OPAL_PARAMETER;
+ int i, err = -EIO;
+
+ for (i = 0; i < opal_fdm->region_cnt; i++) {
+ rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE,
+ opal_fdm->rgn[i].src,
+ opal_fdm->rgn[i].dest,
+ opal_fdm->rgn[i].size);
+ if (rc != OPAL_SUCCESS)
+ break;
+
+ opal_fdm->registered_regions++;
+ }
+
+ switch (rc) {
+ case OPAL_SUCCESS:
+ pr_info("Registration is successful!\n");
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case OPAL_RESOURCE:
+ /* If MAX regions limit in f/w is hit, warn and proceed. */
+ pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n",
+ (opal_fdm->region_cnt - opal_fdm->registered_regions));
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case OPAL_PARAMETER:
+ pr_err("Failed to register. Parameter Error(%lld).\n", rc);
+ break;
+ case OPAL_HARDWARE:
+ pr_err("Support not available.\n");
+ fadump_conf->fadump_supported = 0;
+ fadump_conf->fadump_enabled = 0;
+ break;
+ default:
+ pr_err("Failed to register. Unknown Error(%lld).\n", rc);
+ break;
+ }
+
+ /*
+ * If some regions were registered before OPAL_MPIPL_ADD_RANGE
+ * OPAL call failed, unregister all regions.
+ */
+ if ((err < 0) && (opal_fdm->registered_regions > 0))
+ opal_fadump_unregister(fadump_conf);
+
+ return err;
+}
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf)
+{
+ s64 rc;
+
+ rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0);
+ if (rc) {
+ pr_err("Failed to un-register - unexpected Error(%lld).\n", rc);
+ return -EIO;
+ }
+
+ opal_fdm->registered_regions = 0;
+ fadump_conf->dump_registered = 0;
+ return 0;
+}
+
+static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+ s64 rc;
+
+ rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0);
+ if (rc) {
+ pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc);
+ return -EIO;
+ }
+
+ fadump_conf->dump_active = 0;
+ opal_fdm_active = NULL;
+ return 0;
+}
+
+static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
+{
+ s64 ret;
+
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0);
+ if (ret != OPAL_SUCCESS)
+ pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
+}
+
+/*
+ * Verify if CPU state data is available. If available, do a bit of sanity
+ * checking before processing this data.
+ */
+static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf)
+{
+ if (!opal_cpu_metadata)
+ return false;
+
+ fadump_conf->cpu_state_data_version =
+ be32_to_cpu(opal_cpu_metadata->cpu_data_version);
+ fadump_conf->cpu_state_entry_size =
+ be32_to_cpu(opal_cpu_metadata->cpu_data_size);
+ fadump_conf->cpu_state_dest_vaddr =
+ (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest));
+ fadump_conf->cpu_state_data_size =
+ be64_to_cpu(opal_cpu_metadata->region[0].size);
+
+ if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+ pr_warn("Supported CPU state data version: %u, found: %d!\n",
+ HDAT_FADUMP_CPU_DATA_VER,
+ fadump_conf->cpu_state_data_version);
+ pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+ }
+
+ if ((fadump_conf->cpu_state_dest_vaddr == 0) ||
+ (fadump_conf->cpu_state_entry_size == 0) ||
+ (fadump_conf->cpu_state_entry_size >
+ fadump_conf->cpu_state_data_size)) {
+ pr_err("CPU state data is invalid. Ignoring!\n");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Convert CPU state data saved at the time of crash into ELF notes.
+ *
+ * While the crashing CPU's register data is saved by the kernel, CPU state
+ * data for all CPUs is saved by f/w. In CPU state data provided by f/w,
+ * each register entry is of 16 bytes, a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details refer to F/W documentation. If this data is
+ * missing or in unsupported format, append crashing CPU's register data
+ * saved by the kernel in the PT_NOTE, to have something to work with in
+ * the vmcore file.
+ */
+static int __init
+opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
+ struct fadump_crash_info_header *fdh)
+{
+ u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+ struct hdat_fadump_thread_hdr *thdr;
+ bool is_cpu_data_valid = false;
+ u32 num_cpus = 1, *note_buf;
+ struct pt_regs regs;
+ char *bufp;
+ int rc, i;
+
+ if (is_opal_fadump_cpu_data_valid(fadump_conf)) {
+ size_per_thread = fadump_conf->cpu_state_entry_size;
+ num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread);
+ bufp = __va(fadump_conf->cpu_state_dest_vaddr);
+ is_cpu_data_valid = true;
+ }
+
+ rc = fadump_setup_cpu_notes_buf(num_cpus);
+ if (rc != 0)
+ return rc;
+
+ note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+ if (!is_cpu_data_valid)
+ goto out;
+
+ /*
+ * Offset for register entries, entry size and registers count is
+ * duplicated in every thread header in keeping with HDAT format.
+ * Use these values from the first thread header.
+ */
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+ be32_to_cpu(thdr->offset));
+ reg_esize = be32_to_cpu(thdr->esize);
+ regs_cnt = be32_to_cpu(thdr->ecnt);
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("NumCpus : %u\n", num_cpus);
+ pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+ regs_offset, reg_esize, regs_cnt);
+
+ for (i = 0; i < num_cpus; i++, bufp += size_per_thread) {
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+
+ thread_pir = be32_to_cpu(thdr->pir);
+ pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+ i, thread_pir, thdr->core_state);
+
+ /*
+ * If this is kernel initiated crash, crashing_cpu would be set
+ * appropriately and register data of the crashing CPU saved by
+ * crashing kernel. Add this saved register data of crashing CPU
+ * to elf notes and populate the pt_regs for the remaining CPUs
+ * from register state data provided by firmware.
+ */
+ if (fdh->crashing_cpu == thread_pir) {
+ note_buf = fadump_regs_to_elf_notes(note_buf,
+ &fdh->regs);
+ pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+ fdh->crashing_cpu, fdh->regs.gpr[1],
+ fdh->regs.nip);
+ continue;
+ }
+
+ /*
+ * Register state data of MAX cores is provided by firmware,
+ * but some of this cores may not be active. So, while
+ * processing register state data, check core state and
+ * skip threads that belong to inactive cores.
+ */
+ if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+ continue;
+
+ opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+ reg_esize, true, &regs);
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+ thread_pir, regs.gpr[1], regs.nip);
+ }
+
+out:
+ /*
+ * CPU state data is invalid/unsupported. Try appending crashing CPU's
+ * register data, if it is saved by the kernel.
+ */
+ if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) {
+ if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) {
+ fadump_free_cpu_notes_buf();
+ return -ENODEV;
+ }
+
+ pr_warn("WARNING: appending only crashing CPU's register data\n");
+ note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+ }
+
+ final_note(note_buf);
+
+ pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+ fdh->elfcorehdr_addr);
+ fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+ return 0;
+}
+
+static int __init opal_fadump_process(struct fw_dump *fadump_conf)
+{
+ struct fadump_crash_info_header *fdh;
+ int rc = -EINVAL;
+
+ if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
+ return rc;
+
+ /* Validate the fadump crash info header */
+ fdh = __va(fadump_conf->fadumphdr_addr);
+ if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+ pr_err("Crash info header is not valid.\n");
+ return rc;
+ }
+
+#ifdef CONFIG_OPAL_CORE
+ /*
+ * If this is a kernel initiated crash, crashing_cpu would be set
+ * appropriately and register data of the crashing CPU saved by
+ * crashing kernel. Add this saved register data of crashing CPU
+ * to elf notes and populate the pt_regs for the remaining CPUs
+ * from register state data provided by firmware.
+ */
+ if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)
+ kernel_initiated = true;
+#endif
+
+ rc = opal_fadump_build_cpu_notes(fadump_conf, fdh);
+ if (rc)
+ return rc;
+
+ /*
+ * We are done validating dump info and elfcore header is now ready
+ * to be exported. set elfcorehdr_addr so that vmcore module will
+ * export the elfcore header through '/proc/vmcore'.
+ */
+ elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+ return rc;
+}
+
+static void opal_fadump_region_show(struct fw_dump *fadump_conf,
+ struct seq_file *m)
+{
+ const struct opal_fadump_mem_struct *fdm_ptr;
+ u64 dumped_bytes = 0;
+ int i;
+
+ if (fadump_conf->dump_active)
+ fdm_ptr = opal_fdm_active;
+ else
+ fdm_ptr = opal_fdm;
+
+ for (i = 0; i < fdm_ptr->region_cnt; i++) {
+ /*
+ * Only regions that are registered for MPIPL
+ * would have dump data.
+ */
+ if ((fadump_conf->dump_active) &&
+ (i < fdm_ptr->registered_regions))
+ dumped_bytes = fdm_ptr->rgn[i].size;
+
+ seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+ fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest);
+ seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
+ fdm_ptr->rgn[i].size, dumped_bytes);
+ }
+
+ /* Dump is active. Show reserved area start address. */
+ if (fadump_conf->dump_active) {
+ seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+ fadump_conf->reserve_dump_area_start);
+ }
+}
+
+static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
+ const char *msg)
+{
+ int rc;
+
+ /*
+ * Unlike on pSeries platform, logical CPU number is not provided
+ * with architected register state data. So, store the crashing
+ * CPU's PIR instead to plug the appropriate register data for
+ * crashing CPU in the vmcore file.
+ */
+ fdh->crashing_cpu = (u32)mfspr(SPRN_PIR);
+
+ rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
+ if (rc == OPAL_UNSUPPORTED) {
+ pr_emerg("Reboot type %d not supported.\n",
+ OPAL_REBOOT_MPIPL);
+ } else if (rc == OPAL_HARDWARE)
+ pr_emerg("No backend support for MPIPL!\n");
+}
+
+static struct fadump_ops opal_fadump_ops = {
+ .fadump_init_mem_struct = opal_fadump_init_mem_struct,
+ .fadump_get_metadata_size = opal_fadump_get_metadata_size,
+ .fadump_setup_metadata = opal_fadump_setup_metadata,
+ .fadump_get_bootmem_min = opal_fadump_get_bootmem_min,
+ .fadump_register = opal_fadump_register,
+ .fadump_unregister = opal_fadump_unregister,
+ .fadump_invalidate = opal_fadump_invalidate,
+ .fadump_cleanup = opal_fadump_cleanup,
+ .fadump_process = opal_fadump_process,
+ .fadump_region_show = opal_fadump_region_show,
+ .fadump_trigger = opal_fadump_trigger,
+};
+
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ const __be32 *prop;
+ unsigned long dn;
+ u64 addr = 0;
+ int i, len;
+ s64 ret;
+
+ /*
+ * Check if Firmware-Assisted Dump is supported. if yes, check
+ * if dump has been initiated on last reboot.
+ */
+ dn = of_get_flat_dt_subnode_by_name(node, "dump");
+ if (dn == -FDT_ERR_NOTFOUND) {
+ pr_debug("FADump support is missing!\n");
+ return;
+ }
+
+ if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) {
+ pr_err("Support missing for this f/w version!\n");
+ return;
+ }
+
+ prop = of_get_flat_dt_prop(dn, "fw-load-area", &len);
+ if (prop) {
+ /*
+ * Each f/w load area is an (address,size) pair,
+ * 2 cells each, totalling 4 cells per range.
+ */
+ for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+ u64 base, end;
+
+ base = of_read_number(prop + (i * 4) + 0, 2);
+ end = base;
+ end += of_read_number(prop + (i * 4) + 2, 2);
+ if (end > OPAL_FADUMP_MIN_BOOT_MEM) {
+ pr_err("F/W load area: 0x%llx-0x%llx\n",
+ base, end);
+ pr_err("F/W version not supported!\n");
+ return;
+ }
+ }
+ }
+
+ fadump_conf->ops = &opal_fadump_ops;
+ fadump_conf->fadump_supported = 1;
+
+ /*
+ * Firmware supports 32-bit field for size. Align it to PAGE_SIZE
+ * and request firmware to copy multiple kernel boot memory regions.
+ */
+ fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+
+ /*
+ * Check if dump has been initiated on last reboot.
+ */
+ prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+ if (!prop)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get Kernel metadata (%lld)\n", ret);
+ return;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("Kernel metadata addr: %llx\n", addr);
+
+ opal_fdm_active = __va(addr);
+ if (opal_fdm_active->version != OPAL_FADUMP_VERSION) {
+ pr_warn("Supported kernel metadata version: %u, found: %d!\n",
+ OPAL_FADUMP_VERSION, opal_fdm_active->version);
+ pr_warn("WARNING: Kernel metadata format mismatch identified! Core file maybe corrupted..\n");
+ }
+
+ /* Kernel regions not registered with f/w for MPIPL */
+ if (opal_fdm_active->registered_regions == 0) {
+ opal_fdm_active = NULL;
+ return;
+ }
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+ if (addr) {
+ addr = be64_to_cpu(addr);
+ pr_debug("CPU metadata addr: %llx\n", addr);
+ opal_cpu_metadata = __va(addr);
+ }
+
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+ opal_fadump_get_config(fadump_conf, opal_fdm_active);
+}
+#endif /* !CONFIG_PRESERVE_FA_DUMP */
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
new file mode 100644
index 000000000000..f1e9ecf548c5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _POWERNV_OPAL_FADUMP_H
+#define _POWERNV_OPAL_FADUMP_H
+
+#include <asm/reg.h>
+
+/*
+ * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum
+ * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't
+ * mess with crash'ed kernel's memory during MPIPL.
+ */
+#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL)
+
+/*
+ * OPAL FADump metadata structure format version
+ *
+ * OPAL FADump kernel metadata structure stores kernel metadata needed to
+ * register-for/process crash dump. Format version is used to keep a tab on
+ * the changes in the structure format. The changes, if any, to the format
+ * are expected to be minimal and backward compatible.
+ */
+#define OPAL_FADUMP_VERSION 0x1
+
+/*
+ * OPAL FADump kernel metadata
+ *
+ * The address of this structure will be registered with f/w for retrieving
+ * and processing during crash dump.
+ */
+struct opal_fadump_mem_struct {
+ u8 version;
+ u8 reserved[3];
+ u16 region_cnt; /* number of regions */
+ u16 registered_regions; /* Regions registered for MPIPL */
+ u64 fadumphdr_addr;
+ struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS];
+} __packed;
+
+/*
+ * CPU state data
+ *
+ * CPU state data information is provided by f/w. The format for this data
+ * is defined in the HDAT spec. Version is used to keep a tab on the changes
+ * in this CPU state data format. Changes to this format are unlikely, but
+ * if there are any changes, please refer to latest HDAT specification.
+ */
+#define HDAT_FADUMP_CPU_DATA_VER 1
+
+#define HDAT_FADUMP_CORE_INACTIVE (0x0F)
+
+/* HDAT thread header for register entries */
+struct hdat_fadump_thread_hdr {
+ __be32 pir;
+ /* 0x00 - 0x0F - The corresponding stop state of the core */
+ u8 core_state;
+ u8 reserved[3];
+
+ __be32 offset; /* Offset to Register Entries array */
+ __be32 ecnt; /* Number of entries */
+ __be32 esize; /* Alloc size of each array entry in bytes */
+ __be32 eactsz; /* Actual size of each array entry in bytes */
+} __packed;
+
+/* Register types populated by f/w */
+#define HDAT_FADUMP_REG_TYPE_GPR 0x01
+#define HDAT_FADUMP_REG_TYPE_SPR 0x02
+
+/* ID numbers used by f/w while populating certain registers */
+#define HDAT_FADUMP_REG_ID_NIP 0x7D0
+#define HDAT_FADUMP_REG_ID_MSR 0x7D1
+#define HDAT_FADUMP_REG_ID_CCR 0x7D2
+
+/* HDAT register entry. */
+struct hdat_fadump_reg_entry {
+ __be32 reg_type;
+ __be32 reg_num;
+ __be64 reg_val;
+} __packed;
+
+static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
+ u32 reg_type, u32 reg_num,
+ u64 reg_val)
+{
+ if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
+ if (reg_num < 32)
+ regs->gpr[reg_num] = reg_val;
+ return;
+ }
+
+ switch (reg_num) {
+ case SPRN_CTR:
+ regs->ctr = reg_val;
+ break;
+ case SPRN_LR:
+ regs->link = reg_val;
+ break;
+ case SPRN_XER:
+ regs->xer = reg_val;
+ break;
+ case SPRN_DAR:
+ regs->dar = reg_val;
+ break;
+ case SPRN_DSISR:
+ regs->dsisr = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_NIP:
+ regs->nip = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_MSR:
+ regs->msr = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_CCR:
+ regs->ccr = reg_val;
+ break;
+ }
+}
+
+static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
+ unsigned int reg_entry_size,
+ bool cpu_endian,
+ struct pt_regs *regs)
+{
+ struct hdat_fadump_reg_entry *reg_entry;
+ u64 val;
+ int i;
+
+ memset(regs, 0, sizeof(struct pt_regs));
+
+ for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+ reg_entry = (struct hdat_fadump_reg_entry *)bufp;
+ val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) :
+ reg_entry->reg_val);
+ opal_fadump_set_regval_regnum(regs,
+ be32_to_cpu(reg_entry->reg_type),
+ be32_to_cpu(reg_entry->reg_num),
+ val);
+ }
+}
+
+#endif /* _POWERNV_OPAL_FADUMP_H */
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 186109bdd41b..e04b20625cb9 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node,
struct imc_pmu *pmu_ptr)
{
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
- int chip = 0, nid;
char mode[16], cmd[16];
u32 cb_offset;
+ struct imc_mem_info *ptr = pmu_ptr->mem_info;
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
@@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node,
if (of_property_read_u32(node, "cb_offset", &cb_offset))
cb_offset = IMC_CNTL_BLK_OFFSET;
- for_each_node(nid) {
- loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
+ while (ptr->vbase != NULL) {
+ loc = (u64)(ptr->vbase) + cb_offset;
imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
- sprintf(mode, "imc_mode_%d", nid);
+ sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
imc_mode_addr))
goto err;
imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
- sprintf(cmd, "imc_cmd_%d", nid);
+ sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
imc_cmd_addr))
goto err;
- chip++;
+ ptr++;
}
return;
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index dc51d03c6370..d26da19a611f 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -29,23 +29,23 @@ struct memcons {
static struct memcons *opal_memcons = NULL;
-ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count)
{
const char *conbuf;
ssize_t ret;
size_t first_read = 0;
uint32_t out_pos, avail;
- if (!opal_memcons)
+ if (!mc)
return -ENODEV;
- out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos));
+ out_pos = be32_to_cpu(READ_ONCE(mc->out_pos));
/* Now we've read out_pos, put a barrier in before reading the new
* data it points to in conbuf. */
smp_rmb();
- conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys));
+ conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
/* When the buffer has wrapped, read from the out_pos marker to the end
* of the buffer, and then read the remaining data as in the un-wrapped
@@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
if (out_pos & MEMCONS_OUT_POS_WRAP) {
out_pos &= MEMCONS_OUT_POS_MASK;
- avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos;
+ avail = be32_to_cpu(mc->obuf_size) - out_pos;
ret = memory_read_from_buffer(to, count, &pos,
conbuf + out_pos, avail);
@@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
}
/* Sanity check. The firmware should not do this to us. */
- if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) {
+ if (out_pos > be32_to_cpu(mc->obuf_size)) {
pr_err("OPAL: memory console corruption. Aborting read.\n");
return -EINVAL;
}
@@ -86,6 +86,11 @@ out:
return ret;
}
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+{
+ return memcons_copy(opal_memcons, to, pos, count);
+}
+
static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
struct bin_attribute *bin_attr, char *to,
loff_t pos, size_t count)
@@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = {
.read = opal_msglog_read
};
-void __init opal_msglog_init(void)
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name)
{
u64 mcaddr;
struct memcons *mc;
- if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
- pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
- return;
+ if (of_property_read_u64(node, mc_prop_name, &mcaddr)) {
+ pr_warn("%s property not found, no message log\n",
+ mc_prop_name);
+ goto out_err;
}
mc = phys_to_virt(mcaddr);
if (!mc) {
- pr_warn("OPAL: memory console address is invalid\n");
- return;
+ pr_warn("memory console address is invalid\n");
+ goto out_err;
}
if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
- pr_warn("OPAL: memory console version is invalid\n");
- return;
+ pr_warn("memory console version is invalid\n");
+ goto out_err;
}
- /* Report maximum size */
- opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) +
- be32_to_cpu(mc->obuf_size);
+ return mc;
+
+out_err:
+ return NULL;
+}
+
+u32 memcons_get_size(struct memcons *mc)
+{
+ return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
+}
+
+void __init opal_msglog_init(void)
+{
+ opal_memcons = memcons_init(opal_node, "ibm,opal-memcons");
+ if (!opal_memcons) {
+ pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n");
+ return;
+ }
- opal_memcons = mc;
+ opal_msglog_attr.size = memcons_get_size(opal_memcons);
}
void __init opal_msglog_sysfs_init(void)
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index e072bf157d62..45f4223a790f 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
int msg_size, item_size;
unsigned long flags;
- if (msg_type != OPAL_MSG_PRD)
+ if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2)
return 0;
/* Calculate total size of the message and item we need to store. The
@@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev)
return rc;
}
+ rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
+ if (rc) {
+ pr_err("Couldn't register PRD2 event notifier\n");
+ return rc;
+ }
+
rc = misc_register(&opal_prd_dev);
if (rc) {
pr_err("failed to register miscdev\n");
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 66430eebe869..fd510d961b8c 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -1,7 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * PowerNV LPC bus handling.
+ * PowerNV SCOM bus debugfs interface
*
+ * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
+ * <benh@kernel.crashing.org>
+ * and David Gibson, IBM Corporation.
* Copyright 2013 IBM Corp.
*/
@@ -10,62 +13,13 @@
#include <linux/bug.h>
#include <linux/gfp.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
-#include <asm/scom.h>
-
-/*
- * We could probably fit that inside the scom_map_t
- * which is a void* after all but it's really too ugly
- * so let's kmalloc it for now
- */
-struct opal_scom_map {
- uint32_t chip;
- uint64_t addr;
-};
-
-static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
-{
- struct opal_scom_map *m;
- const __be32 *gcid;
-
- if (!of_get_property(dev, "scom-controller", NULL)) {
- pr_err("%s: device %pOF is not a SCOM controller\n",
- __func__, dev);
- return SCOM_MAP_INVALID;
- }
- gcid = of_get_property(dev, "ibm,chip-id", NULL);
- if (!gcid) {
- pr_err("%s: device %pOF has no ibm,chip-id\n",
- __func__, dev);
- return SCOM_MAP_INVALID;
- }
- m = kmalloc(sizeof(*m), GFP_KERNEL);
- if (!m)
- return NULL;
- m->chip = be32_to_cpup(gcid);
- m->addr = reg;
-
- return (scom_map_t)m;
-}
-
-static void opal_scom_unmap(scom_map_t map)
-{
- kfree(map);
-}
-
-static int opal_xscom_err_xlate(int64_t rc)
-{
- switch(rc) {
- case 0:
- return 0;
- /* Add more translations if necessary */
- default:
- return -EIO;
- }
-}
+#include <asm/debugfs.h>
+#include <asm/prom.h>
static u64 opal_scom_unmangle(u64 addr)
{
@@ -98,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr)
return addr;
}
-static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value)
{
- struct opal_scom_map *m = map;
int64_t rc;
__be64 v;
- reg = opal_scom_unmangle(m->addr + reg);
- rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+ reg = opal_scom_unmangle(addr + reg);
+ rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v));
+ if (rc) {
+ *value = 0xfffffffffffffffful;
+ return -EIO;
+ }
*value = be64_to_cpu(v);
- return opal_xscom_err_xlate(rc);
+ return 0;
}
-static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
+static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value)
{
- struct opal_scom_map *m = map;
int64_t rc;
- reg = opal_scom_unmangle(m->addr + reg);
- rc = opal_xscom_write(m->chip, reg, value);
- return opal_xscom_err_xlate(rc);
+ reg = opal_scom_unmangle(addr + reg);
+ rc = opal_xscom_write(chip, reg, value);
+ if (rc)
+ return -EIO;
+ return 0;
+}
+
+struct scom_debug_entry {
+ u32 chip;
+ struct debugfs_blob_wrapper path;
+ char name[16];
+};
+
+static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct scom_debug_entry *ent = filp->private_data;
+ u64 __user *ubuf64 = (u64 __user *)ubuf;
+ loff_t off = *ppos;
+ ssize_t done = 0;
+ u64 reg, reg_base, reg_cnt, val;
+ int rc;
+
+ if (off < 0 || (off & 7) || (count & 7))
+ return -EINVAL;
+ reg_base = off >> 3;
+ reg_cnt = count >> 3;
+
+ for (reg = 0; reg < reg_cnt; reg++) {
+ rc = opal_scom_read(ent->chip, reg_base, reg, &val);
+ if (!rc)
+ rc = put_user(val, ubuf64);
+ if (rc) {
+ if (!done)
+ done = rc;
+ break;
+ }
+ ubuf64++;
+ *ppos += 8;
+ done += 8;
+ }
+ return done;
+}
+
+static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct scom_debug_entry *ent = filp->private_data;
+ u64 __user *ubuf64 = (u64 __user *)ubuf;
+ loff_t off = *ppos;
+ ssize_t done = 0;
+ u64 reg, reg_base, reg_cnt, val;
+ int rc;
+
+ if (off < 0 || (off & 7) || (count & 7))
+ return -EINVAL;
+ reg_base = off >> 3;
+ reg_cnt = count >> 3;
+
+ for (reg = 0; reg < reg_cnt; reg++) {
+ rc = get_user(val, ubuf64);
+ if (!rc)
+ rc = opal_scom_write(ent->chip, reg_base, reg, val);
+ if (rc) {
+ if (!done)
+ done = rc;
+ break;
+ }
+ ubuf64++;
+ done += 8;
+ }
+ return done;
}
-static const struct scom_controller opal_scom_controller = {
- .map = opal_scom_map,
- .unmap = opal_scom_unmap,
- .read = opal_scom_read,
- .write = opal_scom_write
+static const struct file_operations scom_debug_fops = {
+ .read = scom_debug_read,
+ .write = scom_debug_write,
+ .open = simple_open,
+ .llseek = default_llseek,
};
-static int opal_xscom_init(void)
+static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
+ int chip)
{
- if (firmware_has_feature(FW_FEATURE_OPAL))
- scom_init(&opal_scom_controller);
+ struct scom_debug_entry *ent;
+ struct dentry *dir;
+
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->chip = chip;
+ snprintf(ent->name, 16, "%08x", chip);
+ ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
+ ent->path.size = strlen((char *)ent->path.data);
+
+ dir = debugfs_create_dir(ent->name, root);
+ if (!dir) {
+ kfree(ent->path.data);
+ kfree(ent);
+ return -1;
+ }
+
+ debugfs_create_blob("devspec", 0400, dir, &ent->path);
+ debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
+
return 0;
}
-machine_arch_initcall(powernv, opal_xscom_init);
+
+static int scom_debug_init(void)
+{
+ struct device_node *dn;
+ struct dentry *root;
+ int chip, rc;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return 0;
+
+ root = debugfs_create_dir("scom", powerpc_debugfs_root);
+ if (!root)
+ return -1;
+
+ rc = 0;
+ for_each_node_with_property(dn, "scom-controller") {
+ chip = of_get_ibm_chip_id(dn);
+ WARN_ON(chip == -1);
+ rc |= scom_debug_init_one(root, dn, chip);
+ }
+
+ return rc;
+}
+device_initcall(scom_debug_init);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index aba443be7daa..38e90270280b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock);
static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
static uint32_t opal_heartbeat;
static struct task_struct *kopald_tsk;
+static struct opal_msg *opal_msg;
+static u32 opal_msg_size __ro_after_init;
void opal_configure_cores(void)
{
@@ -271,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg)
static void opal_handle_message(void)
{
s64 ret;
- /*
- * TODO: pre-allocate a message buffer depending on opal-msg-size
- * value in /proc/device-tree.
- */
- static struct opal_msg msg;
u32 type;
- ret = opal_get_msg(__pa(&msg), sizeof(msg));
+ ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
/* No opal message pending. */
if (ret == OPAL_RESOURCE)
return;
@@ -290,14 +287,14 @@ static void opal_handle_message(void)
return;
}
- type = be32_to_cpu(msg.msg_type);
+ type = be32_to_cpu(opal_msg->msg_type);
/* Sanity check */
if (type >= OPAL_MSG_TYPE_MAX) {
pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
return;
}
- opal_message_do_notify(type, (void *)&msg);
+ opal_message_do_notify(type, (void *)opal_msg);
}
static irqreturn_t opal_message_notify(int irq, void *data)
@@ -306,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data)
return IRQ_HANDLED;
}
-static int __init opal_message_init(void)
+static int __init opal_message_init(struct device_node *opal_node)
{
int ret, i, irq;
+ ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
+ if (ret) {
+ pr_notice("Failed to read opal-msg-size property\n");
+ opal_msg_size = sizeof(struct opal_msg);
+ }
+
+ opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+ if (!opal_msg) {
+ opal_msg_size = sizeof(struct opal_msg);
+ /* Try to allocate fixed message size */
+ opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+ BUG_ON(opal_msg == NULL);
+ }
+
for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
@@ -705,7 +716,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
bin_attr->size);
}
-static BIN_ATTR_RO(symbol_map, 0);
+static struct bin_attribute symbol_map_attr = {
+ .attr = {.name = "symbol_map", .mode = 0400},
+ .read = symbol_map_read
+};
static void opal_export_symmap(void)
{
@@ -722,10 +736,10 @@ static void opal_export_symmap(void)
return;
/* Setup attributes */
- bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0]));
- bin_attr_symbol_map.size = be64_to_cpu(syms[1]);
+ symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
+ symbol_map_attr.size = be64_to_cpu(syms[1]);
- rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map);
+ rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
if (rc)
pr_warn("Error %d creating OPAL symbols file\n", rc);
}
@@ -910,7 +924,7 @@ static int __init opal_init(void)
}
/* Initialise OPAL messaging system */
- opal_message_init();
+ opal_message_init(opal_node);
/* Initialise OPAL asynchronous completion interface */
opal_async_comp_init();
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index e28f03e1eb5e..a0b9c0c23ed2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
struct page *tce_mem = NULL;
__be64 *addr;
- tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+ tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+ shift - PAGE_SHIFT);
if (!tce_mem) {
pr_err("Failed to allocate a TCE memory, level shift=%d\n",
shift);
@@ -48,6 +49,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
return addr;
}
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+ unsigned long size, unsigned int levels);
+
static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
{
__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +61,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
while (level) {
int n = (idx & mask) >> (level * shift);
- unsigned long tce;
+ unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
- if (tmp[n] == 0) {
+ if (!tce) {
__be64 *tmp2;
if (!alloc)
@@ -70,10 +74,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
if (!tmp2)
return NULL;
- tmp[n] = cpu_to_be64(__pa(tmp2) |
- TCE_PCI_READ | TCE_PCI_WRITE);
+ tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+ oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+ cpu_to_be64(tce)));
+ if (oldtce) {
+ pnv_pci_ioda2_table_do_free_pages(tmp2,
+ ilog2(tbl->it_level_size) + 3, 1);
+ tce = oldtce;
+ }
}
- tce = be64_to_cpu(tmp[n]);
tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
idx &= ~mask;
@@ -161,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
if (ptce)
*ptce = cpu_to_be64(0);
+ else
+ /* Skip the rest of the level */
+ i |= tbl->it_level_size - 1;
}
}
@@ -260,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
PAGE_SHIFT);
const unsigned long tce_table_size = 1UL << table_shift;
- unsigned int tmplevels = levels;
if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
return -EINVAL;
@@ -268,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
if (!is_power_of_2(window_size))
return -EINVAL;
- if (alloc_userspace_copy && (window_size > (1ULL << 32)))
- tmplevels = 1;
-
/* Adjust direct table size from window_size and levels */
entries_shift = (entries_shift + levels - 1) / levels;
level_shift = entries_shift + 3;
@@ -281,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
/* Allocate TCE table */
addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
- tmplevels, tce_table_size, &offset, &total_allocated);
+ 1, tce_table_size, &offset, &total_allocated);
/* addr==NULL means that the first level allocation failed */
if (!addr)
@@ -292,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
* we did not allocate as much as we wanted,
* release partially allocated table.
*/
- if (tmplevels == levels && offset < tce_table_size)
+ if (levels == 1 && offset < tce_table_size)
goto free_tces_exit;
/* Allocate userspace view of the TCE table */
if (alloc_userspace_copy) {
offset = 0;
uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
- tmplevels, tce_table_size, &offset,
+ 1, tce_table_size, &offset,
&total_allocated_uas);
if (!uas)
goto free_tces_exit;
- if (tmplevels == levels && (offset < tce_table_size ||
+ if (levels == 1 && (offset < tce_table_size ||
total_allocated_uas != total_allocated))
goto free_uas_exit;
}
@@ -318,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
window_size, tce_table_size, bus_offset, tbl->it_base,
- tbl->it_userspace, tmplevels, levels);
+ tbl->it_userspace, 1, levels);
return 0;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8080558d020..c28d0d9b7ee0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1939,26 +1939,12 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
}
#ifdef CONFIG_IOMMU_API
-static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
+/* Common for IODA1 and IODA2 */
+static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction,
+ bool realmode)
{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
-
- return ret;
-}
-
-static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
-
- return ret;
+ return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
}
#endif
@@ -1973,8 +1959,8 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
static struct iommu_table_ops pnv_ioda1_iommu_ops = {
.set = pnv_ioda1_tce_build,
#ifdef CONFIG_IOMMU_API
- .exchange = pnv_ioda1_tce_xchg,
- .exchange_rm = pnv_ioda1_tce_xchg_rm,
+ .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+ .tce_kill = pnv_pci_p7ioc_tce_invalidate,
.useraddrptr = pnv_tce_useraddrptr,
#endif
.clear = pnv_ioda1_tce_free,
@@ -2103,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
return ret;
}
-#ifdef CONFIG_IOMMU_API
-static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
- if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
-
- return ret;
-}
-
-static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
- if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
-
- return ret;
-}
-#endif
-
static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
long npages)
{
@@ -2138,8 +2100,8 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build,
#ifdef CONFIG_IOMMU_API
- .exchange = pnv_ioda2_tce_xchg,
- .exchange_rm = pnv_ioda2_tce_xchg_rm,
+ .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+ .tce_kill = pnv_pci_ioda2_tce_invalidate,
.useraddrptr = pnv_tce_useraddrptr,
#endif
.clear = pnv_ioda2_tce_free,
@@ -2303,7 +2265,7 @@ found:
tbl->it_ops = &pnv_ioda1_iommu_ops;
pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
- iommu_init_table(tbl, phb->hose->node);
+ iommu_init_table(tbl, phb->hose->node, 0, 0);
if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2420,6 +2382,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
{
struct iommu_table *tbl = NULL;
long rc;
+ unsigned long res_start, res_end;
/*
* crashkernel= specifies the kdump kernel's maximum memory at
@@ -2433,19 +2396,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
* DMA window can be larger than available memory, which will
* cause errors later.
*/
- const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+ const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
+
+ /*
+ * We create the default window as big as we can. The constraint is
+ * the max order of allocation possible. The TCE table is likely to
+ * end up being multilevel and with on-demand allocation in place,
+ * the initial use is not going to be huge as the default window aims
+ * to support crippled devices (i.e. not fully 64bit DMAble) only.
+ */
+ /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+ const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+ /* Each TCE level cannot exceed maxblock so go multilevel if needed */
+ unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+ unsigned long tcelevel_order = ilog2(maxblock >> 3);
+ unsigned int levels = tces_order / tcelevel_order;
+
+ if (tces_order % tcelevel_order)
+ levels += 1;
+ /*
+ * We try to stick to default levels (which is >1 at the moment) in
+ * order to save memory by relying on on-demain TCE level allocation.
+ */
+ levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
- rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
- IOMMU_PAGE_SHIFT_4K,
- window_size,
- POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+ rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+ window_size, levels, false, &tbl);
if (rc) {
pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
rc);
return rc;
}
- iommu_init_table(tbl, pe->phb->hose->node);
+ /* We use top part of 32bit space for MMIO so exclude it from DMA */
+ res_start = 0;
+ res_end = 0;
+ if (window_size > pe->phb->ioda.m32_pci_base) {
+ res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+ res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+ }
+ iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
if (rc) {
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 6104418c9ad5..2825d004dece 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
break;
}
- if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) {
+ if (!of_device_is_compatible(parent, "ibm,ioda2-phb") &&
+ !of_device_is_compatible(parent, "ibm,ioda3-phb")) {
of_node_put(parent);
continue;
}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 469c24463247..f914f0b14e4e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -219,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
struct pnv_ioda_pe *pe);
/* pci-ioda-tce.c */
-#define POWERNV_IOMMU_DEFAULT_LEVELS 1
+#define POWERNV_IOMMU_DEFAULT_LEVELS 2
#define POWERNV_IOMMU_MAX_LEVELS 5
extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index fd4a1c5a6369..1aa51c4fa904 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -30,4 +30,9 @@ extern void opal_event_shutdown(void);
bool cpu_core_split_required(void);
+struct memcons;
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
+u32 memcons_get_size(struct memcons *mc);
+struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name);
+
#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a5e52f9eed3c..83498604d322 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -24,6 +24,7 @@
#include <linux/bug.h>
#include <linux/pci.h>
#include <linux/cpufreq.h>
+#include <linux/memblock.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
@@ -166,6 +167,14 @@ static void __init pnv_init(void)
else
#endif
add_preferred_console("hvc", 0, NULL);
+
+ if (!radix_enabled()) {
+ int i;
+
+ /* Allocate per cpu area to save old slb contents during MCE */
+ for_each_possible_cpu(i)
+ paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i));
+ }
}
static void __init pnv_init_IRQ(void)
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 94cd96b9b7bb..fbd6e6b7bbf2 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -193,7 +193,7 @@ static void pnv_smp_cpu_kill_self(void)
* for coming online, which are handled via
* generic_check_cpu_restart() calls.
*/
- kvmppc_set_host_ipi(cpu, 0);
+ kvmppc_clear_host_ipi(cpu);
srr1 = pnv_cpu_offline(cpu);
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
new file mode 100644
index 000000000000..e4a00ad06f9d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ultravisor high level interfaces
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/of_fdt.h>
+#include <linux/of.h>
+
+#include <asm/ultravisor.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static struct kobject *ultravisor_kobj;
+
+int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+ int depth, void *data)
+{
+ if (!of_flat_dt_is_compatible(node, "ibm,ultravisor"))
+ return 0;
+
+ powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR;
+ pr_debug("Ultravisor detected!\n");
+ return 1;
+}
+
+static struct memcons *uv_memcons;
+
+static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ return memcons_copy(uv_memcons, to, pos, count);
+}
+
+static struct bin_attribute uv_msglog_attr = {
+ .attr = {.name = "msglog", .mode = 0400},
+ .read = uv_msglog_read
+};
+
+static int __init uv_init(void)
+{
+ struct device_node *node;
+
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ return 0;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+ if (!node)
+ return -ENODEV;
+
+ uv_memcons = memcons_init(node, "memcons");
+ if (!uv_memcons)
+ return -ENOENT;
+
+ uv_msglog_attr.size = memcons_get_size(uv_memcons);
+
+ ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj);
+ if (!ultravisor_kobj)
+ return -ENOMEM;
+
+ return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr);
+}
+machine_subsys_initcall(powernv, uv_init);
diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c
index bdaeaecdc06b..1193c294b8d0 100644
--- a/arch/powerpc/platforms/ps3/spu.c
+++ b/arch/powerpc/platforms/ps3/spu.c
@@ -184,10 +184,7 @@ static void spu_unmap(struct spu *spu)
* setup_areas - Map the spu regions into the address space.
*
* The current HV requires the spu shadow regs to be mapped with the
- * PTE page protection bits set as read-only (PP=3). This implementation
- * uses the low level __ioremap() to bypass the page protection settings
- * inforced by ioremap_prot() to get the needed PTE bits set for the
- * shadow regs.
+ * PTE page protection bits set as read-only.
*/
static int __init setup_areas(struct spu *spu)
@@ -195,9 +192,8 @@ static int __init setup_areas(struct spu *spu)
struct table {char* name; unsigned long addr; unsigned long size;};
unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
- spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr,
- sizeof(struct spe_shadow),
- shadow_flags);
+ spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr,
+ sizeof(struct spe_shadow), shadow_flags);
if (!spu_pdata(spu)->shadow) {
pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__);
goto fail_ioremap;
diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c
index 98410119c47b..3542b7bd6a46 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -686,20 +686,16 @@ static int ps3_dma_supported(struct device *_dev, u64 mask)
return mask >= DMA_BIT_MASK(32);
}
-static u64 ps3_dma_get_required_mask(struct device *_dev)
-{
- return DMA_BIT_MASK(32);
-}
-
static const struct dma_map_ops ps3_sb_dma_ops = {
.alloc = ps3_alloc_coherent,
.free = ps3_free_coherent,
.map_sg = ps3_sb_map_sg,
.unmap_sg = ps3_sb_unmap_sg,
.dma_supported = ps3_dma_supported,
- .get_required_mask = ps3_dma_get_required_mask,
.map_page = ps3_sb_map_page,
.unmap_page = ps3_unmap_page,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
};
static const struct dma_map_ops ps3_ioc0_dma_ops = {
@@ -708,9 +704,10 @@ static const struct dma_map_ops ps3_ioc0_dma_ops = {
.map_sg = ps3_ioc0_map_sg,
.unmap_sg = ps3_ioc0_unmap_sg,
.dma_supported = ps3_dma_supported,
- .get_required_mask = ps3_dma_get_required_mask,
.map_page = ps3_ioc0_map_page,
.unmap_page = ps3_unmap_page,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
};
/**
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index f7b484f55553..9e35cddddf73 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -145,3 +145,17 @@ config PAPR_SCM
tristate "Support for the PAPR Storage Class Memory interface"
help
Enable access to hypervisor provided storage class memory.
+
+config PPC_SVM
+ bool "Secure virtual machine (SVM) support for POWER"
+ depends on PPC_PSERIES
+ select SWIOTLB
+ select ARCH_HAS_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ help
+ There are certain POWER platforms which support secure guests using
+ the Protected Execution Facility, with the help of an Ultravisor
+ executing below the hypervisor layer. This enables support for
+ those guests.
+
+ If unsure, say "N".
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index ab3d59aeacca..a3c74a5cf20d 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -26,6 +26,8 @@ obj-$(CONFIG_IBMVIO) += vio.o
obj-$(CONFIG_IBMEBUS) += ibmebus.o
obj-$(CONFIG_PAPR_SCM) += papr_scm.o
obj-$(CONFIG_PPC_SPLPAR) += vphn.o
+obj-$(CONFIG_PPC_SVM) += svm.o
+obj-$(CONFIG_FA_DUMP) += rtas-fadump.o
ifdef CONFIG_PPC_PSERIES
obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 9edae1863e2f..893ba3f562c4 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -42,42 +42,44 @@ static int ibm_get_config_addr_info;
static int ibm_get_config_addr_info2;
static int ibm_configure_pe;
-#ifdef CONFIG_PCI_IOV
void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
{
struct pci_dn *pdn = pci_get_pdn(pdev);
- struct pci_dn *physfn_pdn;
- struct eeh_dev *edev;
- if (!pdev->is_virtfn)
+ if (eeh_has_flag(EEH_FORCE_DISABLED))
return;
- pdn->device_id = pdev->device;
- pdn->vendor_id = pdev->vendor;
- pdn->class_code = pdev->class;
- /*
- * Last allow unfreeze return code used for retrieval
- * by user space in eeh-sysfs to show the last command
- * completion from platform.
- */
- pdn->last_allow_rc = 0;
- physfn_pdn = pci_get_pdn(pdev->physfn);
- pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index];
- edev = pdn_to_eeh_dev(pdn);
+ dev_dbg(&pdev->dev, "EEH: Setting up device\n");
+#ifdef CONFIG_PCI_IOV
+ if (pdev->is_virtfn) {
+ struct pci_dn *physfn_pdn;
- /*
- * The following operations will fail if VF's sysfs files
- * aren't created or its resources aren't finalized.
- */
+ pdn->device_id = pdev->device;
+ pdn->vendor_id = pdev->vendor;
+ pdn->class_code = pdev->class;
+ /*
+ * Last allow unfreeze return code used for retrieval
+ * by user space in eeh-sysfs to show the last command
+ * completion from platform.
+ */
+ pdn->last_allow_rc = 0;
+ physfn_pdn = pci_get_pdn(pdev->physfn);
+ pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index];
+ }
+#endif
eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
- edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
- eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
- eeh_add_to_parent_pe(edev); /* Add as VF PE type */
- eeh_sysfs_add_device(pdev);
+#ifdef CONFIG_PCI_IOV
+ if (pdev->is_virtfn) {
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-}
+ edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
+ eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */
+ eeh_add_to_parent_pe(edev); /* Add as VF PE type */
+ }
#endif
+ eeh_sysfs_add_device(pdev);
+}
/*
* Buffer for reporting slot-error-detail rtas calls. Its here
@@ -144,10 +146,8 @@ static int pseries_eeh_init(void)
/* Set EEH probe mode */
eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG);
-#ifdef CONFIG_PCI_IOV
/* Set EEH machine dependent code */
ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
-#endif
return 0;
}
@@ -251,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
+ eeh_edev_dbg(edev, "Probing device\n");
+
/*
* Update class code and mode of eeh device. We need
* correctly reflects that current device is root port
@@ -280,8 +282,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
/* Enable EEH on the device */
+ eeh_edev_dbg(edev, "Enabling EEH on device\n");
ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
- if (!ret) {
+ if (ret) {
+ eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
+ } else {
/* Retrieve PE address */
edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
pe.addr = edev->pe_config_addr;
@@ -297,11 +302,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
if (enable) {
eeh_add_flag(EEH_ENABLED);
eeh_add_to_parent_pe(edev);
-
- pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
- __func__, pdn->busno, PCI_SLOT(pdn->devfn),
- PCI_FUNC(pdn->devfn), pe.phb->global_number,
- pe.addr);
} else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
(pdn_to_eeh_dev(pdn->parent))->pe) {
/* This device doesn't support EEH, but it may have an
@@ -310,6 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
eeh_add_to_parent_pe(edev);
}
+ eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
+ (enable ? "enabled" : "unsupported"), ret);
}
/* Save memory bars */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 46d0d35b9ca4..8e700390f3d6 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -880,34 +880,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
switch (hp_elog->action) {
case PSERIES_HP_ELOG_ACTION_ADD:
- if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+ switch (hp_elog->id_type) {
+ case PSERIES_HP_ELOG_ID_DRC_COUNT:
count = hp_elog->_drc_u.drc_count;
rc = dlpar_memory_add_by_count(count);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_INDEX:
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_add_by_index(drc_index);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_IC:
count = hp_elog->_drc_u.ic.count;
drc_index = hp_elog->_drc_u.ic.index;
rc = dlpar_memory_add_by_ic(count, drc_index);
- } else {
+ break;
+ default:
rc = -EINVAL;
+ break;
}
break;
case PSERIES_HP_ELOG_ACTION_REMOVE:
- if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
+ switch (hp_elog->id_type) {
+ case PSERIES_HP_ELOG_ID_DRC_COUNT:
count = hp_elog->_drc_u.drc_count;
rc = dlpar_memory_remove_by_count(count);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_INDEX:
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_remove_by_index(drc_index);
- } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) {
+ break;
+ case PSERIES_HP_ELOG_ID_DRC_IC:
count = hp_elog->_drc_u.ic.count;
drc_index = hp_elog->_drc_u.ic.index;
rc = dlpar_memory_remove_by_ic(count, drc_index);
- } else {
+ break;
+ default:
rc = -EINVAL;
+ break;
}
break;
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 889dc2e44b89..6ba081dd61c9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
#include <asm/udbg.h>
#include <asm/mmzone.h>
#include <asm/plpar_wrappers.h>
+#include <asm/svm.h>
#include "pseries.h"
@@ -609,7 +610,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
iommu_table_setparms(pci->phb, dn, tbl);
tbl->it_ops = &iommu_table_pseries_ops;
- iommu_init_table(tbl, pci->phb->node);
+ iommu_init_table(tbl, pci->phb->node, 0, 0);
/* Divide the rest (1.75GB) among the children */
pci->phb->dma_window_size = 0x80000000ul;
@@ -621,7 +622,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
#ifdef CONFIG_IOMMU_API
static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
- long *tce, enum dma_data_direction *direction)
+ long *tce, enum dma_data_direction *direction,
+ bool realmode)
{
long rc;
unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
@@ -649,7 +651,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
struct iommu_table_ops iommu_table_lpar_multi_ops = {
.set = tce_buildmulti_pSeriesLP,
#ifdef CONFIG_IOMMU_API
- .exchange = tce_exchange_pseries,
+ .xchg_no_kill = tce_exchange_pseries,
#endif
.clear = tce_freemulti_pSeriesLP,
.get = tce_get_pSeriesLP
@@ -690,7 +692,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
ppci->table_group, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
- iommu_init_table(tbl, ppci->phb->node);
+ iommu_init_table(tbl, ppci->phb->node, 0, 0);
iommu_register_group(ppci->table_group,
pci_domain_nr(bus), 0);
pr_debug(" created table: %p\n", ppci->table_group);
@@ -719,7 +721,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
tbl = PCI_DN(dn)->table_group->tables[0];
iommu_table_setparms(phb, dn, tbl);
tbl->it_ops = &iommu_table_pseries_ops;
- iommu_init_table(tbl, phb->node);
+ iommu_init_table(tbl, phb->node, 0, 0);
set_iommu_table_base(&dev->dev, tbl);
return;
}
@@ -1169,7 +1171,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
iommu_table_setparms_lpar(pci->phb, pdn, tbl,
pci->table_group, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
- iommu_init_table(tbl, pci->phb->node);
+ iommu_init_table(tbl, pci->phb->node, 0, 0);
iommu_register_group(pci->table_group,
pci_domain_nr(pci->phb->bus), 0);
pr_debug(" created table: %p\n", pci->table_group);
@@ -1318,7 +1320,15 @@ void iommu_init_early_pSeries(void)
of_reconfig_notifier_register(&iommu_reconfig_nb);
register_memory_notifier(&iommu_mem_nb);
- set_pci_dma_ops(&dma_iommu_ops);
+ /*
+ * Secure guest memory is inacessible to devices so regular DMA isn't
+ * possible.
+ *
+ * In that case keep devices' dma_map_ops as NULL so that the generic
+ * DMA code path will use SWIOTLB to bounce buffers for DMA.
+ */
+ if (!is_secure_guest())
+ set_pci_dma_ops(&dma_iommu_ops);
}
static int __init disable_multitce(char *str)
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 09bb878c21e0..f87a5c64e24d 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -56,6 +56,22 @@ EXPORT_SYMBOL(plpar_hcall);
EXPORT_SYMBOL(plpar_hcall9);
EXPORT_SYMBOL(plpar_hcall_norets);
+/*
+ * H_BLOCK_REMOVE supported block size for this page size in segment who's base
+ * page size is that page size.
+ *
+ * The first index is the segment base page size, the second one is the actual
+ * page size.
+ */
+static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
+
+/*
+ * Due to the involved complexity, and that the current hypervisor is only
+ * returning this value or 0, we are limiting the support of the H_BLOCK_REMOVE
+ * buffer size to 8 size block.
+ */
+#define HBLKRM_SUPPORTED_BLOCK_SIZE 8
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static u8 dtl_mask = DTL_LOG_PREEMPT;
#else
@@ -984,6 +1000,17 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
#define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL
#define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL
+/*
+ * Returned true if we are supporting this block size for the specified segment
+ * base page size and actual page size.
+ *
+ * Currently, we only support 8 size block.
+ */
+static inline bool is_supported_hlbkrm(int bpsize, int psize)
+{
+ return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
+}
+
/**
* H_BLOCK_REMOVE caller.
* @idx should point to the latest @param entry set with a PTEX.
@@ -1143,7 +1170,8 @@ static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
- if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
+ /* Assuming THP size is 16M */
+ if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
hugepage_block_invalidate(slot, vpn, count, psize, ssize);
else
hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
@@ -1312,6 +1340,140 @@ static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
}
/*
+ * TLB Block Invalidate Characteristics
+ *
+ * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
+ * is able to process for each couple segment base page size, actual page size.
+ *
+ * The ibm,get-system-parameter properties is returning a buffer with the
+ * following layout:
+ *
+ * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
+ * -----------------
+ * TLB Block Invalidate Specifiers:
+ * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
+ * [ 1 byte Number of page sizes (N) that are supported for the specified
+ * TLB invalidate block size ]
+ * [ 1 byte Encoded segment base page size and actual page size
+ * MSB=0 means 4k segment base page size and actual page size
+ * MSB=1 the penc value in mmu_psize_def ]
+ * ...
+ * -----------------
+ * Next TLB Block Invalidate Specifiers...
+ * -----------------
+ * [ 0 ]
+ */
+static inline void set_hblkrm_bloc_size(int bpsize, int psize,
+ unsigned int block_size)
+{
+ if (block_size > hblkrm_size[bpsize][psize])
+ hblkrm_size[bpsize][psize] = block_size;
+}
+
+/*
+ * Decode the Encoded segment base page size and actual page size.
+ * PAPR specifies:
+ * - bit 7 is the L bit
+ * - bits 0-5 are the penc value
+ * If the L bit is 0, this means 4K segment base page size and actual page size
+ * otherwise the penc value should be read.
+ */
+#define HBLKRM_L_MASK 0x80
+#define HBLKRM_PENC_MASK 0x3f
+static inline void __init check_lp_set_hblkrm(unsigned int lp,
+ unsigned int block_size)
+{
+ unsigned int bpsize, psize;
+
+ /* First, check the L bit, if not set, this means 4K */
+ if ((lp & HBLKRM_L_MASK) == 0) {
+ set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
+ return;
+ }
+
+ lp &= HBLKRM_PENC_MASK;
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
+ struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ if (def->penc[psize] == lp) {
+ set_hblkrm_bloc_size(bpsize, psize, block_size);
+ return;
+ }
+ }
+ }
+}
+
+#define SPLPAR_TLB_BIC_TOKEN 50
+
+/*
+ * The size of the TLB Block Invalidate Characteristics is variable. But at the
+ * maximum it will be the number of possible page sizes *2 + 10 bytes.
+ * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
+ * (128 bytes) for the buffer to get plenty of space.
+ */
+#define SPLPAR_TLB_BIC_MAXLENGTH 128
+
+void __init pseries_lpar_read_hblkrm_characteristics(void)
+{
+ unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
+ int call_status, len, idx, bpsize;
+
+ if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
+ return;
+
+ spin_lock(&rtas_data_buf_lock);
+ memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+ call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL,
+ SPLPAR_TLB_BIC_TOKEN,
+ __pa(rtas_data_buf),
+ RTAS_DATA_BUF_SIZE);
+ memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
+ local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
+ spin_unlock(&rtas_data_buf_lock);
+
+ if (call_status != 0) {
+ pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
+ __FILE__, __func__, call_status);
+ return;
+ }
+
+ /*
+ * The first two (2) bytes of the data in the buffer are the length of
+ * the returned data, not counting these first two (2) bytes.
+ */
+ len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
+ if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
+ pr_warn("%s too large returned buffer %d", __func__, len);
+ return;
+ }
+
+ idx = 2;
+ while (idx < len) {
+ u8 block_shift = local_buffer[idx++];
+ u32 block_size;
+ unsigned int npsize;
+
+ if (!block_shift)
+ break;
+
+ block_size = 1 << block_shift;
+
+ for (npsize = local_buffer[idx++];
+ npsize > 0 && idx < len; npsize--)
+ check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
+ block_size);
+ }
+
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+ for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
+ if (hblkrm_size[bpsize][idx])
+ pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
+ bpsize, idx, hblkrm_size[bpsize][idx]);
+}
+
+/*
* Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
* lock.
*/
@@ -1330,7 +1492,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
- if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) {
+ if (is_supported_hlbkrm(batch->psize, batch->psize)) {
do_block_remove(number, batch, param);
goto out;
}
@@ -1413,7 +1575,10 @@ static int pseries_lpar_resize_hpt_commit(void *data)
return 0;
}
-/* Must be called in user context */
+/*
+ * Must be called in process context. The caller must hold the
+ * cpus_lock.
+ */
static int pseries_lpar_resize_hpt(unsigned long shift)
{
struct hpt_resize_state state = {
@@ -1467,7 +1632,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
t1 = ktime_get();
- rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL);
+ rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
+ &state, NULL);
t2 = ktime_get();
@@ -1527,16 +1693,24 @@ void __init hpte_init_pseries(void)
mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range;
mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all;
mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
- register_process_table = pseries_lpar_register_process_table;
if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
+
+ /*
+ * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
+ * to inform the hypervisor that we wish to use the HPT.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ pseries_lpar_register_process_table(0, 0, 0);
}
void radix_init_pseries(void)
{
pr_info("Using radix MMU under hypervisor\n");
- register_process_table = pseries_lpar_register_process_table;
+
+ pseries_lpar_register_process_table(__pa(process_tb),
+ 0, PRTB_SIZE_SHIFT - 12);
}
#ifdef CONFIG_PPC_SMLPAR
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index fe812bebdf5e..b571285f6c14 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -9,6 +9,7 @@
#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
+#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/completion.h>
@@ -207,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope)
prop_data += vd;
}
+
+ cond_resched();
}
+
+ cond_resched();
} while (rtas_rc == 1);
of_node_put(dn);
@@ -310,8 +315,12 @@ int pseries_devicetree_update(s32 scope)
add_dt_node(phandle, drc_index);
break;
}
+
+ cond_resched();
}
}
+
+ cond_resched();
} while (rc == 1);
kfree(rtas_buf);
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index a5ac371a3f06..61883291defc 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -65,29 +65,21 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
cond_resched();
} while (rc == H_BUSY);
- if (rc) {
- /* H_OVERLAP needs a separate error path */
- if (rc == H_OVERLAP)
- return -EBUSY;
-
- dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
- return -ENXIO;
- }
+ if (rc)
+ return rc;
p->bound_addr = saved;
-
- dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
-
- return 0;
+ dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
+ return rc;
}
-static int drc_pmem_unbind(struct papr_scm_priv *p)
+static void drc_pmem_unbind(struct papr_scm_priv *p)
{
unsigned long ret[PLPAR_HCALL_BUFSIZE];
uint64_t token = 0;
int64_t rc;
- dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index);
+ dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index);
/* NB: unbind has the same retry requirements as drc_pmem_bind() */
do {
@@ -110,12 +102,48 @@ static int drc_pmem_unbind(struct papr_scm_priv *p)
if (rc)
dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);
else
- dev_dbg(&p->pdev->dev, "unbind drc %x complete\n",
+ dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n",
p->drc_index);
- return rc == H_SUCCESS ? 0 : -ENXIO;
+ return;
}
+static int drc_pmem_query_n_bind(struct papr_scm_priv *p)
+{
+ unsigned long start_addr;
+ unsigned long end_addr;
+ unsigned long ret[PLPAR_HCALL_BUFSIZE];
+ int64_t rc;
+
+
+ rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+ p->drc_index, 0);
+ if (rc)
+ goto err_out;
+ start_addr = ret[0];
+
+ /* Make sure the full region is bound. */
+ rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+ p->drc_index, p->blocks - 1);
+ if (rc)
+ goto err_out;
+ end_addr = ret[0];
+
+ if ((end_addr - start_addr) != ((p->blocks - 1) * p->block_size))
+ goto err_out;
+
+ p->bound_addr = start_addr;
+ dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
+ return rc;
+
+err_out:
+ dev_info(&p->pdev->dev,
+ "Failed to query, trying an unbind followed by bind");
+ drc_pmem_unbind(p);
+ return drc_pmem_bind(p);
+}
+
+
static int papr_scm_meta_get(struct papr_scm_priv *p,
struct nd_cmd_get_config_data_hdr *hdr)
{
@@ -436,14 +464,14 @@ static int papr_scm_probe(struct platform_device *pdev)
rc = drc_pmem_bind(p);
/* If phyp says drc memory still bound then force unbound and retry */
- if (rc == -EBUSY) {
- dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
- drc_pmem_unbind(p);
- rc = drc_pmem_bind(p);
- }
+ if (rc == H_OVERLAP)
+ rc = drc_pmem_query_n_bind(p);
- if (rc)
+ if (rc != H_SUCCESS) {
+ dev_err(&p->pdev->dev, "bind err: %d\n", rc);
+ rc = -ENXIO;
goto err;
+ }
/* setup the resource for the newly bound range */
p->res.start = p->bound_addr;
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 1eae1d09980c..722830978639 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void)
pSeries_request_regions();
- eeh_probe_devices();
- eeh_addr_cache_build();
+ eeh_show_enabled();
#ifdef CONFIG_PCI_IOV
ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index a6624d4bd9d0..13fa370a87e4 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -112,5 +112,6 @@ static inline unsigned long cmo_get_page_size(void)
int dlpar_workqueue_init(void);
void pseries_setup_rfi_flush(void);
+void pseries_lpar_read_hblkrm_characteristics(void);
#endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f16fdd0f71f7..3acdcc3bb908 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -76,6 +76,7 @@ struct pseries_mc_errorlog {
#define MC_ERROR_TYPE_UE 0x00
#define MC_ERROR_TYPE_SLB 0x01
#define MC_ERROR_TYPE_ERAT 0x02
+#define MC_ERROR_TYPE_UNKNOWN 0x03
#define MC_ERROR_TYPE_TLB 0x04
#define MC_ERROR_TYPE_D_CACHE 0x05
#define MC_ERROR_TYPE_I_CACHE 0x07
@@ -87,6 +88,9 @@ struct pseries_mc_errorlog {
#define MC_ERROR_UE_LOAD_STORE 3
#define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4
+#define UE_EFFECTIVE_ADDR_PROVIDED 0x40
+#define UE_LOGICAL_ADDR_PROVIDED 0x20
+
#define MC_ERROR_SLB_PARITY 0
#define MC_ERROR_SLB_MULTIHIT 1
#define MC_ERROR_SLB_INDETERMINATE 2
@@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
}
}
-static
-inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog)
-{
- __be64 addr = 0;
-
- switch (mlog->error_type) {
- case MC_ERROR_TYPE_UE:
- if (mlog->sub_err_type & 0x40)
- addr = mlog->effective_address;
- break;
- case MC_ERROR_TYPE_SLB:
- case MC_ERROR_TYPE_ERAT:
- case MC_ERROR_TYPE_TLB:
- if (mlog->sub_err_type & 0x80)
- addr = mlog->effective_address;
- default:
- break;
- }
- return be64_to_cpu(addr);
-}
-
/*
* Enable the hotplug interrupt late because processing them may touch other
* devices or systems (e.g. hugepages) that have not been initialized at the
@@ -511,160 +494,165 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
}
-#define VAL_TO_STRING(ar, val) \
- (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown")
-static void pseries_print_mce_info(struct pt_regs *regs,
- struct rtas_error_log *errp)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
{
- const char *level, *sevstr;
+ struct mce_error_info mce_err = { 0 };
+ unsigned long eaddr = 0, paddr = 0;
struct pseries_errorlog *pseries_log;
struct pseries_mc_errorlog *mce_log;
- u8 error_type, err_sub_type;
- u64 addr;
- u8 initiator = rtas_error_initiator(errp);
int disposition = rtas_error_disposition(errp);
+ int initiator = rtas_error_initiator(errp);
+ int severity = rtas_error_severity(errp);
+ u8 error_type, err_sub_type;
- static const char * const initiators[] = {
- [0] = "Unknown",
- [1] = "CPU",
- [2] = "PCI",
- [3] = "ISA",
- [4] = "Memory",
- [5] = "Power Mgmt",
- };
- static const char * const mc_err_types[] = {
- [0] = "UE",
- [1] = "SLB",
- [2] = "ERAT",
- [3] = "Unknown",
- [4] = "TLB",
- [5] = "D-Cache",
- [6] = "Unknown",
- [7] = "I-Cache",
- };
- static const char * const mc_ue_types[] = {
- [0] = "Indeterminate",
- [1] = "Instruction fetch",
- [2] = "Page table walk ifetch",
- [3] = "Load/Store",
- [4] = "Page table walk Load/Store",
- };
-
- /* SLB sub errors valid values are 0x0, 0x1, 0x2 */
- static const char * const mc_slb_types[] = {
- [0] = "Parity",
- [1] = "Multihit",
- [2] = "Indeterminate",
- };
-
- /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
- static const char * const mc_soft_types[] = {
- [0] = "Unknown",
- [1] = "Parity",
- [2] = "Multihit",
- [3] = "Indeterminate",
- };
-
- if (!rtas_error_extended(errp)) {
- pr_err("Machine check interrupt: Missing extended error log\n");
- return;
- }
+ if (initiator == RTAS_INITIATOR_UNKNOWN)
+ mce_err.initiator = MCE_INITIATOR_UNKNOWN;
+ else if (initiator == RTAS_INITIATOR_CPU)
+ mce_err.initiator = MCE_INITIATOR_CPU;
+ else if (initiator == RTAS_INITIATOR_PCI)
+ mce_err.initiator = MCE_INITIATOR_PCI;
+ else if (initiator == RTAS_INITIATOR_ISA)
+ mce_err.initiator = MCE_INITIATOR_ISA;
+ else if (initiator == RTAS_INITIATOR_MEMORY)
+ mce_err.initiator = MCE_INITIATOR_MEMORY;
+ else if (initiator == RTAS_INITIATOR_POWERMGM)
+ mce_err.initiator = MCE_INITIATOR_POWERMGM;
+ else
+ mce_err.initiator = MCE_INITIATOR_UNKNOWN;
+
+ if (severity == RTAS_SEVERITY_NO_ERROR)
+ mce_err.severity = MCE_SEV_NO_ERROR;
+ else if (severity == RTAS_SEVERITY_EVENT)
+ mce_err.severity = MCE_SEV_WARNING;
+ else if (severity == RTAS_SEVERITY_WARNING)
+ mce_err.severity = MCE_SEV_WARNING;
+ else if (severity == RTAS_SEVERITY_ERROR_SYNC)
+ mce_err.severity = MCE_SEV_SEVERE;
+ else if (severity == RTAS_SEVERITY_ERROR)
+ mce_err.severity = MCE_SEV_SEVERE;
+ else if (severity == RTAS_SEVERITY_FATAL)
+ mce_err.severity = MCE_SEV_FATAL;
+ else
+ mce_err.severity = MCE_SEV_FATAL;
+
+ if (severity <= RTAS_SEVERITY_ERROR_SYNC)
+ mce_err.sync_error = true;
+ else
+ mce_err.sync_error = false;
+
+ mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
+ mce_err.error_class = MCE_ECLASS_UNKNOWN;
+
+ if (!rtas_error_extended(errp))
+ goto out;
pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
if (pseries_log == NULL)
- return;
+ goto out;
mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-
error_type = mce_log->error_type;
err_sub_type = rtas_mc_error_sub_type(mce_log);
- switch (rtas_error_severity(errp)) {
- case RTAS_SEVERITY_NO_ERROR:
- level = KERN_INFO;
- sevstr = "Harmless";
- break;
- case RTAS_SEVERITY_WARNING:
- level = KERN_WARNING;
- sevstr = "";
- break;
- case RTAS_SEVERITY_ERROR:
- case RTAS_SEVERITY_ERROR_SYNC:
- level = KERN_ERR;
- sevstr = "Severe";
- break;
- case RTAS_SEVERITY_FATAL:
- default:
- level = KERN_ERR;
- sevstr = "Fatal";
- break;
- }
+ switch (mce_log->error_type) {
+ case MC_ERROR_TYPE_UE:
+ mce_err.error_type = MCE_ERROR_TYPE_UE;
+ switch (err_sub_type) {
+ case MC_ERROR_UE_IFETCH:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
+ break;
+ case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+ break;
+ case MC_ERROR_UE_LOAD_STORE:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+ break;
+ case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+ break;
+ case MC_ERROR_UE_INDETERMINATE:
+ default:
+ mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
+ eaddr = be64_to_cpu(mce_log->effective_address);
-#ifdef CONFIG_PPC_BOOK3S_64
- /* Display faulty slb contents for SLB errors. */
- if (error_type == MC_ERROR_TYPE_SLB)
- slb_dump_contents(local_paca->mce_faulty_slbs);
-#endif
+ if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
+ paddr = be64_to_cpu(mce_log->logical_address);
+ } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
+ unsigned long pfn;
- printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
- disposition == RTAS_DISP_FULLY_RECOVERED ?
- "Recovered" : "Not recovered");
- if (user_mode(regs)) {
- printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level,
- regs->nip, current->pid, current->comm);
- } else {
- printk("%s NIP [%016lx]: %pS\n", level, regs->nip,
- (void *)regs->nip);
- }
- printk("%s Initiator: %s\n", level,
- VAL_TO_STRING(initiators, initiator));
+ pfn = addr_to_pfn(regs, eaddr);
+ if (pfn != ULONG_MAX)
+ paddr = pfn << PAGE_SHIFT;
+ }
- switch (error_type) {
- case MC_ERROR_TYPE_UE:
- printk("%s Error type: %s [%s]\n", level,
- VAL_TO_STRING(mc_err_types, error_type),
- VAL_TO_STRING(mc_ue_types, err_sub_type));
break;
case MC_ERROR_TYPE_SLB:
- printk("%s Error type: %s [%s]\n", level,
- VAL_TO_STRING(mc_err_types, error_type),
- VAL_TO_STRING(mc_slb_types, err_sub_type));
+ mce_err.error_type = MCE_ERROR_TYPE_SLB;
+ switch (err_sub_type) {
+ case MC_ERROR_SLB_PARITY:
+ mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY;
+ break;
+ case MC_ERROR_SLB_MULTIHIT:
+ mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+ break;
+ case MC_ERROR_SLB_INDETERMINATE:
+ default:
+ mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & 0x80)
+ eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_ERAT:
+ mce_err.error_type = MCE_ERROR_TYPE_ERAT;
+ switch (err_sub_type) {
+ case MC_ERROR_ERAT_PARITY:
+ mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY;
+ break;
+ case MC_ERROR_ERAT_MULTIHIT:
+ mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+ break;
+ case MC_ERROR_ERAT_INDETERMINATE:
+ default:
+ mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & 0x80)
+ eaddr = be64_to_cpu(mce_log->effective_address);
+ break;
case MC_ERROR_TYPE_TLB:
- printk("%s Error type: %s [%s]\n", level,
- VAL_TO_STRING(mc_err_types, error_type),
- VAL_TO_STRING(mc_soft_types, err_sub_type));
+ mce_err.error_type = MCE_ERROR_TYPE_TLB;
+ switch (err_sub_type) {
+ case MC_ERROR_TLB_PARITY:
+ mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY;
+ break;
+ case MC_ERROR_TLB_MULTIHIT:
+ mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+ break;
+ case MC_ERROR_TLB_INDETERMINATE:
+ default:
+ mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
+ break;
+ }
+ if (mce_log->sub_err_type & 0x80)
+ eaddr = be64_to_cpu(mce_log->effective_address);
+ break;
+ case MC_ERROR_TYPE_D_CACHE:
+ mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
+ case MC_ERROR_TYPE_I_CACHE:
+ mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
+ break;
+ case MC_ERROR_TYPE_UNKNOWN:
default:
- printk("%s Error type: %s\n", level,
- VAL_TO_STRING(mc_err_types, error_type));
+ mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
break;
}
- addr = rtas_mc_get_effective_addr(mce_log);
- if (addr)
- printk("%s Effective address: %016llx\n", level, addr);
-}
-
-static int mce_handle_error(struct rtas_error_log *errp)
-{
- struct pseries_errorlog *pseries_log;
- struct pseries_mc_errorlog *mce_log;
- int disposition = rtas_error_disposition(errp);
- u8 error_type;
-
- if (!rtas_error_extended(errp))
- goto out;
-
- pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
- if (pseries_log == NULL)
- goto out;
-
- mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
- error_type = mce_log->error_type;
-
#ifdef CONFIG_PPC_BOOK3S_64
if (disposition == RTAS_DISP_NOT_RECOVERED) {
switch (error_type) {
@@ -682,98 +670,24 @@ static int mce_handle_error(struct rtas_error_log *errp)
slb_save_contents(local_paca->mce_faulty_slbs);
flush_and_reload_slb();
disposition = RTAS_DISP_FULLY_RECOVERED;
- rtas_set_disposition_recovered(errp);
break;
default:
break;
}
+ } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+ /* Platform corrected itself but could be degraded */
+ printk(KERN_ERR "MCE: limited recovery, system may "
+ "be degraded\n");
+ disposition = RTAS_DISP_FULLY_RECOVERED;
}
#endif
out:
- return disposition;
-}
-
-#ifdef CONFIG_MEMORY_FAILURE
-
-static DEFINE_PER_CPU(int, rtas_ue_count);
-static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]);
+ save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
+ &mce_err, regs->nip, eaddr, paddr);
-#define UE_EFFECTIVE_ADDR_PROVIDED 0x40
-#define UE_LOGICAL_ADDR_PROVIDED 0x20
-
-
-static void pseries_hwpoison_work_fn(struct work_struct *work)
-{
- unsigned long paddr;
- int index;
-
- while (__this_cpu_read(rtas_ue_count) > 0) {
- index = __this_cpu_read(rtas_ue_count) - 1;
- paddr = __this_cpu_read(rtas_ue_paddr[index]);
- memory_failure(paddr >> PAGE_SHIFT, 0);
- __this_cpu_dec(rtas_ue_count);
- }
-}
-
-static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn);
-
-static void queue_ue_paddr(unsigned long paddr)
-{
- int index;
-
- index = __this_cpu_inc_return(rtas_ue_count) - 1;
- if (index >= MAX_MC_EVT) {
- __this_cpu_dec(rtas_ue_count);
- return;
- }
- this_cpu_write(rtas_ue_paddr[index], paddr);
- schedule_work(&hwpoison_work);
-}
-
-static void pseries_do_memory_failure(struct pt_regs *regs,
- struct pseries_mc_errorlog *mce_log)
-{
- unsigned long paddr;
-
- if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
- paddr = be64_to_cpu(mce_log->logical_address);
- } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
- unsigned long pfn;
-
- pfn = addr_to_pfn(regs,
- be64_to_cpu(mce_log->effective_address));
- if (pfn == ULONG_MAX)
- return;
- paddr = pfn << PAGE_SHIFT;
- } else {
- return;
- }
- queue_ue_paddr(paddr);
-}
-
-static void pseries_process_ue(struct pt_regs *regs,
- struct rtas_error_log *errp)
-{
- struct pseries_errorlog *pseries_log;
- struct pseries_mc_errorlog *mce_log;
-
- if (!rtas_error_extended(errp))
- return;
-
- pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
- if (!pseries_log)
- return;
-
- mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-
- if (mce_log->error_type == MC_ERROR_TYPE_UE)
- pseries_do_memory_failure(regs, mce_log);
+ return disposition;
}
-#else
-static inline void pseries_process_ue(struct pt_regs *regs,
- struct rtas_error_log *errp) { }
-#endif /*CONFIG_MEMORY_FAILURE */
/*
* Process MCE rtas errlog event.
@@ -795,49 +709,51 @@ static void mce_process_errlog_event(struct irq_work *work)
* Return 1 if corrected (or delivered a signal).
* Return 0 if there is nothing we can do.
*/
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
+static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt)
{
int recovered = 0;
- int disposition = rtas_error_disposition(err);
-
- pseries_print_mce_info(regs, err);
if (!(regs->msr & MSR_RI)) {
/* If MSR_RI isn't set, we cannot recover */
pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
recovered = 0;
-
- } else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
+ } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
/* Platform corrected itself */
recovered = 1;
+ } else if (evt->severity == MCE_SEV_FATAL) {
+ /* Fatal machine check */
+ pr_err("Machine check interrupt is fatal\n");
+ recovered = 0;
+ }
- } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
- /* Platform corrected itself but could be degraded */
- printk(KERN_ERR "MCE: limited recovery, system may "
- "be degraded\n");
- recovered = 1;
-
- } else if (user_mode(regs) && !is_global_init(current) &&
- rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
-
+ if (!recovered && evt->sync_error) {
/*
- * If we received a synchronous error when in userspace
- * kill the task. Firmware may report details of the fail
- * asynchronously, so we can't rely on the target and type
- * fields being valid here.
+ * Try to kill processes if we get a synchronous machine check
+ * (e.g., one caused by execution of this instruction). This
+ * will devolve into a panic if we try to kill init or are in
+ * an interrupt etc.
+ *
+ * TODO: Queue up this address for hwpoisioning later.
+ * TODO: This is not quite right for d-side machine
+ * checks ->nip is not necessarily the important
+ * address.
*/
- printk(KERN_ERR "MCE: uncorrectable error, killing task "
- "%s:%d\n", current->comm, current->pid);
-
- _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
- recovered = 1;
+ if ((user_mode(regs))) {
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ } else if (die_will_crash()) {
+ /*
+ * die() would kill the kernel, so better to go via
+ * the platform reboot code that will log the
+ * machine check.
+ */
+ recovered = 0;
+ } else {
+ die("Machine check", regs, SIGBUS);
+ recovered = 1;
+ }
}
- pseries_process_ue(regs, err);
-
- /* Queue irq work to log this rtas event later. */
- irq_work_queue(&mce_errlog_process_work);
-
return recovered;
}
@@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
*/
int pSeries_machine_check_exception(struct pt_regs *regs)
{
- struct rtas_error_log *errp;
+ struct machine_check_event evt;
- if (fwnmi_active) {
- fwnmi_release_errinfo();
- errp = fwnmi_get_errlog();
- if (errp && recover_mce(regs, errp))
- return 1;
+ if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+ return 0;
+
+ /* Print things out */
+ if (evt.version != MCE_V1) {
+ pr_err("Machine Check Exception, Unknown event version %d !\n",
+ evt.version);
+ return 0;
}
+ machine_check_print_event_info(&evt, user_mode(regs), false);
+
+ if (recover_mce(regs, &evt))
+ return 1;
return 0;
}
@@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs)
* to panic. Hence we will call it as soon as we go into
* virtual mode.
*/
- disposition = mce_handle_error(errp);
+ disposition = mce_handle_error(regs, errp);
+ fwnmi_release_errinfo();
+
+ /* Queue irq work to log this rtas event later. */
+ irq_work_queue(&mce_errlog_process_work);
+
if (disposition == RTAS_DISP_FULLY_RECOVERED)
return 1;
}
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c
new file mode 100644
index 000000000000..70c3013fdd07
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWERVM platform.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "rtas fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/delay.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+#include <asm/fadump-internal.h>
+
+#include "rtas-fadump.h"
+
+static struct rtas_fadump_mem_struct fdm;
+static const struct rtas_fadump_mem_struct *fdm_active;
+
+static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
+ const struct rtas_fadump_mem_struct *fdm)
+{
+ fadump_conf->boot_mem_dest_addr =
+ be64_to_cpu(fdm->rmr_region.destination_address);
+
+ fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr +
+ fadump_conf->boot_memory_size);
+}
+
+/*
+ * This function is called in the capture kernel to get configuration details
+ * setup in the first kernel and passed to the f/w.
+ */
+static void rtas_fadump_get_config(struct fw_dump *fadump_conf,
+ const struct rtas_fadump_mem_struct *fdm)
+{
+ fadump_conf->boot_mem_addr[0] =
+ be64_to_cpu(fdm->rmr_region.source_address);
+ fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len);
+ fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0];
+
+ fadump_conf->boot_mem_top = fadump_conf->boot_memory_size;
+ fadump_conf->boot_mem_regs_cnt = 1;
+
+ /*
+ * Start address of reserve dump area (permanent reservation) for
+ * re-registering FADump after dump capture.
+ */
+ fadump_conf->reserve_dump_area_start =
+ be64_to_cpu(fdm->cpu_state_data.destination_address);
+
+ rtas_fadump_update_config(fadump_conf, fdm);
+}
+
+static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+ u64 addr = fadump_conf->reserve_dump_area_start;
+
+ memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct));
+ addr = addr & PAGE_MASK;
+
+ fdm.header.dump_format_version = cpu_to_be32(0x00000001);
+ fdm.header.dump_num_sections = cpu_to_be16(3);
+ fdm.header.dump_status_flag = 0;
+ fdm.header.offset_first_dump_section =
+ cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct,
+ cpu_state_data));
+
+ /*
+ * Fields for disk dump option.
+ * We are not using disk dump option, hence set these fields to 0.
+ */
+ fdm.header.dd_block_size = 0;
+ fdm.header.dd_block_offset = 0;
+ fdm.header.dd_num_blocks = 0;
+ fdm.header.dd_offset_disk_path = 0;
+
+ /* set 0 to disable an automatic dump-reboot. */
+ fdm.header.max_time_auto = 0;
+
+ /* Kernel dump sections */
+ /* cpu state data section. */
+ fdm.cpu_state_data.request_flag =
+ cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+ fdm.cpu_state_data.source_data_type =
+ cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA);
+ fdm.cpu_state_data.source_address = 0;
+ fdm.cpu_state_data.source_len =
+ cpu_to_be64(fadump_conf->cpu_state_data_size);
+ fdm.cpu_state_data.destination_address = cpu_to_be64(addr);
+ addr += fadump_conf->cpu_state_data_size;
+
+ /* hpte region section */
+ fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+ fdm.hpte_region.source_data_type =
+ cpu_to_be16(RTAS_FADUMP_HPTE_REGION);
+ fdm.hpte_region.source_address = 0;
+ fdm.hpte_region.source_len =
+ cpu_to_be64(fadump_conf->hpte_region_size);
+ fdm.hpte_region.destination_address = cpu_to_be64(addr);
+ addr += fadump_conf->hpte_region_size;
+
+ /* RMA region section */
+ fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
+ fdm.rmr_region.source_data_type =
+ cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
+ fdm.rmr_region.source_address = cpu_to_be64(0);
+ fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size);
+ fdm.rmr_region.destination_address = cpu_to_be64(addr);
+ addr += fadump_conf->boot_memory_size;
+
+ rtas_fadump_update_config(fadump_conf, &fdm);
+
+ return addr;
+}
+
+static u64 rtas_fadump_get_bootmem_min(void)
+{
+ return RTAS_FADUMP_MIN_BOOT_MEM;
+}
+
+static int rtas_fadump_register(struct fw_dump *fadump_conf)
+{
+ unsigned int wait_time;
+ int rc, err = -EIO;
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+ NULL, FADUMP_REGISTER, &fdm,
+ sizeof(struct rtas_fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+
+ } while (wait_time);
+
+ switch (rc) {
+ case 0:
+ pr_info("Registration is successful!\n");
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case -1:
+ pr_err("Failed to register. Hardware Error(%d).\n", rc);
+ break;
+ case -3:
+ if (!is_fadump_boot_mem_contiguous())
+ pr_err("Can't have holes in boot memory area.\n");
+ else if (!is_fadump_reserved_mem_contiguous())
+ pr_err("Can't have holes in reserved memory area.\n");
+
+ pr_err("Failed to register. Parameter Error(%d).\n", rc);
+ err = -EINVAL;
+ break;
+ case -9:
+ pr_err("Already registered!\n");
+ fadump_conf->dump_registered = 1;
+ err = -EEXIST;
+ break;
+ default:
+ pr_err("Failed to register. Unknown Error(%d).\n", rc);
+ break;
+ }
+
+ return err;
+}
+
+static int rtas_fadump_unregister(struct fw_dump *fadump_conf)
+{
+ unsigned int wait_time;
+ int rc;
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+ NULL, FADUMP_UNREGISTER, &fdm,
+ sizeof(struct rtas_fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+ } while (wait_time);
+
+ if (rc) {
+ pr_err("Failed to un-register - unexpected error(%d).\n", rc);
+ return -EIO;
+ }
+
+ fadump_conf->dump_registered = 0;
+ return 0;
+}
+
+static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+ unsigned int wait_time;
+ int rc;
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
+ NULL, FADUMP_INVALIDATE, fdm_active,
+ sizeof(struct rtas_fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+ } while (wait_time);
+
+ if (rc) {
+ pr_err("Failed to invalidate - unexpected error (%d).\n", rc);
+ return -EIO;
+ }
+
+ fadump_conf->dump_active = 0;
+ fdm_active = NULL;
+ return 0;
+}
+
+#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000
+static inline int rtas_fadump_gpr_index(u64 id)
+{
+ char str[3];
+ int i = -1;
+
+ if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) {
+ /* get the digits at the end */
+ id &= ~RTAS_FADUMP_GPR_MASK;
+ id >>= 24;
+ str[2] = '\0';
+ str[1] = id & 0xff;
+ str[0] = (id >> 8) & 0xff;
+ if (kstrtoint(str, 10, &i))
+ i = -EINVAL;
+ if (i > 31)
+ i = -1;
+ }
+ return i;
+}
+
+void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val)
+{
+ int i;
+
+ i = rtas_fadump_gpr_index(reg_id);
+ if (i >= 0)
+ regs->gpr[i] = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("NIA"))
+ regs->nip = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("MSR"))
+ regs->msr = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("CTR"))
+ regs->ctr = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("LR"))
+ regs->link = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("XER"))
+ regs->xer = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("CR"))
+ regs->ccr = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("DAR"))
+ regs->dar = (unsigned long)reg_val;
+ else if (reg_id == fadump_str_to_u64("DSISR"))
+ regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct rtas_fadump_reg_entry*
+rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry,
+ struct pt_regs *regs)
+{
+ memset(regs, 0, sizeof(struct pt_regs));
+
+ while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) {
+ rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
+ be64_to_cpu(reg_entry->reg_value));
+ reg_entry++;
+ }
+ reg_entry++;
+ return reg_entry;
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
+ * 8 Byte ASCII identifier and 8 Byte register value. The register entry
+ * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
+ * of register value. For more details refer to PAPR document.
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
+{
+ struct rtas_fadump_reg_save_area_header *reg_header;
+ struct fadump_crash_info_header *fdh = NULL;
+ struct rtas_fadump_reg_entry *reg_entry;
+ u32 num_cpus, *note_buf;
+ int i, rc = 0, cpu = 0;
+ struct pt_regs regs;
+ unsigned long addr;
+ void *vaddr;
+
+ addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address);
+ vaddr = __va(addr);
+
+ reg_header = vaddr;
+ if (be64_to_cpu(reg_header->magic_number) !=
+ fadump_str_to_u64("REGSAVE")) {
+ pr_err("Unable to read register save area.\n");
+ return -ENOENT;
+ }
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
+ pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
+
+ vaddr += be32_to_cpu(reg_header->num_cpu_offset);
+ num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
+ pr_debug("NumCpus : %u\n", num_cpus);
+ vaddr += sizeof(u32);
+ reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
+
+ rc = fadump_setup_cpu_notes_buf(num_cpus);
+ if (rc != 0)
+ return rc;
+
+ note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+
+ if (fadump_conf->fadumphdr_addr)
+ fdh = __va(fadump_conf->fadumphdr_addr);
+
+ for (i = 0; i < num_cpus; i++) {
+ if (be64_to_cpu(reg_entry->reg_id) !=
+ fadump_str_to_u64("CPUSTRT")) {
+ pr_err("Unable to read CPU state data\n");
+ rc = -ENOENT;
+ goto error_out;
+ }
+ /* Lower 4 bytes of reg_value contains logical cpu id */
+ cpu = (be64_to_cpu(reg_entry->reg_value) &
+ RTAS_FADUMP_CPU_ID_MASK);
+ if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) {
+ RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+ continue;
+ }
+ pr_debug("Reading register data for cpu %d...\n", cpu);
+ if (fdh && fdh->crashing_cpu == cpu) {
+ regs = fdh->regs;
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
+ } else {
+ reg_entry++;
+ reg_entry = rtas_fadump_read_regs(reg_entry, &regs);
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ }
+ }
+ final_note(note_buf);
+
+ if (fdh) {
+ pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+ fdh->elfcorehdr_addr);
+ fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr));
+ }
+ return 0;
+
+error_out:
+ fadump_free_cpu_notes_buf();
+ return rc;
+
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
+{
+ struct fadump_crash_info_header *fdh;
+ int rc = 0;
+
+ if (!fdm_active || !fadump_conf->fadumphdr_addr)
+ return -EINVAL;
+
+ /* Check if the dump data is valid. */
+ if ((be16_to_cpu(fdm_active->header.dump_status_flag) ==
+ RTAS_FADUMP_ERROR_FLAG) ||
+ (fdm_active->cpu_state_data.error_flags != 0) ||
+ (fdm_active->rmr_region.error_flags != 0)) {
+ pr_err("Dump taken by platform is not valid\n");
+ return -EINVAL;
+ }
+ if ((fdm_active->rmr_region.bytes_dumped !=
+ fdm_active->rmr_region.source_len) ||
+ !fdm_active->cpu_state_data.bytes_dumped) {
+ pr_err("Dump taken by platform is incomplete\n");
+ return -EINVAL;
+ }
+
+ /* Validate the fadump crash info header */
+ fdh = __va(fadump_conf->fadumphdr_addr);
+ if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+ pr_err("Crash info header is not valid.\n");
+ return -EINVAL;
+ }
+
+ rc = rtas_fadump_build_cpu_notes(fadump_conf);
+ if (rc)
+ return rc;
+
+ /*
+ * We are done validating dump info and elfcore header is now ready
+ * to be exported. set elfcorehdr_addr so that vmcore module will
+ * export the elfcore header through '/proc/vmcore'.
+ */
+ elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+ return 0;
+}
+
+static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
+ struct seq_file *m)
+{
+ const struct rtas_fadump_section *cpu_data_section;
+ const struct rtas_fadump_mem_struct *fdm_ptr;
+
+ if (fdm_active)
+ fdm_ptr = fdm_active;
+ else
+ fdm_ptr = &fdm;
+
+ cpu_data_section = &(fdm_ptr->cpu_state_data);
+ seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
+ be64_to_cpu(cpu_data_section->destination_address),
+ be64_to_cpu(cpu_data_section->destination_address) +
+ be64_to_cpu(cpu_data_section->source_len) - 1,
+ be64_to_cpu(cpu_data_section->source_len),
+ be64_to_cpu(cpu_data_section->bytes_dumped));
+
+ seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
+ be64_to_cpu(fdm_ptr->hpte_region.destination_address),
+ be64_to_cpu(fdm_ptr->hpte_region.destination_address) +
+ be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1,
+ be64_to_cpu(fdm_ptr->hpte_region.source_len),
+ be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped));
+
+ seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+ be64_to_cpu(fdm_ptr->rmr_region.source_address),
+ be64_to_cpu(fdm_ptr->rmr_region.destination_address));
+ seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
+ be64_to_cpu(fdm_ptr->rmr_region.source_len),
+ be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));
+
+ /* Dump is active. Show reserved area start address. */
+ if (fdm_active) {
+ seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
+ fadump_conf->reserve_dump_area_start);
+ }
+}
+
+static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,
+ const char *msg)
+{
+ /* Call ibm,os-term rtas call to trigger firmware assisted dump */
+ rtas_os_term((char *)msg);
+}
+
+static struct fadump_ops rtas_fadump_ops = {
+ .fadump_init_mem_struct = rtas_fadump_init_mem_struct,
+ .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min,
+ .fadump_register = rtas_fadump_register,
+ .fadump_unregister = rtas_fadump_unregister,
+ .fadump_invalidate = rtas_fadump_invalidate,
+ .fadump_process = rtas_fadump_process,
+ .fadump_region_show = rtas_fadump_region_show,
+ .fadump_trigger = rtas_fadump_trigger,
+};
+
+void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ int i, size, num_sections;
+ const __be32 *sections;
+ const __be32 *token;
+
+ /*
+ * Check if Firmware Assisted dump is supported. if yes, check
+ * if dump has been initiated on last reboot.
+ */
+ token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+ if (!token)
+ return;
+
+ fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token);
+ fadump_conf->ops = &rtas_fadump_ops;
+ fadump_conf->fadump_supported = 1;
+
+ /* Firmware supports 64-bit value for size, align it to pagesize. */
+ fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE);
+
+ /*
+ * The 'ibm,kernel-dump' rtas node is present only if there is
+ * dump data waiting for us.
+ */
+ fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+ if (fdm_active) {
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+ rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active));
+ }
+
+ /* Get the sizes required to store dump data for the firmware provided
+ * dump sections.
+ * For each dump section type supported, a 32bit cell which defines
+ * the ID of a supported section followed by two 32 bit cells which
+ * gives the size of the section in bytes.
+ */
+ sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+ &size);
+
+ if (!sections)
+ return;
+
+ num_sections = size / (3 * sizeof(u32));
+
+ for (i = 0; i < num_sections; i++, sections += 3) {
+ u32 type = (u32)of_read_number(sections, 1);
+
+ switch (type) {
+ case RTAS_FADUMP_CPU_STATE_DATA:
+ fadump_conf->cpu_state_data_size =
+ of_read_ulong(&sections[1], 2);
+ break;
+ case RTAS_FADUMP_HPTE_REGION:
+ fadump_conf->hpte_region_size =
+ of_read_ulong(&sections[1], 2);
+ break;
+ }
+ }
+}
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h
new file mode 100644
index 000000000000..fd59bd7ca9c3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWERVM platform.
+ *
+ * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _PSERIES_RTAS_FADUMP_H
+#define _PSERIES_RTAS_FADUMP_H
+
+/*
+ * On some Power systems where RMO is 128MB, it still requires minimum of
+ * 256MB for kernel to boot successfully. When kdump infrastructure is
+ * configured to save vmcore over network, we run into OOM issue while
+ * loading modules related to network setup. Hence we need additional 64M
+ * of memory to avoid OOM issue.
+ */
+#define RTAS_FADUMP_MIN_BOOT_MEM ((0x1UL << 28) + (0x1UL << 26))
+
+/* Firmware provided dump sections */
+#define RTAS_FADUMP_CPU_STATE_DATA 0x0001
+#define RTAS_FADUMP_HPTE_REGION 0x0002
+#define RTAS_FADUMP_REAL_MODE_REGION 0x0011
+
+/* Dump request flag */
+#define RTAS_FADUMP_REQUEST_FLAG 0x00000001
+
+/* Dump status flag */
+#define RTAS_FADUMP_ERROR_FLAG 0x2000
+
+/* Kernel Dump section info */
+struct rtas_fadump_section {
+ __be32 request_flag;
+ __be16 source_data_type;
+ __be16 error_flags;
+ __be64 source_address;
+ __be64 source_len;
+ __be64 bytes_dumped;
+ __be64 destination_address;
+};
+
+/* ibm,configure-kernel-dump header. */
+struct rtas_fadump_section_header {
+ __be32 dump_format_version;
+ __be16 dump_num_sections;
+ __be16 dump_status_flag;
+ __be32 offset_first_dump_section;
+
+ /* Fields for disk dump option. */
+ __be32 dd_block_size;
+ __be64 dd_block_offset;
+ __be64 dd_num_blocks;
+ __be32 dd_offset_disk_path;
+
+ /* Maximum time allowed to prevent an automatic dump-reboot. */
+ __be32 max_time_auto;
+};
+
+/*
+ * Firmware Assisted dump memory structure. This structure is required for
+ * registering future kernel dump with power firmware through rtas call.
+ *
+ * No disk dump option. Hence disk dump path string section is not included.
+ */
+struct rtas_fadump_mem_struct {
+ struct rtas_fadump_section_header header;
+
+ /* Kernel dump sections */
+ struct rtas_fadump_section cpu_state_data;
+ struct rtas_fadump_section hpte_region;
+
+ /*
+ * TODO: Extend multiple boot memory regions support in the kernel
+ * for this platform.
+ */
+ struct rtas_fadump_section rmr_region;
+};
+
+/*
+ * The firmware-assisted dump format.
+ *
+ * The register save area is an area in the partition's memory used to preserve
+ * the register contents (CPU state data) for the active CPUs during a firmware
+ * assisted dump. The dump format contains register save area header followed
+ * by register entries. Each list of registers for a CPU starts with "CPUSTRT"
+ * and ends with "CPUEND".
+ */
+
+/* Register save area header. */
+struct rtas_fadump_reg_save_area_header {
+ __be64 magic_number;
+ __be32 version;
+ __be32 num_cpu_offset;
+};
+
+/* Register entry. */
+struct rtas_fadump_reg_entry {
+ __be64 reg_id;
+ __be64 reg_value;
+};
+
+/* Utility macros */
+#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry) \
+({ \
+ while (be64_to_cpu(reg_entry->reg_id) != \
+ fadump_str_to_u64("CPUEND")) \
+ reg_entry++; \
+ reg_entry++; \
+})
+
+#define RTAS_FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
+
+#endif /* _PSERIES_RTAS_FADUMP_H */
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f5940cc71c37..0a40201f315f 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -69,6 +69,7 @@
#include <asm/security_features.h>
#include <asm/asm-const.h>
#include <asm/swiotlb.h>
+#include <asm/svm.h>
#include "pseries.h"
#include "../../../../drivers/pci/pci.h"
@@ -141,17 +142,19 @@ static void __init fwnmi_init(void)
}
#ifdef CONFIG_PPC_BOOK3S_64
- /* Allocate per cpu slb area to save old slb contents during MCE */
- size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
- slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry),
- MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
- NUMA_NO_NODE);
- if (!slb_ptr)
- panic("Failed to allocate %zu bytes below %pa for slb area\n",
- size, &ppc64_rma_size);
-
- for_each_possible_cpu(i)
- paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
+ if (!radix_enabled()) {
+ /* Allocate per cpu area to save old slb contents during MCE */
+ size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
+ slb_ptr = memblock_alloc_try_nid_raw(size,
+ sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!slb_ptr)
+ panic("Failed to allocate %zu bytes below %pa for slb area\n",
+ size, &ppc64_rma_size);
+
+ for_each_possible_cpu(i)
+ paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
+ }
#endif
}
@@ -297,8 +300,10 @@ static inline int alloc_dispatch_logs(void)
static int alloc_dispatch_log_kmem_cache(void)
{
+ void (*ctor)(void *) = get_dtl_cache_ctor();
+
dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
- DISPATCH_LOG_BYTES, 0, NULL);
+ DISPATCH_LOG_BYTES, 0, ctor);
if (!dtl_cache) {
pr_warn("Failed to create dispatch trace log buffer cache\n");
pr_warn("Stolen time statistics will be unreliable\n");
@@ -316,6 +321,9 @@ static void pseries_lpar_idle(void)
* low power mode by ceding processor to hypervisor
*/
+ if (!prep_irq_for_idle())
+ return;
+
/* Indicate to hypervisor that we are idle. */
get_lppaca()->idle = 1;
@@ -736,6 +744,7 @@ static void __init pSeries_setup_arch(void)
pseries_setup_rfi_flush();
setup_stf_barrier();
+ pseries_lpar_read_hblkrm_characteristics();
/* By default, only probe PCI (can be overridden by rtas_pci) */
pci_add_flags(PCI_PROBE_ONLY);
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 4b3ef8d9c63f..ad61e90032da 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -41,6 +41,7 @@
#include <asm/dbell.h>
#include <asm/plpar_wrappers.h>
#include <asm/code-patching.h>
+#include <asm/svm.h>
#include "pseries.h"
#include "offline_states.h"
@@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void)
{
xics_smp_probe();
- if (cpu_has_feature(CPU_FTR_DBELL))
+ if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest())
smp_ops->cause_ipi = smp_pseries_cause_ipi;
else
smp_ops->cause_ipi = icp_ops->cause_ipi;
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
new file mode 100644
index 000000000000..40c0637203d5
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Secure VM platform
+ *
+ * Copyright 2018 IBM Corporation
+ * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/svm.h>
+#include <asm/swiotlb.h>
+#include <asm/ultravisor.h>
+
+static int __init init_svm(void)
+{
+ if (!is_secure_guest())
+ return 0;
+
+ /* Don't release the SWIOTLB buffer. */
+ ppc_swiotlb_enable = 1;
+
+ /*
+ * Since the guest memory is inaccessible to the host, devices always
+ * need to use the SWIOTLB buffer for DMA even if dma_capable() says
+ * otherwise.
+ */
+ swiotlb_force = SWIOTLB_FORCE;
+
+ /* Share the SWIOTLB buffer with the host. */
+ swiotlb_update_mem_attributes();
+
+ return 0;
+}
+machine_early_initcall(pseries, init_svm);
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+ if (!PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ uv_unshare_page(PHYS_PFN(__pa(addr)), numpages);
+
+ return 0;
+}
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+ if (!PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ uv_share_page(PHYS_PFN(__pa(addr)), numpages);
+
+ return 0;
+}
+
+/* There's one dispatch log per CPU. */
+#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
+
+static struct page *dtl_page_store[NR_DTL_PAGE];
+static long dtl_nr_pages;
+
+static bool is_dtl_page_shared(struct page *page)
+{
+ long i;
+
+ for (i = 0; i < dtl_nr_pages; i++)
+ if (dtl_page_store[i] == page)
+ return true;
+
+ return false;
+}
+
+void dtl_cache_ctor(void *addr)
+{
+ unsigned long pfn = PHYS_PFN(__pa(addr));
+ struct page *page = pfn_to_page(pfn);
+
+ if (!is_dtl_page_shared(page)) {
+ dtl_page_store[dtl_nr_pages] = page;
+ dtl_nr_pages++;
+ WARN_ON(dtl_nr_pages >= NR_DTL_PAGE);
+ uv_share_page(pfn, 1);
+ }
+}
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 6601b9d404dc..79e2287991db 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -605,6 +605,8 @@ static const struct dma_map_ops vio_dma_mapping_ops = {
.unmap_page = vio_dma_iommu_unmap_page,
.dma_supported = dma_iommu_dma_supported,
.get_required_mask = dma_iommu_get_required_mask,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
};
/**
@@ -1191,7 +1193,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
else
tbl->it_ops = &iommu_table_pseries_ops;
- return iommu_init_table(tbl, -1);
+ return iommu_init_table(tbl, -1, 0, 0);
}
/**